From dcb9111be1ef36858e181134d6540da4df3cc4f7 Mon Sep 17 00:00:00 2001 From: "zhuyiqing.wiz" Date: Sat, 6 Sep 2025 22:34:15 +0800 Subject: [PATCH] [feat] stormservice support podgroup Signed-off-by: zhuyiqing.wiz --- .../v1alpha1/groupversion_info.go | 1 + api/orchestration/v1alpha1/podset_types.go | 3 + api/orchestration/v1alpha1/roleset_types.go | 93 +++++++- .../v1alpha1/zz_generated.deepcopy.go | 120 ++++++++++- .../orchestration.aibrix.ai_podsets.yaml | 158 ++++++++++++++ .../orchestration.aibrix.ai_rolesets.yaml | 202 +++++++++++++++++- ...orchestration.aibrix.ai_stormservices.yaml | 202 +++++++++++++++++- config/rbac/controller-manager/role.yaml | 18 ++ .../crds/orchestration.aibrix.ai_podsets.yaml | 158 ++++++++++++++ .../orchestration.aibrix.ai_rolesets.yaml | 202 +++++++++++++++++- ...orchestration.aibrix.ai_stormservices.yaml | 202 +++++++++++++++++- go.mod | 16 +- go.sum | 32 +-- .../coschedulingschedulingstrategyspec.go | 60 ++++++ .../v1alpha1/godelschedulingstrategyspec.go | 78 +++++++ .../orchestration/v1alpha1/rolespec.go | 9 + .../v1alpha1/schedulingstrategy.go | 32 ++- .../v1alpha1/volcanoschedulingstrategyspec.go | 84 ++++++++ pkg/client/applyconfiguration/utils.go | 6 + pkg/controller/constants/stormservice.go | 4 +- pkg/controller/podset/podset_controller.go | 84 ++++++++ pkg/controller/roleset/podset_rollsyncer.go | 34 ++- pkg/controller/roleset/roleset_controller.go | 10 +- pkg/controller/roleset/sync.go | 87 +++++--- pkg/controller/roleset/utils.go | 21 +- pkg/controller/roleset/utils_test.go | 86 +++++++- pkg/controller/util/orchestration/util.go | 56 +++++ test/utils/wrapper/roleset.go | 9 +- 28 files changed, 1973 insertions(+), 94 deletions(-) create mode 100644 pkg/client/applyconfiguration/orchestration/v1alpha1/coschedulingschedulingstrategyspec.go create mode 100644 pkg/client/applyconfiguration/orchestration/v1alpha1/godelschedulingstrategyspec.go create mode 100644 pkg/client/applyconfiguration/orchestration/v1alpha1/volcanoschedulingstrategyspec.go diff --git a/api/orchestration/v1alpha1/groupversion_info.go b/api/orchestration/v1alpha1/groupversion_info.go index 9a219accb..c94e75b1c 100644 --- a/api/orchestration/v1alpha1/groupversion_info.go +++ b/api/orchestration/v1alpha1/groupversion_info.go @@ -27,6 +27,7 @@ import ( const ( StormServiceKind = "StormService" RoleSetKind = "RoleSet" + PodSetKind = "PodSet" ) var ( diff --git a/api/orchestration/v1alpha1/podset_types.go b/api/orchestration/v1alpha1/podset_types.go index 46d6d1f2c..8f3e28499 100644 --- a/api/orchestration/v1alpha1/podset_types.go +++ b/api/orchestration/v1alpha1/podset_types.go @@ -35,6 +35,9 @@ type PodSetSpec struct { // Stateful indicates whether pods should have stable network identities // +optional Stateful bool `json:"stateful,omitempty"` + + // +optional + SchedulingStrategy *SchedulingStrategy `json:"schedulingStrategy,omitempty"` } // PodSetStatus defines the observed state of PodSet diff --git a/api/orchestration/v1alpha1/roleset_types.go b/api/orchestration/v1alpha1/roleset_types.go index e2248fdd2..d33b87f97 100644 --- a/api/orchestration/v1alpha1/roleset_types.go +++ b/api/orchestration/v1alpha1/roleset_types.go @@ -35,12 +35,96 @@ type RoleSetSpec struct { UpdateStrategy RoleSetUpdateStrategyType `json:"updateStrategy,omitempty"` // +optional - SchedulingStrategy SchedulingStrategy `json:"schedulingStrategy,omitempty"` + SchedulingStrategy *SchedulingStrategy `json:"schedulingStrategy,omitempty"` } -// +enum +// +kubebuilder:validation:MaxProperties=1 type SchedulingStrategy struct { - PodGroup *schedv1alpha1.PodGroupSpec `json:"podGroup,omitempty"` + GodelSchedulingStrategy *GodelSchedulingStrategySpec `json:"godelSchedulingStrategy,omitempty"` + + CoschedulingSchedulingStrategy *CoschedulingSchedulingStrategySpec `json:"coschedulingSchedulingStrategy,omitempty"` + + VolcanoSchedulingStrategy *VolcanoSchedulingStrategySpec `json:"volcanoSchedulingStrategy,omitempty"` +} + +// GodelSchedulingStrategySpec uses godel scheduler's podgroup definition +type GodelSchedulingStrategySpec struct { + // MinMember defines the minimal number of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start anyone. + MinMember int32 `json:"minMember,omitempty"` + + // If specified, indicates the PodGroup's priority. "system-node-critical" and + // "system-cluster-critical" are two special reserved keywords which indicate the highest priorities. + // Any other name must be defined by creating a PriorityClass object with that name. + // If not specified, the PodGroup priority will be default. + // If default priority class doesn't exist, the PodGroup priority will be zero. + // +optional + PriorityClassName string `json:"priorityClassName,omitempty"` + + // ScheduleTimeoutSeconds defines the maximal time of tasks to wait before run the pod group; + // +optional + ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` + + // Application indicates the podGroup belongs to a logical Application + // This will be used for coordinate with features like drf and faire share. + // +optional + Application string `json:"application,omitempty"` + + // Affinity shows the affinity/anti-affinity rules that scheduler needs to follow + // when scheduling instances of this pod group. + // +optional + Affinity *schedv1alpha1.Affinity `json:"affinity,omitempty"` +} + +// CoschedulingSchedulingStrategySpec uses coscheduling scheduler-plugin's podgroup definition +type CoschedulingSchedulingStrategySpec struct { + // MinMember defines the minimal number of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start any. + // The minimum is 1 + // +kubebuilder:validation:Minimum=1 + MinMember int32 `json:"minMember,omitempty"` + + // MinResources defines the minimal resource of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start any. + MinResources v1.ResourceList `json:"minResources,omitempty"` + + // ScheduleTimeoutSeconds defines the maximal time of members/tasks to wait before run the pod group; + ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` +} + +// VolcanoSchedulingStrategySpec uses volcano's podgroup definition +type VolcanoSchedulingStrategySpec struct { + // MinMember defines the minimal number of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start anyone. + MinMember int32 `json:"minMember,omitempty" protobuf:"bytes,1,opt,name=minMember"` + + // MinTaskMember defines the minimal number of pods to run each task in the pod group; + // if there's not enough resources to start each task, the scheduler + // will not start anyone. + MinTaskMember map[string]int32 `json:"minTaskMember,omitempty" protobuf:"bytes,1,opt,name=minTaskMember"` + + // Queue defines the queue to allocate resource for PodGroup; if queue does not exist, + // the PodGroup will not be scheduled. Defaults to `default` Queue with the lowest weight. + // +optional + Queue string `json:"queue,omitempty" protobuf:"bytes,2,opt,name=queue"` + + // If specified, indicates the PodGroup's priority. "system-node-critical" and + // "system-cluster-critical" are two special keywords which indicate the + // highest priorities with the former being the highest priority. Any other + // name must be defined by creating a PriorityClass object with that name. + // If not specified, the PodGroup priority will be default or zero if there is no + // default. + // +optional + PriorityClassName string `json:"priorityClassName,omitempty" protobuf:"bytes,3,opt,name=priorityClassName"` + + // MinResources defines the minimal resource of members/tasks to run the pod group; + // if there's not enough resources to start all tasks, the scheduler + // will not start anyone. + MinResources *v1.ResourceList `json:"minResources,omitempty" protobuf:"bytes,4,opt,name=minResources"` } // +enum @@ -94,6 +178,9 @@ type RoleSpec struct { // DisruptionTolerance indicates how many pods can be unavailable during the preemption/eviction. // +optional DisruptionTolerance DisruptionTolerance `json:"disruptionTolerance,omitempty"` + + // +optional + SchedulingStrategy *SchedulingStrategy `json:"schedulingStrategy,omitempty"` } // +enum diff --git a/api/orchestration/v1alpha1/zz_generated.deepcopy.go b/api/orchestration/v1alpha1/zz_generated.deepcopy.go index c2cd2bc4c..82cdb2288 100644 --- a/api/orchestration/v1alpha1/zz_generated.deepcopy.go +++ b/api/orchestration/v1alpha1/zz_generated.deepcopy.go @@ -22,7 +22,8 @@ package v1alpha1 import ( schedulingv1alpha1 "github.com/kubewharf/godel-scheduler-api/pkg/apis/scheduling/v1alpha1" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" + resource "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" @@ -76,6 +77,33 @@ func (in Conditions) DeepCopy() Conditions { return *out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CoschedulingSchedulingStrategySpec) DeepCopyInto(out *CoschedulingSchedulingStrategySpec) { + *out = *in + if in.MinResources != nil { + in, out := &in.MinResources, &out.MinResources + *out = make(v1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.ScheduleTimeoutSeconds != nil { + in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CoschedulingSchedulingStrategySpec. +func (in *CoschedulingSchedulingStrategySpec) DeepCopy() *CoschedulingSchedulingStrategySpec { + if in == nil { + return nil + } + out := new(CoschedulingSchedulingStrategySpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DisruptionTolerance) DeepCopyInto(out *DisruptionTolerance) { *out = *in @@ -111,6 +139,31 @@ func (in *ExternalConnectionConfig) DeepCopy() *ExternalConnectionConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GodelSchedulingStrategySpec) DeepCopyInto(out *GodelSchedulingStrategySpec) { + *out = *in + if in.ScheduleTimeoutSeconds != nil { + in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds + *out = new(int32) + **out = **in + } + if in.Affinity != nil { + in, out := &in.Affinity, &out.Affinity + *out = new(schedulingv1alpha1.Affinity) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GodelSchedulingStrategySpec. +func (in *GodelSchedulingStrategySpec) DeepCopy() *GodelSchedulingStrategySpec { + if in == nil { + return nil + } + out := new(GodelSchedulingStrategySpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *KVCache) DeepCopyInto(out *KVCache) { *out = *in @@ -332,6 +385,11 @@ func (in *PodSetList) DeepCopyObject() runtime.Object { func (in *PodSetSpec) DeepCopyInto(out *PodSetSpec) { *out = *in in.Template.DeepCopyInto(&out.Template) + if in.SchedulingStrategy != nil { + in, out := &in.SchedulingStrategy, &out.SchedulingStrategy + *out = new(SchedulingStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodSetSpec. @@ -699,7 +757,11 @@ func (in *RoleSetSpec) DeepCopyInto(out *RoleSetSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - in.SchedulingStrategy.DeepCopyInto(&out.SchedulingStrategy) + if in.SchedulingStrategy != nil { + in, out := &in.SchedulingStrategy, &out.SchedulingStrategy + *out = new(SchedulingStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RoleSetSpec. @@ -781,6 +843,11 @@ func (in *RoleSpec) DeepCopyInto(out *RoleSpec) { in.UpdateStrategy.DeepCopyInto(&out.UpdateStrategy) in.Template.DeepCopyInto(&out.Template) in.DisruptionTolerance.DeepCopyInto(&out.DisruptionTolerance) + if in.SchedulingStrategy != nil { + in, out := &in.SchedulingStrategy, &out.SchedulingStrategy + *out = new(SchedulingStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RoleSpec. @@ -864,9 +931,19 @@ func (in *RuntimeSpec) DeepCopy() *RuntimeSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingStrategy) DeepCopyInto(out *SchedulingStrategy) { *out = *in - if in.PodGroup != nil { - in, out := &in.PodGroup, &out.PodGroup - *out = new(schedulingv1alpha1.PodGroupSpec) + if in.GodelSchedulingStrategy != nil { + in, out := &in.GodelSchedulingStrategy, &out.GodelSchedulingStrategy + *out = new(GodelSchedulingStrategySpec) + (*in).DeepCopyInto(*out) + } + if in.CoschedulingSchedulingStrategy != nil { + in, out := &in.CoschedulingSchedulingStrategy, &out.CoschedulingSchedulingStrategy + *out = new(CoschedulingSchedulingStrategySpec) + (*in).DeepCopyInto(*out) + } + if in.VolcanoSchedulingStrategy != nil { + in, out := &in.VolcanoSchedulingStrategy, &out.VolcanoSchedulingStrategy + *out = new(VolcanoSchedulingStrategySpec) (*in).DeepCopyInto(*out) } } @@ -1046,3 +1123,36 @@ func (in *StormServiceUpdateStrategy) DeepCopy() *StormServiceUpdateStrategy { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VolcanoSchedulingStrategySpec) DeepCopyInto(out *VolcanoSchedulingStrategySpec) { + *out = *in + if in.MinTaskMember != nil { + in, out := &in.MinTaskMember, &out.MinTaskMember + *out = make(map[string]int32, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.MinResources != nil { + in, out := &in.MinResources, &out.MinResources + *out = new(v1.ResourceList) + if **in != nil { + in, out := *in, *out + *out = make(map[v1.ResourceName]resource.Quantity, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VolcanoSchedulingStrategySpec. +func (in *VolcanoSchedulingStrategySpec) DeepCopy() *VolcanoSchedulingStrategySpec { + if in == nil { + return nil + } + out := new(VolcanoSchedulingStrategySpec) + in.DeepCopyInto(out) + return out +} diff --git a/config/crd/orchestration/orchestration.aibrix.ai_podsets.yaml b/config/crd/orchestration/orchestration.aibrix.ai_podsets.yaml index 38e2c9fa1..157c7b340 100644 --- a/config/crd/orchestration/orchestration.aibrix.ai_podsets.yaml +++ b/config/crd/orchestration/orchestration.aibrix.ai_podsets.yaml @@ -31,6 +31,164 @@ spec: maximum: 100 minimum: 2 type: integer + schedulingStrategy: + maxProperties: 1 + properties: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: + properties: + affinity: + properties: + podGroupAffinity: + properties: + nodeSelector: + properties: + nodeSelectorTerms: + items: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + sortRules: + items: + properties: + dimension: + type: string + order: + type: string + resource: + type: string + required: + - order + - resource + type: object + type: array + type: object + podGroupAntiAffinity: + properties: + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + type: object + type: object + application: + type: string + minMember: + format: int32 + type: integer + priorityClassName: + type: string + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object + type: object stateful: type: boolean template: diff --git a/config/crd/orchestration/orchestration.aibrix.ai_rolesets.yaml b/config/crd/orchestration/orchestration.aibrix.ai_rolesets.yaml index 4a10ebe6a..f8711329b 100644 --- a/config/crd/orchestration/orchestration.aibrix.ai_rolesets.yaml +++ b/config/crd/orchestration/orchestration.aibrix.ai_rolesets.yaml @@ -47,6 +47,164 @@ spec: replicas: format: int32 type: integer + schedulingStrategy: + maxProperties: 1 + properties: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: + properties: + affinity: + properties: + podGroupAffinity: + properties: + nodeSelector: + properties: + nodeSelectorTerms: + items: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + sortRules: + items: + properties: + dimension: + type: string + order: + type: string + resource: + type: string + required: + - order + - resource + type: object + type: array + type: object + podGroupAntiAffinity: + properties: + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + type: object + type: object + application: + type: string + minMember: + format: int32 + type: integer + priorityClassName: + type: string + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object + type: object stateful: type: boolean template: @@ -3697,8 +3855,27 @@ spec: type: object type: array schedulingStrategy: + maxProperties: 1 properties: - podGroup: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: properties: affinity: properties: @@ -3811,6 +3988,29 @@ spec: format: int32 type: integer type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object type: object updateStrategy: default: Sequential diff --git a/config/crd/orchestration/orchestration.aibrix.ai_stormservices.yaml b/config/crd/orchestration/orchestration.aibrix.ai_stormservices.yaml index 601550950..1a9bf01d1 100644 --- a/config/crd/orchestration/orchestration.aibrix.ai_stormservices.yaml +++ b/config/crd/orchestration/orchestration.aibrix.ai_stormservices.yaml @@ -114,6 +114,164 @@ spec: replicas: format: int32 type: integer + schedulingStrategy: + maxProperties: 1 + properties: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: + properties: + affinity: + properties: + podGroupAffinity: + properties: + nodeSelector: + properties: + nodeSelectorTerms: + items: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + sortRules: + items: + properties: + dimension: + type: string + order: + type: string + resource: + type: string + required: + - order + - resource + type: object + type: array + type: object + podGroupAntiAffinity: + properties: + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + type: object + type: object + application: + type: string + minMember: + format: int32 + type: integer + priorityClassName: + type: string + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object + type: object stateful: type: boolean template: @@ -3764,8 +3922,27 @@ spec: type: object type: array schedulingStrategy: + maxProperties: 1 properties: - podGroup: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: properties: affinity: properties: @@ -3878,6 +4055,29 @@ spec: format: int32 type: integer type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object type: object updateStrategy: default: Sequential diff --git a/config/rbac/controller-manager/role.yaml b/config/rbac/controller-manager/role.yaml index ae41e0087..2d6527adf 100644 --- a/config/rbac/controller-manager/role.yaml +++ b/config/rbac/controller-manager/role.yaml @@ -291,3 +291,21 @@ rules: - list - update - watch +- apiGroups: + - scheduling.godel.kubewharf.io + resources: + - podgroups + verbs: + - '*' +- apiGroups: + - scheduling.x-k8s.io + resources: + - podgroups + verbs: + - '*' +- apiGroups: + - scheduling.volcano.sh + resources: + - podgroups + verbs: + - '*' \ No newline at end of file diff --git a/dist/chart/crds/orchestration.aibrix.ai_podsets.yaml b/dist/chart/crds/orchestration.aibrix.ai_podsets.yaml index 38e2c9fa1..157c7b340 100644 --- a/dist/chart/crds/orchestration.aibrix.ai_podsets.yaml +++ b/dist/chart/crds/orchestration.aibrix.ai_podsets.yaml @@ -31,6 +31,164 @@ spec: maximum: 100 minimum: 2 type: integer + schedulingStrategy: + maxProperties: 1 + properties: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: + properties: + affinity: + properties: + podGroupAffinity: + properties: + nodeSelector: + properties: + nodeSelectorTerms: + items: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + sortRules: + items: + properties: + dimension: + type: string + order: + type: string + resource: + type: string + required: + - order + - resource + type: object + type: array + type: object + podGroupAntiAffinity: + properties: + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + type: object + type: object + application: + type: string + minMember: + format: int32 + type: integer + priorityClassName: + type: string + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object + type: object stateful: type: boolean template: diff --git a/dist/chart/crds/orchestration.aibrix.ai_rolesets.yaml b/dist/chart/crds/orchestration.aibrix.ai_rolesets.yaml index 4a10ebe6a..f8711329b 100644 --- a/dist/chart/crds/orchestration.aibrix.ai_rolesets.yaml +++ b/dist/chart/crds/orchestration.aibrix.ai_rolesets.yaml @@ -47,6 +47,164 @@ spec: replicas: format: int32 type: integer + schedulingStrategy: + maxProperties: 1 + properties: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: + properties: + affinity: + properties: + podGroupAffinity: + properties: + nodeSelector: + properties: + nodeSelectorTerms: + items: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + sortRules: + items: + properties: + dimension: + type: string + order: + type: string + resource: + type: string + required: + - order + - resource + type: object + type: array + type: object + podGroupAntiAffinity: + properties: + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + type: object + type: object + application: + type: string + minMember: + format: int32 + type: integer + priorityClassName: + type: string + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object + type: object stateful: type: boolean template: @@ -3697,8 +3855,27 @@ spec: type: object type: array schedulingStrategy: + maxProperties: 1 properties: - podGroup: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: properties: affinity: properties: @@ -3811,6 +3988,29 @@ spec: format: int32 type: integer type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object type: object updateStrategy: default: Sequential diff --git a/dist/chart/crds/orchestration.aibrix.ai_stormservices.yaml b/dist/chart/crds/orchestration.aibrix.ai_stormservices.yaml index 601550950..1a9bf01d1 100644 --- a/dist/chart/crds/orchestration.aibrix.ai_stormservices.yaml +++ b/dist/chart/crds/orchestration.aibrix.ai_stormservices.yaml @@ -114,6 +114,164 @@ spec: replicas: format: int32 type: integer + schedulingStrategy: + maxProperties: 1 + properties: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: + properties: + affinity: + properties: + podGroupAffinity: + properties: + nodeSelector: + properties: + nodeSelectorTerms: + items: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + sortRules: + items: + properties: + dimension: + type: string + order: + type: string + resource: + type: string + required: + - order + - resource + type: object + type: array + type: object + podGroupAntiAffinity: + properties: + preferred: + items: + properties: + topologyKey: + type: string + type: object + type: array + required: + items: + properties: + topologyKey: + type: string + type: object + type: array + type: object + type: object + application: + type: string + minMember: + format: int32 + type: integer + priorityClassName: + type: string + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object + type: object stateful: type: boolean template: @@ -3764,8 +3922,27 @@ spec: type: object type: array schedulingStrategy: + maxProperties: 1 properties: - podGroup: + coschedulingSchedulingStrategy: + properties: + minMember: + format: int32 + minimum: 1 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scheduleTimeoutSeconds: + format: int32 + type: integer + type: object + godelSchedulingStrategy: properties: affinity: properties: @@ -3878,6 +4055,29 @@ spec: format: int32 type: integer type: object + volcanoSchedulingStrategy: + properties: + minMember: + format: int32 + type: integer + minResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + minTaskMember: + additionalProperties: + format: int32 + type: integer + type: object + priorityClassName: + type: string + queue: + type: string + type: object type: object updateStrategy: default: Sequential diff --git a/go.mod b/go.mod index 13a8ebeca..29c96abc2 100644 --- a/go.mod +++ b/go.mod @@ -30,18 +30,20 @@ require ( github.com/stretchr/testify v1.10.0 go.uber.org/atomic v1.11.0 google.golang.org/grpc v1.65.0 - k8s.io/api v0.31.2 - k8s.io/apiextensions-apiserver v0.31.2 - k8s.io/apimachinery v0.31.2 - k8s.io/client-go v0.31.2 - k8s.io/code-generator v0.31.2 + k8s.io/api v0.31.8 + k8s.io/apiextensions-apiserver v0.31.8 + k8s.io/apimachinery v0.31.8 + k8s.io/client-go v0.31.8 + k8s.io/code-generator v0.31.8 k8s.io/klog/v2 v2.130.1 k8s.io/kube-openapi v0.0.0-20240620174524-b456828f718b - k8s.io/metrics v0.29.6 + k8s.io/metrics v0.31.8 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 - sigs.k8s.io/controller-runtime v0.19.1 + sigs.k8s.io/controller-runtime v0.19.2 sigs.k8s.io/gateway-api v1.0.0 + sigs.k8s.io/scheduler-plugins v0.31.8 sigs.k8s.io/structured-merge-diff/v4 v4.4.1 + volcano.sh/apis v1.11.2 ) require ( diff --git a/go.sum b/go.sum index e8cf6f663..60e507bf4 100644 --- a/go.sum +++ b/go.sum @@ -295,16 +295,16 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.31.2 h1:3wLBbL5Uom/8Zy98GRPXpJ254nEFpl+hwndmk9RwmL0= -k8s.io/api v0.31.2/go.mod h1:bWmGvrGPssSK1ljmLzd3pwCQ9MgoTsRCuK35u6SygUk= -k8s.io/apiextensions-apiserver v0.31.2 h1:W8EwUb8+WXBLu56ser5IudT2cOho0gAKeTOnywBLxd0= -k8s.io/apiextensions-apiserver v0.31.2/go.mod h1:i+Geh+nGCJEGiCGR3MlBDkS7koHIIKWVfWeRFiOsUcM= -k8s.io/apimachinery v0.31.2 h1:i4vUt2hPK56W6mlT7Ry+AO8eEsyxMD1U44NR22CLTYw= -k8s.io/apimachinery v0.31.2/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= -k8s.io/client-go v0.31.2 h1:Y2F4dxU5d3AQj+ybwSMqQnpZH9F30//1ObxOKlTI9yc= -k8s.io/client-go v0.31.2/go.mod h1:NPa74jSVR/+eez2dFsEIHNa+3o09vtNaWwWwb1qSxSs= -k8s.io/code-generator v0.31.2 h1:xLWxG0HEpMSHfcM//3u3Ro2Hmc6AyyLINQS//Z2GEOI= -k8s.io/code-generator v0.31.2/go.mod h1:eEQHXgBU/m7LDaToDoiz3t97dUUVyOblQdwOr8rivqc= +k8s.io/api v0.31.8 h1:d5WuCZpFqpkQ7a4JuxSI0/IQuFWT+dUE3jeptRoZkto= +k8s.io/api v0.31.8/go.mod h1:Sq38Y1MdXkkp4thnHFYgErPgP0jhZ9sTOppFkt14YQ8= +k8s.io/apiextensions-apiserver v0.31.8 h1:lV5/5Z28J+gbtcH+5LnLtqI3BEYmQ6tUcyGD51lDxNM= +k8s.io/apiextensions-apiserver v0.31.8/go.mod h1:NmrKYZI78Zqm3PiDxzwE5m9DBByRAPt65AHZfq0iRF0= +k8s.io/apimachinery v0.31.8 h1:zRA9bpuLwdVqODPrWaAT9eRVB4GuTYLSRLoO3XrzYUU= +k8s.io/apimachinery v0.31.8/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= +k8s.io/client-go v0.31.8 h1:sMlDa9W+2y3tHo0D+XYeovhOTww7lKiOTTqqyxABcM8= +k8s.io/client-go v0.31.8/go.mod h1:7g9whHSnLT2Eilwpw1Ozdl2vRr2zwwqO5RPBDDkT5xo= +k8s.io/code-generator v0.31.8 h1:PuRUhBUxR/sApmtWxVF1drZZaRCDWAC6MEfgQ0SvuP0= +k8s.io/code-generator v0.31.8/go.mod h1:s3DxX9wpP9vc9h0VAtEzoG2fz4KIJASscPUl6pTm8u0= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= @@ -313,17 +313,21 @@ k8s.io/kube-aggregator v0.31.2 h1:Uw1zUP2D/4wiSjKWVVzSOcCGLuW/+IdRwjjC0FJooYU= k8s.io/kube-aggregator v0.31.2/go.mod h1:41/VIXH+/Qcg9ERNAY6bRF/WQR6xL1wFgYagdHac1X4= k8s.io/kube-openapi v0.0.0-20240620174524-b456828f718b h1:Q9xmGWBvOGd8UJyccgpYlLosk/JlfP3xQLNkQlHJeXw= k8s.io/kube-openapi v0.0.0-20240620174524-b456828f718b/go.mod h1:UxDHUPsUwTOOxSU+oXURfFBcAS6JwiRXTYqYwfuGowc= -k8s.io/metrics v0.29.6 h1:kjMGPYxtCi4OO0fUar76y0CiUoeGYDNmUV0LXJIis4Q= -k8s.io/metrics v0.29.6/go.mod h1:vqGzOaYGuNSSAI7GM1+v6L5z8aAUSzui1W0eQB3wVJY= +k8s.io/metrics v0.31.8 h1:pNhZzkpi3YjfboQYY/5LWWbtyw3n3j1vRGMtnEx69SU= +k8s.io/metrics v0.31.8/go.mod h1:sAoKLdycsqjVBrTw3H6fdfBYzTPMKETxbZyjnjSmL+Y= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= -sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= +sigs.k8s.io/controller-runtime v0.19.2 h1:3sPrF58XQEPzbE8T81TN6selQIMGbtYwuaJ6eDssDF8= +sigs.k8s.io/controller-runtime v0.19.2/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/gateway-api v1.0.0 h1:iPTStSv41+d9p0xFydll6d7f7MOBGuqXM6p2/zVYMAs= sigs.k8s.io/gateway-api v1.0.0/go.mod h1:4cUgr0Lnp5FZ0Cdq8FdRwCvpiWws7LVhLHGIudLlf4c= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/scheduler-plugins v0.31.8 h1:Ie2EFRnkE9T2tBjxwypww7hJJyPRIwrXJNZeNxjP6QY= +sigs.k8s.io/scheduler-plugins v0.31.8/go.mod h1:KkcXEbf9CYaoZ5ntbAMSYmquPq9MtSfXVpI31R6mHeM= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +volcano.sh/apis v1.11.2 h1:Vz8NzP0af8vyxRccrEUt6/FikD5eeEOnCZRolVzZvK8= +volcano.sh/apis v1.11.2/go.mod h1:FOdmG++9+8lgENJ9XXDh+O3Jcb9YVRnlMSpgIh3NSVI= diff --git a/pkg/client/applyconfiguration/orchestration/v1alpha1/coschedulingschedulingstrategyspec.go b/pkg/client/applyconfiguration/orchestration/v1alpha1/coschedulingschedulingstrategyspec.go new file mode 100644 index 000000000..a21367916 --- /dev/null +++ b/pkg/client/applyconfiguration/orchestration/v1alpha1/coschedulingschedulingstrategyspec.go @@ -0,0 +1,60 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1 "k8s.io/api/core/v1" +) + +// CoschedulingSchedulingStrategySpecApplyConfiguration represents a declarative configuration of the CoschedulingSchedulingStrategySpec type for use +// with apply. +type CoschedulingSchedulingStrategySpecApplyConfiguration struct { + MinMember *int32 `json:"minMember,omitempty"` + MinResources *v1.ResourceList `json:"minResources,omitempty"` + ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` +} + +// CoschedulingSchedulingStrategySpecApplyConfiguration constructs a declarative configuration of the CoschedulingSchedulingStrategySpec type for use with +// apply. +func CoschedulingSchedulingStrategySpec() *CoschedulingSchedulingStrategySpecApplyConfiguration { + return &CoschedulingSchedulingStrategySpecApplyConfiguration{} +} + +// WithMinMember sets the MinMember field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MinMember field is set to the value of the last call. +func (b *CoschedulingSchedulingStrategySpecApplyConfiguration) WithMinMember(value int32) *CoschedulingSchedulingStrategySpecApplyConfiguration { + b.MinMember = &value + return b +} + +// WithMinResources sets the MinResources field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MinResources field is set to the value of the last call. +func (b *CoschedulingSchedulingStrategySpecApplyConfiguration) WithMinResources(value v1.ResourceList) *CoschedulingSchedulingStrategySpecApplyConfiguration { + b.MinResources = &value + return b +} + +// WithScheduleTimeoutSeconds sets the ScheduleTimeoutSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ScheduleTimeoutSeconds field is set to the value of the last call. +func (b *CoschedulingSchedulingStrategySpecApplyConfiguration) WithScheduleTimeoutSeconds(value int32) *CoschedulingSchedulingStrategySpecApplyConfiguration { + b.ScheduleTimeoutSeconds = &value + return b +} diff --git a/pkg/client/applyconfiguration/orchestration/v1alpha1/godelschedulingstrategyspec.go b/pkg/client/applyconfiguration/orchestration/v1alpha1/godelschedulingstrategyspec.go new file mode 100644 index 000000000..970d8daef --- /dev/null +++ b/pkg/client/applyconfiguration/orchestration/v1alpha1/godelschedulingstrategyspec.go @@ -0,0 +1,78 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1alpha1 "github.com/kubewharf/godel-scheduler-api/pkg/apis/scheduling/v1alpha1" +) + +// GodelSchedulingStrategySpecApplyConfiguration represents a declarative configuration of the GodelSchedulingStrategySpec type for use +// with apply. +type GodelSchedulingStrategySpecApplyConfiguration struct { + MinMember *int32 `json:"minMember,omitempty"` + PriorityClassName *string `json:"priorityClassName,omitempty"` + ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` + Application *string `json:"application,omitempty"` + Affinity *v1alpha1.Affinity `json:"affinity,omitempty"` +} + +// GodelSchedulingStrategySpecApplyConfiguration constructs a declarative configuration of the GodelSchedulingStrategySpec type for use with +// apply. +func GodelSchedulingStrategySpec() *GodelSchedulingStrategySpecApplyConfiguration { + return &GodelSchedulingStrategySpecApplyConfiguration{} +} + +// WithMinMember sets the MinMember field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MinMember field is set to the value of the last call. +func (b *GodelSchedulingStrategySpecApplyConfiguration) WithMinMember(value int32) *GodelSchedulingStrategySpecApplyConfiguration { + b.MinMember = &value + return b +} + +// WithPriorityClassName sets the PriorityClassName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the PriorityClassName field is set to the value of the last call. +func (b *GodelSchedulingStrategySpecApplyConfiguration) WithPriorityClassName(value string) *GodelSchedulingStrategySpecApplyConfiguration { + b.PriorityClassName = &value + return b +} + +// WithScheduleTimeoutSeconds sets the ScheduleTimeoutSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ScheduleTimeoutSeconds field is set to the value of the last call. +func (b *GodelSchedulingStrategySpecApplyConfiguration) WithScheduleTimeoutSeconds(value int32) *GodelSchedulingStrategySpecApplyConfiguration { + b.ScheduleTimeoutSeconds = &value + return b +} + +// WithApplication sets the Application field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Application field is set to the value of the last call. +func (b *GodelSchedulingStrategySpecApplyConfiguration) WithApplication(value string) *GodelSchedulingStrategySpecApplyConfiguration { + b.Application = &value + return b +} + +// WithAffinity sets the Affinity field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Affinity field is set to the value of the last call. +func (b *GodelSchedulingStrategySpecApplyConfiguration) WithAffinity(value v1alpha1.Affinity) *GodelSchedulingStrategySpecApplyConfiguration { + b.Affinity = &value + return b +} diff --git a/pkg/client/applyconfiguration/orchestration/v1alpha1/rolespec.go b/pkg/client/applyconfiguration/orchestration/v1alpha1/rolespec.go index 65b8d32cb..6dfc6eae9 100644 --- a/pkg/client/applyconfiguration/orchestration/v1alpha1/rolespec.go +++ b/pkg/client/applyconfiguration/orchestration/v1alpha1/rolespec.go @@ -32,6 +32,7 @@ type RoleSpecApplyConfiguration struct { Stateful *bool `json:"stateful,omitempty"` Template *v1.PodTemplateSpec `json:"template,omitempty"` DisruptionTolerance *DisruptionToleranceApplyConfiguration `json:"disruptionTolerance,omitempty"` + SchedulingStrategy *SchedulingStrategyApplyConfiguration `json:"schedulingStrategy,omitempty"` } // RoleSpecApplyConfiguration constructs a declarative configuration of the RoleSpec type for use with @@ -103,3 +104,11 @@ func (b *RoleSpecApplyConfiguration) WithDisruptionTolerance(value *DisruptionTo b.DisruptionTolerance = value return b } + +// WithSchedulingStrategy sets the SchedulingStrategy field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the SchedulingStrategy field is set to the value of the last call. +func (b *RoleSpecApplyConfiguration) WithSchedulingStrategy(value *SchedulingStrategyApplyConfiguration) *RoleSpecApplyConfiguration { + b.SchedulingStrategy = value + return b +} diff --git a/pkg/client/applyconfiguration/orchestration/v1alpha1/schedulingstrategy.go b/pkg/client/applyconfiguration/orchestration/v1alpha1/schedulingstrategy.go index b0d9bcd1e..ebe784a48 100644 --- a/pkg/client/applyconfiguration/orchestration/v1alpha1/schedulingstrategy.go +++ b/pkg/client/applyconfiguration/orchestration/v1alpha1/schedulingstrategy.go @@ -17,14 +17,12 @@ limitations under the License. package v1alpha1 -import ( - v1alpha1 "github.com/kubewharf/godel-scheduler-api/pkg/apis/scheduling/v1alpha1" -) - // SchedulingStrategyApplyConfiguration represents a declarative configuration of the SchedulingStrategy type for use // with apply. type SchedulingStrategyApplyConfiguration struct { - PodGroup *v1alpha1.PodGroupSpec `json:"podGroup,omitempty"` + GodelSchedulingStrategy *GodelSchedulingStrategySpecApplyConfiguration `json:"godelSchedulingStrategy,omitempty"` + CoschedulingSchedulingStrategy *CoschedulingSchedulingStrategySpecApplyConfiguration `json:"coschedulingSchedulingStrategy,omitempty"` + VolcanoSchedulingStrategy *VolcanoSchedulingStrategySpecApplyConfiguration `json:"volcanoSchedulingStrategy,omitempty"` } // SchedulingStrategyApplyConfiguration constructs a declarative configuration of the SchedulingStrategy type for use with @@ -33,10 +31,26 @@ func SchedulingStrategy() *SchedulingStrategyApplyConfiguration { return &SchedulingStrategyApplyConfiguration{} } -// WithPodGroup sets the PodGroup field in the declarative configuration to the given value +// WithGodelSchedulingStrategy sets the GodelSchedulingStrategy field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GodelSchedulingStrategy field is set to the value of the last call. +func (b *SchedulingStrategyApplyConfiguration) WithGodelSchedulingStrategy(value *GodelSchedulingStrategySpecApplyConfiguration) *SchedulingStrategyApplyConfiguration { + b.GodelSchedulingStrategy = value + return b +} + +// WithCoschedulingSchedulingStrategy sets the CoschedulingSchedulingStrategy field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the CoschedulingSchedulingStrategy field is set to the value of the last call. +func (b *SchedulingStrategyApplyConfiguration) WithCoschedulingSchedulingStrategy(value *CoschedulingSchedulingStrategySpecApplyConfiguration) *SchedulingStrategyApplyConfiguration { + b.CoschedulingSchedulingStrategy = value + return b +} + +// WithVolcanoSchedulingStrategy sets the VolcanoSchedulingStrategy field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the PodGroup field is set to the value of the last call. -func (b *SchedulingStrategyApplyConfiguration) WithPodGroup(value v1alpha1.PodGroupSpec) *SchedulingStrategyApplyConfiguration { - b.PodGroup = &value +// If called multiple times, the VolcanoSchedulingStrategy field is set to the value of the last call. +func (b *SchedulingStrategyApplyConfiguration) WithVolcanoSchedulingStrategy(value *VolcanoSchedulingStrategySpecApplyConfiguration) *SchedulingStrategyApplyConfiguration { + b.VolcanoSchedulingStrategy = value return b } diff --git a/pkg/client/applyconfiguration/orchestration/v1alpha1/volcanoschedulingstrategyspec.go b/pkg/client/applyconfiguration/orchestration/v1alpha1/volcanoschedulingstrategyspec.go new file mode 100644 index 000000000..7d44f38ee --- /dev/null +++ b/pkg/client/applyconfiguration/orchestration/v1alpha1/volcanoschedulingstrategyspec.go @@ -0,0 +1,84 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1 "k8s.io/api/core/v1" +) + +// VolcanoSchedulingStrategySpecApplyConfiguration represents a declarative configuration of the VolcanoSchedulingStrategySpec type for use +// with apply. +type VolcanoSchedulingStrategySpecApplyConfiguration struct { + MinMember *int32 `json:"minMember,omitempty"` + MinTaskMember map[string]int32 `json:"minTaskMember,omitempty"` + Queue *string `json:"queue,omitempty"` + PriorityClassName *string `json:"priorityClassName,omitempty"` + MinResources *v1.ResourceList `json:"minResources,omitempty"` +} + +// VolcanoSchedulingStrategySpecApplyConfiguration constructs a declarative configuration of the VolcanoSchedulingStrategySpec type for use with +// apply. +func VolcanoSchedulingStrategySpec() *VolcanoSchedulingStrategySpecApplyConfiguration { + return &VolcanoSchedulingStrategySpecApplyConfiguration{} +} + +// WithMinMember sets the MinMember field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MinMember field is set to the value of the last call. +func (b *VolcanoSchedulingStrategySpecApplyConfiguration) WithMinMember(value int32) *VolcanoSchedulingStrategySpecApplyConfiguration { + b.MinMember = &value + return b +} + +// WithMinTaskMember puts the entries into the MinTaskMember field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the MinTaskMember field, +// overwriting an existing map entries in MinTaskMember field with the same key. +func (b *VolcanoSchedulingStrategySpecApplyConfiguration) WithMinTaskMember(entries map[string]int32) *VolcanoSchedulingStrategySpecApplyConfiguration { + if b.MinTaskMember == nil && len(entries) > 0 { + b.MinTaskMember = make(map[string]int32, len(entries)) + } + for k, v := range entries { + b.MinTaskMember[k] = v + } + return b +} + +// WithQueue sets the Queue field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Queue field is set to the value of the last call. +func (b *VolcanoSchedulingStrategySpecApplyConfiguration) WithQueue(value string) *VolcanoSchedulingStrategySpecApplyConfiguration { + b.Queue = &value + return b +} + +// WithPriorityClassName sets the PriorityClassName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the PriorityClassName field is set to the value of the last call. +func (b *VolcanoSchedulingStrategySpecApplyConfiguration) WithPriorityClassName(value string) *VolcanoSchedulingStrategySpecApplyConfiguration { + b.PriorityClassName = &value + return b +} + +// WithMinResources sets the MinResources field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MinResources field is set to the value of the last call. +func (b *VolcanoSchedulingStrategySpecApplyConfiguration) WithMinResources(value v1.ResourceList) *VolcanoSchedulingStrategySpecApplyConfiguration { + b.MinResources = &value + return b +} diff --git a/pkg/client/applyconfiguration/utils.go b/pkg/client/applyconfiguration/utils.go index a25ee3e78..8c3d9e16a 100644 --- a/pkg/client/applyconfiguration/utils.go +++ b/pkg/client/applyconfiguration/utils.go @@ -55,8 +55,12 @@ func ForKind(kind schema.GroupVersionKind) interface{} { // Group=orchestration, Version=v1alpha1 case orchestrationv1alpha1.SchemeGroupVersion.WithKind("Condition"): return &applyconfigurationorchestrationv1alpha1.ConditionApplyConfiguration{} + case orchestrationv1alpha1.SchemeGroupVersion.WithKind("CoschedulingSchedulingStrategySpec"): + return &applyconfigurationorchestrationv1alpha1.CoschedulingSchedulingStrategySpecApplyConfiguration{} case orchestrationv1alpha1.SchemeGroupVersion.WithKind("DisruptionTolerance"): return &applyconfigurationorchestrationv1alpha1.DisruptionToleranceApplyConfiguration{} + case orchestrationv1alpha1.SchemeGroupVersion.WithKind("GodelSchedulingStrategySpec"): + return &applyconfigurationorchestrationv1alpha1.GodelSchedulingStrategySpecApplyConfiguration{} case orchestrationv1alpha1.SchemeGroupVersion.WithKind("RayClusterFleet"): return &applyconfigurationorchestrationv1alpha1.RayClusterFleetApplyConfiguration{} case orchestrationv1alpha1.SchemeGroupVersion.WithKind("RayClusterFleetCondition"): @@ -91,6 +95,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &applyconfigurationorchestrationv1alpha1.StormServiceStatusApplyConfiguration{} case orchestrationv1alpha1.SchemeGroupVersion.WithKind("StormServiceUpdateStrategy"): return &applyconfigurationorchestrationv1alpha1.StormServiceUpdateStrategyApplyConfiguration{} + case orchestrationv1alpha1.SchemeGroupVersion.WithKind("VolcanoSchedulingStrategySpec"): + return &applyconfigurationorchestrationv1alpha1.VolcanoSchedulingStrategySpecApplyConfiguration{} } return nil diff --git a/pkg/controller/constants/stormservice.go b/pkg/controller/constants/stormservice.go index d1f8433d3..746c48eb9 100644 --- a/pkg/controller/constants/stormservice.go +++ b/pkg/controller/constants/stormservice.go @@ -17,7 +17,9 @@ limitations under the License. package constants const ( - GodelPodGroupNameAnnotationKey = "godel.bytedance.com/pod-group-name" + GodelPodGroupNameAnnotationKey = "godel.bytedance.com/pod-group-name" + CoschedulingPodGroupNameLabelKey = "scheduling.x-k8s.io/pod-group" + VolcanoPodGroupNameAnnotationKey = "scheduling.volcano.sh/group-name" RoleSetNameLabelKey = "roleset-name" StormServiceNameLabelKey = "storm-service-name" diff --git a/pkg/controller/podset/podset_controller.go b/pkg/controller/podset/podset_controller.go index 20bdf8f0d..94cb1e5d9 100644 --- a/pkg/controller/podset/podset_controller.go +++ b/pkg/controller/podset/podset_controller.go @@ -21,6 +21,7 @@ import ( "fmt" "sort" "strconv" + "time" v1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" @@ -29,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/selection" + "k8s.io/client-go/dynamic" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -37,16 +39,24 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + schedv1alpha1 "github.com/kubewharf/godel-scheduler-api/pkg/apis/scheduling/v1alpha1" orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1" "github.com/vllm-project/aibrix/pkg/config" "github.com/vllm-project/aibrix/pkg/controller/constants" ctrlutil "github.com/vllm-project/aibrix/pkg/controller/util" + utils "github.com/vllm-project/aibrix/pkg/controller/util/orchestration" podutil "github.com/vllm-project/aibrix/pkg/utils" + schedulerpluginsv1aplha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + volcanoschedv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" ) const ( ControllerName = "podset-controller" PodSetFinalizer = "orchestration.aibrix.ai/podset-finalizer" + + PodGroupSyncedEventType = "PodGroupSynced" + + DefaultRequeueAfter = 15 * time.Second ) // controllerKind contains the schema.GroupVersionKind for this controller type. @@ -82,6 +92,7 @@ func newReconciler(mgr manager.Manager, runtimeConfig config.RuntimeConfig) (rec Client: mgr.GetClient(), Scheme: mgr.GetScheme(), EventRecorder: mgr.GetEventRecorderFor(ControllerName), + DynamicClient: dynamic.NewForConfigOrDie(mgr.GetConfig()), } return reconciler, nil } @@ -91,6 +102,7 @@ type PodSetReconciler struct { client.Client Scheme *runtime.Scheme EventRecorder record.EventRecorder + DynamicClient dynamic.Interface } //+kubebuilder:rbac:groups=orchestration.aibrix.ai,resources=podsets,verbs=get;list;watch;create;update;patch;delete @@ -121,6 +133,12 @@ func (r *PodSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, r.Update(ctx, podSet) } + // Reconcile podgroup + if err := r.reconcilePodGroup(ctx, podSet); err != nil { + r.EventRecorder.Eventf(podSet, v1.EventTypeWarning, "ReconcileError", "Failed to reconcile podgroup: %v", err) + return ctrl.Result{}, err + } + // Reconcile pods if err := r.reconcilePods(ctx, podSet); err != nil { r.EventRecorder.Eventf(podSet, v1.EventTypeWarning, "ReconcileError", "Failed to reconcile pods: %v", err) @@ -135,6 +153,61 @@ func (r *PodSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } +func (r *PodSetReconciler) reconcilePodGroup(ctx context.Context, podSet *orchestrationv1alpha1.PodSet) error { + if podSet == nil || podSet.Spec.SchedulingStrategy == nil { + return nil + } + + podGroupMeta := metav1.ObjectMeta{ + Name: podSet.Name, + Namespace: podSet.Namespace, + Labels: map[string]string{ + constants.PodSetNameLabelKey: podSet.Name, + }, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(podSet, orchestrationv1alpha1.SchemeGroupVersion.WithKind(orchestrationv1alpha1.PodSetKind)), + }, + } + + if podSet.Spec.SchedulingStrategy.GodelSchedulingStrategy != nil { + expectedGroup := &schedv1alpha1.PodGroup{ + ObjectMeta: podGroupMeta, + Spec: schedv1alpha1.PodGroupSpec(*podSet.Spec.SchedulingStrategy.GodelSchedulingStrategy), + } + expectedGroup.SetGroupVersionKind(schedv1alpha1.SchemeGroupVersion.WithKind("PodGroup")) + if created, err := utils.EnsurePodGroupExist(ctx, r.DynamicClient, expectedGroup, podSet.Name, podSet.Namespace); err != nil { + return err + } else if created { + r.EventRecorder.Eventf(podSet, v1.EventTypeNormal, PodGroupSyncedEventType, "pod group %s synced", podSet.Name) + } + } + if podSet.Spec.SchedulingStrategy.CoschedulingSchedulingStrategy != nil { + expectedGroup := &schedulerpluginsv1aplha1.PodGroup{ + ObjectMeta: podGroupMeta, + Spec: schedulerpluginsv1aplha1.PodGroupSpec(*podSet.Spec.SchedulingStrategy.CoschedulingSchedulingStrategy), + } + expectedGroup.SetGroupVersionKind(schedulerpluginsv1aplha1.SchemeGroupVersion.WithKind("PodGroup")) + if created, err := utils.EnsurePodGroupExist(ctx, r.DynamicClient, expectedGroup, podSet.Name, podSet.Namespace); err != nil { + return err + } else if created { + r.EventRecorder.Eventf(podSet, v1.EventTypeNormal, PodGroupSyncedEventType, "pod group %s synced", podSet.Name) + } + } + if podSet.Spec.SchedulingStrategy.VolcanoSchedulingStrategy != nil { + expectedGroup := &volcanoschedv1beta1.PodGroup{ + ObjectMeta: podGroupMeta, + Spec: volcanoschedv1beta1.PodGroupSpec(*podSet.Spec.SchedulingStrategy.VolcanoSchedulingStrategy), + } + expectedGroup.SetGroupVersionKind(volcanoschedv1beta1.SchemeGroupVersion.WithKind("PodGroup")) + if created, err := utils.EnsurePodGroupExist(ctx, r.DynamicClient, expectedGroup, podSet.Name, podSet.Namespace); err != nil { + return err + } else if created { + r.EventRecorder.Eventf(podSet, v1.EventTypeNormal, PodGroupSyncedEventType, "pod group %s synced", podSet.Name) + } + } + return nil +} + func (r *PodSetReconciler) reconcilePods(ctx context.Context, podSet *orchestrationv1alpha1.PodSet) error { // Get existing pods podList, err := r.getPodsForPodSet(ctx, podSet) @@ -311,6 +384,17 @@ func (r *PodSetReconciler) finalizePodSet(ctx context.Context, podSet *orchestra return ctrl.Result{Requeue: true}, nil } + // Delete podgroup + if err := utils.FinalizePodGroup(ctx, r.DynamicClient, r.Client, &schedv1alpha1.PodGroup{}, podSet.Name, podSet.Namespace); err != nil { + return ctrl.Result{RequeueAfter: DefaultRequeueAfter}, err + } + if err := utils.FinalizePodGroup(ctx, r.DynamicClient, r.Client, &schedulerpluginsv1aplha1.PodGroup{}, podSet.Name, podSet.Namespace); err != nil { + return ctrl.Result{RequeueAfter: DefaultRequeueAfter}, err + } + if err := utils.FinalizePodGroup(ctx, r.DynamicClient, r.Client, &volcanoschedv1beta1.PodGroup{}, podSet.Name, podSet.Namespace); err != nil { + return ctrl.Result{RequeueAfter: DefaultRequeueAfter}, err + } + // Remove finalizer controllerutil.RemoveFinalizer(podSet, PodSetFinalizer) return ctrl.Result{}, r.Update(ctx, podSet) diff --git a/pkg/controller/roleset/podset_rollsyncer.go b/pkg/controller/roleset/podset_rollsyncer.go index 9915c439e..6ef559fb2 100644 --- a/pkg/controller/roleset/podset_rollsyncer.go +++ b/pkg/controller/roleset/podset_rollsyncer.go @@ -378,9 +378,10 @@ func (p *PodSetRoleSyncer) createPodSetForRole(roleSet *orchestrationv1alpha1.Ro }, }, Spec: orchestrationv1alpha1.PodSetSpec{ - PodGroupSize: *role.PodGroupSize, - Template: podTemplate, - Stateful: role.Stateful, + PodGroupSize: *role.PodGroupSize, + Template: podTemplate, + Stateful: role.Stateful, + SchedulingStrategy: role.SchedulingStrategy, }, } @@ -399,8 +400,31 @@ func (p *PodSetRoleSyncer) createPodSetForRole(roleSet *orchestrationv1alpha1.Ro podSet.Labels[constants.RoleReplicaIndexLabelKey] = fmt.Sprintf("%d", *roleIndex) } - if roleSet.Spec.SchedulingStrategy.PodGroup != nil { - podSet.Annotations[constants.GodelPodGroupNameAnnotationKey] = roleSet.Name + if roleSet.Spec.SchedulingStrategy != nil { + if roleSet.Spec.SchedulingStrategy.CoschedulingSchedulingStrategy != nil { + podSet.Labels[constants.CoschedulingPodGroupNameLabelKey] = roleSet.Name + } + if roleSet.Spec.SchedulingStrategy.GodelSchedulingStrategy != nil { + podSet.Annotations[constants.GodelPodGroupNameAnnotationKey] = roleSet.Name + podSet.Labels[constants.GodelPodGroupNameAnnotationKey] = roleSet.Name + } + if roleSet.Spec.SchedulingStrategy.VolcanoSchedulingStrategy != nil { + podSet.Annotations[constants.VolcanoPodGroupNameAnnotationKey] = roleSet.Name + podSet.Labels[constants.VolcanoPodGroupNameAnnotationKey] = roleSet.Name + } + } + if role.SchedulingStrategy != nil { // note that roleSet.Spec.SchedulingStrategy and role.SchedulingStrategy should not be set concurrently + if role.SchedulingStrategy.CoschedulingSchedulingStrategy != nil { + podSet.Labels[constants.CoschedulingPodGroupNameLabelKey] = podSet.Name + } + if role.SchedulingStrategy.GodelSchedulingStrategy != nil { + podSet.Annotations[constants.GodelPodGroupNameAnnotationKey] = podSet.Name + podSet.Labels[constants.GodelPodGroupNameAnnotationKey] = podSet.Name + } + if role.SchedulingStrategy.VolcanoSchedulingStrategy != nil { + podSet.Annotations[constants.VolcanoPodGroupNameAnnotationKey] = podSet.Name + podSet.Labels[constants.VolcanoPodGroupNameAnnotationKey] = podSet.Name + } } // inject container env diff --git a/pkg/controller/roleset/roleset_controller.go b/pkg/controller/roleset/roleset_controller.go index 02dfdd501..dece88ae4 100644 --- a/pkg/controller/roleset/roleset_controller.go +++ b/pkg/controller/roleset/roleset_controller.go @@ -24,6 +24,7 @@ import ( v1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/dynamic" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -85,6 +86,7 @@ func newReconciler(mgr manager.Manager, runtimeConfig config.RuntimeConfig) (rec Client: mgr.GetClient(), Scheme: mgr.GetScheme(), EventRecorder: mgr.GetEventRecorderFor(ControllerName), + DynamicClient: dynamic.NewForConfigOrDie(mgr.GetConfig()), } return reconciler, nil } @@ -95,6 +97,7 @@ type RoleSetReconciler struct { Scheme *runtime.Scheme EventRecorder record.EventRecorder + DynamicClient dynamic.Interface } // +kubebuilder:rbac:groups=orchestration.aibrix.ai,resources=rolesets,verbs=get;list;watch;create;update;patch;delete @@ -131,11 +134,10 @@ func (r *RoleSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct var managedErrors []error - // TODO: disable pod group at this moment, consider to switch to open source pod group api later. // 1. sync pod group - //if err := r.syncPodGroup(ctx, roleSet, &roleSet.Spec); err != nil { - // managedErrors = append(managedErrors, fmt.Errorf("sync pod group error %v", err)) - //} + if err := r.syncPodGroup(ctx, roleSet, &roleSet.Spec); err != nil { + managedErrors = append(managedErrors, fmt.Errorf("sync pod group error %v", err)) + } // 2. sync pods err := r.syncPods(ctx, roleSet) diff --git a/pkg/controller/roleset/sync.go b/pkg/controller/roleset/sync.go index 9595393f8..33b7ae754 100644 --- a/pkg/controller/roleset/sync.go +++ b/pkg/controller/roleset/sync.go @@ -30,6 +30,8 @@ import ( "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + schedulerpluginsv1aplha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" + volcanoschedv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1" "github.com/vllm-project/aibrix/pkg/controller/constants" @@ -39,31 +41,56 @@ import ( ) func (r *RoleSetReconciler) syncPodGroup(ctx context.Context, roleSet *orchestrationv1alpha1.RoleSet, spec *orchestrationv1alpha1.RoleSetSpec) error { - if spec.SchedulingStrategy.PodGroup == nil { + if spec.SchedulingStrategy == nil { return nil } - expectedGroup := &schedv1alpha1.PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Name: roleSet.Name, - Namespace: roleSet.Namespace, - Labels: map[string]string{ - constants.RoleSetNameLabelKey: roleSet.Name, - }, - OwnerReferences: []metav1.OwnerReference{ - *metav1.NewControllerRef(roleSet, orchestrationv1alpha1.SchemeGroupVersion.WithKind(orchestrationv1alpha1.RoleSetKind)), - }, + + podGroupMeta := metav1.ObjectMeta{ + Name: roleSet.Name, + Namespace: roleSet.Namespace, + Labels: map[string]string{ + constants.RoleSetNameLabelKey: roleSet.Name, + }, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(roleSet, orchestrationv1alpha1.SchemeGroupVersion.WithKind(orchestrationv1alpha1.RoleSetKind)), }, - Spec: *spec.SchedulingStrategy.PodGroup, } - podGroup := &schedv1alpha1.PodGroup{} - if err := r.Client.Get(ctx, client.ObjectKey{Name: roleSet.Name, Namespace: roleSet.Namespace}, podGroup); client.IgnoreNotFound(err) != nil { - return err - } else if err != nil { - // not found pg, need to create - if err = r.Client.Create(ctx, expectedGroup); err == nil { + + if spec.SchedulingStrategy.GodelSchedulingStrategy != nil { + expectedGroup := &schedv1alpha1.PodGroup{ + ObjectMeta: podGroupMeta, + Spec: schedv1alpha1.PodGroupSpec(*spec.SchedulingStrategy.GodelSchedulingStrategy), + } + expectedGroup.SetGroupVersionKind(schedv1alpha1.SchemeGroupVersion.WithKind("PodGroup")) + if created, err := utils.EnsurePodGroupExist(ctx, r.DynamicClient, expectedGroup, roleSet.Name, roleSet.Namespace); err != nil { + return err + } else if created { + r.EventRecorder.Eventf(roleSet, v1.EventTypeNormal, PodGroupSyncedEventType, "pod group %s synced", roleSet.Name) + } + } + if spec.SchedulingStrategy.CoschedulingSchedulingStrategy != nil { + expectedGroup := &schedulerpluginsv1aplha1.PodGroup{ + ObjectMeta: podGroupMeta, + Spec: schedulerpluginsv1aplha1.PodGroupSpec(*spec.SchedulingStrategy.CoschedulingSchedulingStrategy), + } + expectedGroup.SetGroupVersionKind(schedulerpluginsv1aplha1.SchemeGroupVersion.WithKind("PodGroup")) + if created, err := utils.EnsurePodGroupExist(ctx, r.DynamicClient, expectedGroup, roleSet.Name, roleSet.Namespace); err != nil { + return err + } else if created { + r.EventRecorder.Eventf(roleSet, v1.EventTypeNormal, PodGroupSyncedEventType, "pod group %s synced", roleSet.Name) + } + } + if spec.SchedulingStrategy.VolcanoSchedulingStrategy != nil { + expectedGroup := &volcanoschedv1beta1.PodGroup{ + ObjectMeta: podGroupMeta, + Spec: volcanoschedv1beta1.PodGroupSpec(*spec.SchedulingStrategy.VolcanoSchedulingStrategy), + } + expectedGroup.SetGroupVersionKind(volcanoschedv1beta1.SchemeGroupVersion.WithKind("PodGroup")) + if created, err := utils.EnsurePodGroupExist(ctx, r.DynamicClient, expectedGroup, roleSet.Name, roleSet.Namespace); err != nil { + return err + } else if created { r.EventRecorder.Eventf(roleSet, v1.EventTypeNormal, PodGroupSyncedEventType, "pod group %s synced", roleSet.Name) } - return err } return nil } @@ -262,18 +289,16 @@ func (r *RoleSetReconciler) finalize(ctx context.Context, roleSet *orchestration return false, nil } - // TODO: temporarily disable pod group - // 2. check if pg is deleted - //podGroup := &schedv1alpha1.PodGroup{} - //if err := r.Client.Get(ctx, client.ObjectKey{Name: roleSet.Name, Namespace: roleSet.Namespace}, podGroup); client.IgnoreNotFound(err) != nil { - // return false, err - //} else if err == nil { - // // delete pg - // if err = r.Client.Delete(ctx, podGroup); err != nil { - // return false, err - // } - // return false, nil - //} + // 3. check if pg is deleted + if err := utils.FinalizePodGroup(ctx, r.DynamicClient, r.Client, &schedv1alpha1.PodGroup{}, roleSet.Name, roleSet.Namespace); err != nil { + return false, err + } + if err := utils.FinalizePodGroup(ctx, r.DynamicClient, r.Client, &schedulerpluginsv1aplha1.PodGroup{}, roleSet.Name, roleSet.Namespace); err != nil { + return false, err + } + if err := utils.FinalizePodGroup(ctx, r.DynamicClient, r.Client, &volcanoschedv1beta1.PodGroup{}, roleSet.Name, roleSet.Namespace); err != nil { + return false, err + } // 3. remove finalizer if controllerutil.ContainsFinalizer(roleSet, RoleSetFinalizer) { diff --git a/pkg/controller/roleset/utils.go b/pkg/controller/roleset/utils.go index 66eee6344..c4abb1f77 100644 --- a/pkg/controller/roleset/utils.go +++ b/pkg/controller/roleset/utils.go @@ -118,8 +118,16 @@ func renderStormServicePod(roleSet *orchestrationv1alpha1.RoleSet, role *orchest pod.Labels[k] = v } } - if roleSet.Spec.SchedulingStrategy.PodGroup != nil { - pod.Labels[constants.GodelPodGroupNameAnnotationKey] = roleSet.Name + if roleSet.Spec.SchedulingStrategy != nil { + if roleSet.Spec.SchedulingStrategy.CoschedulingSchedulingStrategy != nil { + pod.Labels[constants.CoschedulingPodGroupNameLabelKey] = roleSet.Name + } + if roleSet.Spec.SchedulingStrategy.GodelSchedulingStrategy != nil { + pod.Labels[constants.GodelPodGroupNameAnnotationKey] = roleSet.Name + } + if roleSet.Spec.SchedulingStrategy.VolcanoSchedulingStrategy != nil { + pod.Labels[constants.VolcanoPodGroupNameAnnotationKey] = roleSet.Name + } } // inject pod annotations @@ -129,8 +137,13 @@ func renderStormServicePod(roleSet *orchestrationv1alpha1.RoleSet, role *orchest // inject to label as well for routing service discovery (some engines use label selector to find pods only) pod.Labels[constants.RoleReplicaIndexLabelKey] = fmt.Sprintf("%d", *roleIndex) } - if roleSet.Spec.SchedulingStrategy.PodGroup != nil { - pod.Annotations[constants.GodelPodGroupNameAnnotationKey] = roleSet.Name + if roleSet.Spec.SchedulingStrategy != nil { + if roleSet.Spec.SchedulingStrategy.VolcanoSchedulingStrategy != nil { + pod.Annotations[constants.VolcanoPodGroupNameAnnotationKey] = roleSet.Name + } + if roleSet.Spec.SchedulingStrategy.GodelSchedulingStrategy != nil { + pod.Annotations[constants.GodelPodGroupNameAnnotationKey] = roleSet.Name + } } // manually set the hostname and subdomain for FQDN diff --git a/pkg/controller/roleset/utils_test.go b/pkg/controller/roleset/utils_test.go index 30b05ebb2..957fad69a 100644 --- a/pkg/controller/roleset/utils_test.go +++ b/pkg/controller/roleset/utils_test.go @@ -22,7 +22,6 @@ import ( "github.com/stretchr/testify/assert" - schedv1alpha1 "github.com/kubewharf/godel-scheduler-api/pkg/apis/scheduling/v1alpha1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" intstrutil "k8s.io/apimachinery/pkg/util/intstr" @@ -168,7 +167,7 @@ func TestRenderStormServicePod_WithoutRoleIndex(t *testing.T) { assert.NotContains(t, pod.Annotations, constants.RoleReplicaIndexAnnotationKey) } -func TestRenderStormServicePod_WithPodGroup(t *testing.T) { +func TestRenderStormServicePod_WithRoleSetCoschedulingPodGroup(t *testing.T) { roleSet := &orchestrationv1alpha1.RoleSet{ ObjectMeta: metav1.ObjectMeta{ Name: "test-role-set", @@ -177,8 +176,47 @@ func TestRenderStormServicePod_WithPodGroup(t *testing.T) { }, }, Spec: orchestrationv1alpha1.RoleSetSpec{ - SchedulingStrategy: orchestrationv1alpha1.SchedulingStrategy{ - PodGroup: &schedv1alpha1.PodGroupSpec{ + SchedulingStrategy: &orchestrationv1alpha1.SchedulingStrategy{ + CoschedulingSchedulingStrategy: &orchestrationv1alpha1.CoschedulingSchedulingStrategySpec{ + MinMember: 4, + }, + }, + Roles: []orchestrationv1alpha1.RoleSpec{ + { + Name: "test-role", + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "test-container"}, + }, + }, + }, + }, + }, + }, + } + + roleIndex := 0 + pod := &corev1.Pod{ + Spec: *roleSet.Spec.Roles[0].Template.Spec.DeepCopy(), + } + renderStormServicePod(roleSet, &roleSet.Spec.Roles[0], pod, &roleIndex) + + // Verify pod group labels and annotations + assert.Equal(t, "test-role-set", pod.Labels[constants.CoschedulingPodGroupNameLabelKey]) +} + +func TestRenderStormServicePod_WithRoleSetGodelPodGroup(t *testing.T) { + roleSet := &orchestrationv1alpha1.RoleSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-role-set", + Labels: map[string]string{ + constants.StormServiceNameLabelKey: "test-service", + }, + }, + Spec: orchestrationv1alpha1.RoleSetSpec{ + SchedulingStrategy: &orchestrationv1alpha1.SchedulingStrategy{ + GodelSchedulingStrategy: &orchestrationv1alpha1.GodelSchedulingStrategySpec{ MinMember: 3, }, }, @@ -208,6 +246,46 @@ func TestRenderStormServicePod_WithPodGroup(t *testing.T) { assert.Equal(t, "test-role-set", pod.Annotations[constants.GodelPodGroupNameAnnotationKey]) } +func TestRenderStormServicePod_WithRoleSetVolcanoPodGroup(t *testing.T) { + roleSet := &orchestrationv1alpha1.RoleSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-role-set", + Labels: map[string]string{ + constants.StormServiceNameLabelKey: "test-service", + }, + }, + Spec: orchestrationv1alpha1.RoleSetSpec{ + SchedulingStrategy: &orchestrationv1alpha1.SchedulingStrategy{ + VolcanoSchedulingStrategy: &orchestrationv1alpha1.VolcanoSchedulingStrategySpec{ + MinMember: 3, + }, + }, + Roles: []orchestrationv1alpha1.RoleSpec{ + { + Name: "test-role", + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "test-container"}, + }, + }, + }, + }, + }, + }, + } + + roleIndex := 0 + pod := &corev1.Pod{ + Spec: *roleSet.Spec.Roles[0].Template.Spec.DeepCopy(), + } + renderStormServicePod(roleSet, &roleSet.Spec.Roles[0], pod, &roleIndex) + + // Verify pod group labels and annotations + assert.Equal(t, "test-role-set", pod.Labels[constants.VolcanoPodGroupNameAnnotationKey]) + assert.Equal(t, "test-role-set", pod.Annotations[constants.VolcanoPodGroupNameAnnotationKey]) +} + func TestRenderStormServicePod_EmptyLabelsAndAnnotations(t *testing.T) { roleSet := &orchestrationv1alpha1.RoleSet{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/controller/util/orchestration/util.go b/pkg/controller/util/orchestration/util.go index f92c0408a..8ae41527a 100644 --- a/pkg/controller/util/orchestration/util.go +++ b/pkg/controller/util/orchestration/util.go @@ -22,14 +22,19 @@ import ( "fmt" "hash/fnv" "reflect" + "strings" "sync" hashutil "github.com/vllm-project/aibrix/pkg/utils/hash" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/client-go/dynamic" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" "k8s.io/utils/integer" @@ -263,3 +268,54 @@ func MinInt(a, b int) int { } return b } + +func EnsurePodGroupExist(ctx context.Context, dc dynamic.Interface, podGroup client.Object, name, namespace string) (created bool, err error) { + crd := schema.GroupVersionResource{ + Group: "apiextensions.k8s.io", + Version: "v1", + Resource: "customresourcedefinitions", + } + crdName := fmt.Sprintf("%ss.%s", strings.ToLower(podGroup.GetObjectKind().GroupVersionKind().Kind), podGroup.GetObjectKind().GroupVersionKind().Group) + if _, err := dc.Resource(crd).Get(ctx, crdName, metav1.GetOptions{}); err != nil { + if apierrors.IsNotFound(err) { // crd is not installed + return false, nil + } + return false, err + } + + if _, err := dc.Resource(podGroup.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("podgroups")).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}); err != nil { + if apierrors.IsNotFound(err) { // podgroup not found, create it + podGroupUnstructed, err := runtime.DefaultUnstructuredConverter.ToUnstructured(podGroup) + if err != nil { + return false, err + } + _, err = dc.Resource(podGroup.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("podgroups")).Namespace(namespace).Create(ctx, &unstructured.Unstructured{Object: podGroupUnstructed}, metav1.CreateOptions{}) + return err == nil, err + } + return false, err // other get error + } + // podgroup already presented + return false, nil +} + +func FinalizePodGroup(ctx context.Context, dc dynamic.Interface, c client.Client, podGroup client.Object, name, namespace string) error { + crd := schema.GroupVersionResource{ + Group: "apiextensions.k8s.io", + Version: "v1", + Resource: "customresourcedefinitions", + } + if _, err := dc.Resource(crd).Get(ctx, fmt.Sprintf("%ss.%s", strings.ToLower(podGroup.GetObjectKind().GroupVersionKind().Kind), podGroup.GetObjectKind().GroupVersionKind().Group), metav1.GetOptions{}); err != nil { + if apierrors.IsNotFound(err) { // crd is not installed + return nil + } + return err + } + + if _, err := dc.Resource(podGroup.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("podgroups")).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}); err != nil { + if apierrors.IsNotFound(err) { // podgroup not found, done + return nil + } + return err + } + return dc.Resource(podGroup.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("podgroups")).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}) +} diff --git a/test/utils/wrapper/roleset.go b/test/utils/wrapper/roleset.go index bcf9f6898..b2157574d 100644 --- a/test/utils/wrapper/roleset.go +++ b/test/utils/wrapper/roleset.go @@ -20,6 +20,7 @@ import ( schedv1alpha1 "github.com/kubewharf/godel-scheduler-api/pkg/apis/scheduling/v1alpha1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" orchestrationapi "github.com/vllm-project/aibrix/api/orchestration/v1alpha1" ) @@ -66,11 +67,13 @@ func (w *RoleSetWrapper) UpdateStrategy(strategy orchestrationapi.RoleSetUpdateS // SchedulingStrategyPodGroup sets the PodGroup scheduling strategy. // This allows integration with kube-batchd or volcano for gang scheduling. func (w *RoleSetWrapper) SchedulingStrategyPodGroup(pgSpec *schedv1alpha1.PodGroupSpec) *RoleSetWrapper { - if w.roleset.Spec.SchedulingStrategy.PodGroup == nil { - w.roleset.Spec.SchedulingStrategy.PodGroup = &schedv1alpha1.PodGroupSpec{} + if w.roleset.Spec.SchedulingStrategy == nil { + w.roleset.Spec.SchedulingStrategy = &orchestrationapi.SchedulingStrategy{ + GodelSchedulingStrategy: &orchestrationapi.GodelSchedulingStrategySpec{}, + } } if pgSpec != nil { - *w.roleset.Spec.SchedulingStrategy.PodGroup = *pgSpec + w.roleset.Spec.SchedulingStrategy.GodelSchedulingStrategy = ptr.To(orchestrationapi.GodelSchedulingStrategySpec(*pgSpec)) } return w }