Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions charts/hami/templates/scheduler/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,7 @@ rules:
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "get", "list"]
- apiGroups: [""]
resources: ["resourcequotas"]
verbs: ["get", "list", "watch"]

16 changes: 16 additions & 0 deletions cmd/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,22 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
"vGPU core allocated from a container",
[]string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil,
)
quotaUsedDesc := prometheus.NewDesc(
"QuotaUsed",
"resourcequota usage for a certain device",
[]string{"quotanamespace", "quotaName", "limit"}, nil,
)
quotas := sher.GetQuotas()
for ns, val := range quotas.Quotas {
for quotaname, q := range *val {
ch <- prometheus.MustNewConstMetric(
quotaUsedDesc,
prometheus.GaugeValue,
float64(q.Used),
ns, quotaname, fmt.Sprint(q.Limit),
)
}
}
schedpods, _ := sher.GetScheduledPods()
for _, val := range schedpods {
for _, podSingleDevice := range val.Devices {
Expand Down
19 changes: 15 additions & 4 deletions pkg/device/ascend/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,12 @@ func InitDevices(config []VNPUConfig) []*Devices {
sort.Slice(dev.config.Templates, func(i, j int) bool {
return dev.config.Templates[i].Memory < dev.config.Templates[j].Memory
})
device.InRequestDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-to-allocate", commonWord)
device.SupportDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-allocated", commonWord)
util.HandshakeAnnos[commonWord] = dev.handshakeAnno
_, ok := device.InRequestDevices[commonWord]
if !ok {
device.InRequestDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-to-allocate", commonWord)
device.SupportDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-allocated", commonWord)
util.HandshakeAnnos[commonWord] = dev.handshakeAnno
}
devs = append(devs, dev)
klog.Infof("load ascend vnpu config %s: %v", commonWord, dev.config)
}
Expand Down Expand Up @@ -237,14 +240,14 @@ func (dev *Devices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
}

func (dev *Devices) GenerateResourceRequests(ctr *corev1.Container) device.ContainerDeviceRequest {
klog.Infof("Counting %s devices", dev.config.CommonWord)
ascendResourceCount := corev1.ResourceName(dev.config.ResourceName)
ascendResourceMem := corev1.ResourceName(dev.config.ResourceMemoryName)
v, ok := ctr.Resources.Limits[ascendResourceCount]
if !ok {
v, ok = ctr.Resources.Requests[ascendResourceCount]
}
if ok {
klog.V(3).Infof("Counting %s devices", dev.config.CommonWord)
if n, ok := v.AsInt64(); ok {
klog.Info("Found AscendDevices devices")
memnum := 0
Expand Down Expand Up @@ -289,6 +292,14 @@ func (dev *Devices) AddResourceUsage(pod *corev1.Pod, n *device.DeviceUsage, ctr
return nil
}

// GetResourceNames returns the resource names taken from this device's
// config: the count name and the memory name. The core name is left as the
// empty string.
func (dev *Devices) GetResourceNames() device.ResourceNames {
	names := device.ResourceNames{ResourceCoreName: ""}
	names.ResourceCountName = dev.config.ResourceName
	names.ResourceMemoryName = dev.config.ResourceMemoryName
	return names
}

func (npu *Devices) Fit(devices []*device.DeviceUsage, request device.ContainerDeviceRequest, pod *corev1.Pod, nodeInfo *device.NodeInfo, allocated *device.PodDevices) (bool, map[string]device.ContainerDevices, string) {
k := request
originReq := k.Nums
Expand Down
13 changes: 12 additions & 1 deletion pkg/device/awsneuron/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,10 @@ type AWSNeuronConfig struct {
}

func InitAWSNeuronDevice(config AWSNeuronConfig) *AWSNeuronDevices {
device.SupportDevices[AWSNeuronDevice] = "hami.io/aws-neuron-devices-allocated"
_, ok := device.SupportDevices[AWSNeuronDevice]
if !ok {
device.SupportDevices[AWSNeuronDevice] = "hami.io/aws-neuron-devices-allocated"
}
return &AWSNeuronDevices{
resourceCountName: config.ResourceCountName,
resourceCoreName: config.ResourceCoreName,
Expand Down Expand Up @@ -211,6 +214,14 @@ func (dev *AWSNeuronDevices) CheckHealth(devType string, n *corev1.Node) (bool,
return true, true
}

// GetResourceNames returns the count and core resource names stored on the
// receiver. The memory name is left as the empty string.
func (dev *AWSNeuronDevices) GetResourceNames() device.ResourceNames {
	names := device.ResourceNames{ResourceMemoryName: ""}
	names.ResourceCountName = dev.resourceCountName
	names.ResourceCoreName = dev.resourceCoreName
	return names
}

func (dev *AWSNeuronDevices) GenerateResourceRequests(ctr *corev1.Container) device.ContainerDeviceRequest {
klog.Info("Start to count awsNeuron devices for container ", ctr.Name)
awsResourceCount := corev1.ResourceName(dev.resourceCountName)
Expand Down
15 changes: 13 additions & 2 deletions pkg/device/cambricon/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,11 @@ func InitMLUDevice(config CambriconConfig) *CambriconDevices {
MLUResourceCount = config.ResourceCountName
MLUResourceMemory = config.ResourceMemoryName
MLUResourceCores = config.ResourceCoreName
device.InRequestDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-to-allocate"
device.SupportDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-allocated"
_, ok := device.InRequestDevices[CambriconMLUDevice]
if !ok {
device.InRequestDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-to-allocate"
device.SupportDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-allocated"
}
return &CambriconDevices{}
}

Expand Down Expand Up @@ -421,3 +424,11 @@ func (cam *CambriconDevices) Fit(devices []*device.DeviceUsage, request device.C
}
return false, tmpDevs, common.GenReason(reason, len(devices))
}

// GetResourceNames returns the count, memory and core resource names held in
// the package-level MLUResourceCount, MLUResourceMemory and MLUResourceCores
// variables.
func (dev *CambriconDevices) GetResourceNames() device.ResourceNames {
	var names device.ResourceNames
	names.ResourceCountName = MLUResourceCount
	names.ResourceMemoryName = MLUResourceMemory
	names.ResourceCoreName = MLUResourceCores
	return names
}
1 change: 1 addition & 0 deletions pkg/device/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const (
AllocatedCardsInsufficientRequest = "AllocatedCardsInsufficientRequest"
NodeUnfitPod = "NodeUnfitPod"
NodeFitPod = "NodeFitPod"
ResourceQuotaNotFit = "ResourceQuotaNotFit"
)

func GenReason(reasons map[string]int, cards int) string {
Expand Down
5 changes: 3 additions & 2 deletions pkg/device/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type Devices interface {
MutateAdmission(ctr *corev1.Container, pod *corev1.Pod) (bool, error)
CheckHealth(devType string, n *corev1.Node) (bool, bool)
NodeCleanUp(nn string) error
GetResourceNames() ResourceNames
GetNodeDevices(n corev1.Node) ([]*DeviceInfo, error)
LockNode(n *corev1.Node, p *corev1.Pod) error
ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error
Expand Down Expand Up @@ -117,7 +118,7 @@ type NodeInfo struct {
Devices []DeviceInfo
}

type ResoureNames struct {
type ResourceNames struct {
ResourceCountName string
ResourceMemoryName string
ResourceCoreName string
Expand Down Expand Up @@ -374,7 +375,7 @@ func DecodePodDevices(checklist map[string]string, annos map[string]string) (Pod
pd[devID] = append(pd[devID], cd)
}
}
klog.InfoS("Decoded pod annos", "poddevices", pd)
klog.V(5).InfoS("Decoded pod annos", "poddevices", pd)
return pd, nil
}

Expand Down
29 changes: 20 additions & 9 deletions pkg/device/enflame/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ type EnflameDevices struct {
}

const (
EnflameGPUDevice = "Enflame"
EnflameGPUCommonWord = "Enflame"
EnflameVGCUDevice = "Enflame"
EnflameVGCUCommonWord = "Enflame"
// EnflameUseUUID is the key a user sets to specify, by UUID, which Enflame devices to use.
EnflameUseUUID = "enflame.com/use-gpuuuid"
// IluvatarNoUseUUID is user can not use specify Iluvatar device for set Iluvatar UUID.
Expand All @@ -54,14 +54,17 @@ const (
// InitEnflameDevice stores the configured Enflame resource names in the
// package-level EnflameResourceNameVGCU and EnflameResourceNameVGCUPercentage
// variables and returns a new EnflameDevices whose factor is 0.
//
// The allocation key is written to device.SupportDevices only when no entry
// exists for this device yet, so an existing registration is not overwritten.
func InitEnflameDevice(config EnflameConfig) *EnflameDevices {
	EnflameResourceNameVGCU = config.ResourceNameVGCU
	EnflameResourceNameVGCUPercentage = config.ResourceNameVGCUPercentage
	// Register only once: an unconditional write here would make the
	// existence check below always succeed and the guard meaningless.
	if _, ok := device.SupportDevices[EnflameVGCUDevice]; !ok {
		device.SupportDevices[EnflameVGCUDevice] = "hami.io/enflame-vgpu-devices-allocated"
	}
	return &EnflameDevices{
		factor: 0,
	}
}

// CommonWord returns the common word used for Enflame devices,
// EnflameVGCUCommonWord.
func (dev *EnflameDevices) CommonWord() string {
	return EnflameVGCUCommonWord
}

func (dev *EnflameDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
Expand Down Expand Up @@ -111,7 +114,7 @@ func (dev *EnflameDevices) GetNodeDevices(n corev1.Node) ([]*device.DeviceInfo,
Count: 100,
Devmem: 100,
Devcore: 100,
Type: EnflameGPUDevice,
Type: EnflameVGCUDevice,
Numa: 0,
Health: true,
})
Expand All @@ -121,9 +124,9 @@ func (dev *EnflameDevices) GetNodeDevices(n corev1.Node) ([]*device.DeviceInfo,
}

func (dev *EnflameDevices) PatchAnnotations(pod *corev1.Pod, annoinput *map[string]string, pd device.PodDevices) map[string]string {
devlist, ok := pd[EnflameGPUDevice]
devlist, ok := pd[EnflameVGCUDevice]
if ok && len(devlist) > 0 {
(*annoinput)[device.SupportDevices[EnflameGPUDevice]] = device.EncodePodSingleDevice(devlist)
(*annoinput)[device.SupportDevices[EnflameVGCUDevice]] = device.EncodePodSingleDevice(devlist)
(*annoinput)[PodHasAssignedGCU] = "false"
(*annoinput)[PodAssignedGCUTime] = strconv.FormatInt(time.Now().UnixNano(), 10)
annoKey := PodAssignedGCUID
Expand Down Expand Up @@ -151,7 +154,7 @@ func (dev *EnflameDevices) NodeCleanUp(nn string) error {
}

func (dev *EnflameDevices) checkType(annos map[string]string, d device.DeviceUsage, n device.ContainerDeviceRequest) (bool, bool, bool) {
if strings.Compare(n.Type, EnflameGPUDevice) == 0 {
if strings.Compare(n.Type, EnflameVGCUDevice) == 0 {
return true, true, false
}
return false, false, false
Expand Down Expand Up @@ -209,7 +212,7 @@ func (dev *EnflameDevices) GenerateResourceRequests(ctr *corev1.Container) devic
}
return device.ContainerDeviceRequest{
Nums: int32(n),
Type: EnflameGPUDevice,
Type: EnflameVGCUDevice,
Memreq: int32(memnum),
MemPercentagereq: 0,
Coresreq: 0,
Expand Down Expand Up @@ -327,3 +330,11 @@ func (enf *EnflameDevices) Fit(devices []*device.DeviceUsage, request device.Con
}
return false, tmpDevs, common.GenReason(reason, len(devices))
}

// GetResourceNames returns EnflameResourceNameVGCU as the count name and
// EnflameResourceNameVGCUPercentage as the memory name. The core name is left
// as the empty string.
func (dev *EnflameDevices) GetResourceNames() device.ResourceNames {
	names := device.ResourceNames{ResourceCoreName: ""}
	names.ResourceCountName = EnflameResourceNameVGCU
	names.ResourceMemoryName = EnflameResourceNameVGCUPercentage
	return names
}
Loading