Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions charts/hami/templates/scheduler/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,7 @@ rules:
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "get", "list"]
- apiGroups: [""]
resources: ["resourcequotas"]
verbs: ["get", "list", "watch"]

16 changes: 16 additions & 0 deletions cmd/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,22 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
"vGPU core allocated from a container",
[]string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil,
)
quotaUsedDesc := prometheus.NewDesc(
"QuotaUsed",
"resourcequota usage for a certain device",
[]string{"quotanamespace", "quotaName", "limit"}, nil,
)
quotas := sher.GetQuotas()
for ns, val := range quotas.Quotas {
for quotaname, q := range *val {
ch <- prometheus.MustNewConstMetric(
quotaUsedDesc,
prometheus.GaugeValue,
float64(q.Used),
ns, quotaname, fmt.Sprint(q.Limit),
)
}
}
schedpods, _ := sher.GetScheduledPods()
for _, val := range schedpods {
for _, podSingleDevice := range val.Devices {
Expand Down
19 changes: 15 additions & 4 deletions pkg/device/ascend/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,12 @@ func InitDevices(config []VNPUConfig) []*Devices {
sort.Slice(dev.config.Templates, func(i, j int) bool {
return dev.config.Templates[i].Memory < dev.config.Templates[j].Memory
})
device.InRequestDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-to-allocate", commonWord)
device.SupportDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-allocated", commonWord)
util.HandshakeAnnos[commonWord] = dev.handshakeAnno
_, ok := device.InRequestDevices[commonWord]
if !ok {
device.InRequestDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-to-allocate", commonWord)
device.SupportDevices[commonWord] = fmt.Sprintf("hami.io/%s-devices-allocated", commonWord)
util.HandshakeAnnos[commonWord] = dev.handshakeAnno
}
devs = append(devs, dev)
klog.Infof("load ascend vnpu config %s: %v", commonWord, dev.config)
}
Expand Down Expand Up @@ -237,14 +240,14 @@ func (dev *Devices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
}

func (dev *Devices) GenerateResourceRequests(ctr *corev1.Container) device.ContainerDeviceRequest {
klog.Infof("Counting %s devices", dev.config.CommonWord)
ascendResourceCount := corev1.ResourceName(dev.config.ResourceName)
ascendResourceMem := corev1.ResourceName(dev.config.ResourceMemoryName)
v, ok := ctr.Resources.Limits[ascendResourceCount]
if !ok {
v, ok = ctr.Resources.Requests[ascendResourceCount]
}
if ok {
klog.V(3).Infof("Counting %s devices", dev.config.CommonWord)
if n, ok := v.AsInt64(); ok {
klog.Info("Found AscendDevices devices")
memnum := 0
Expand Down Expand Up @@ -289,6 +292,14 @@ func (dev *Devices) AddResourceUsage(pod *corev1.Pod, n *device.DeviceUsage, ctr
return nil
}

// GetResourceNames reports the resource names this Ascend device type was
// configured with. Only the count and memory names come from the config;
// ResourceCoreName is returned as the empty string.
func (dev *Devices) GetResourceNames() device.ResourceNames {
	var names device.ResourceNames
	names.ResourceCountName = dev.config.ResourceName
	names.ResourceMemoryName = dev.config.ResourceMemoryName
	// ResourceCoreName stays at its zero value ("").
	return names
}

func (npu *Devices) Fit(devices []*device.DeviceUsage, request device.ContainerDeviceRequest, pod *corev1.Pod, nodeInfo *device.NodeInfo, allocated *device.PodDevices) (bool, map[string]device.ContainerDevices, string) {
k := request
originReq := k.Nums
Expand Down
13 changes: 12 additions & 1 deletion pkg/device/awsneuron/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,10 @@ type AWSNeuronConfig struct {
}

func InitAWSNeuronDevice(config AWSNeuronConfig) *AWSNeuronDevices {
device.SupportDevices[AWSNeuronDevice] = "hami.io/aws-neuron-devices-allocated"
_, ok := device.SupportDevices[AWSNeuronDevice]
if !ok {
device.SupportDevices[AWSNeuronDevice] = "hami.io/aws-neuron-devices-allocated"
}
return &AWSNeuronDevices{
resourceCountName: config.ResourceCountName,
resourceCoreName: config.ResourceCoreName,
Expand Down Expand Up @@ -211,6 +214,14 @@ func (dev *AWSNeuronDevices) CheckHealth(devType string, n *corev1.Node) (bool,
return true, true
}

// GetResourceNames reports the resource names held by this AWS Neuron device
// handler. The count and core names are copied from the receiver's fields;
// ResourceMemoryName is returned as the empty string.
func (dev *AWSNeuronDevices) GetResourceNames() device.ResourceNames {
	var names device.ResourceNames
	names.ResourceCountName = dev.resourceCountName
	names.ResourceCoreName = dev.resourceCoreName
	// ResourceMemoryName stays at its zero value ("").
	return names
}

func (dev *AWSNeuronDevices) GenerateResourceRequests(ctr *corev1.Container) device.ContainerDeviceRequest {
klog.Info("Start to count awsNeuron devices for container ", ctr.Name)
awsResourceCount := corev1.ResourceName(dev.resourceCountName)
Expand Down
15 changes: 13 additions & 2 deletions pkg/device/cambricon/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,11 @@ func InitMLUDevice(config CambriconConfig) *CambriconDevices {
MLUResourceCount = config.ResourceCountName
MLUResourceMemory = config.ResourceMemoryName
MLUResourceCores = config.ResourceCoreName
device.InRequestDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-to-allocate"
device.SupportDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-allocated"
_, ok := device.InRequestDevices[CambriconMLUDevice]
if !ok {
device.InRequestDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-to-allocate"
device.SupportDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-allocated"
}
return &CambriconDevices{}
}

Expand Down Expand Up @@ -421,3 +424,11 @@ func (cam *CambriconDevices) Fit(devices []*device.DeviceUsage, request device.C
}
return false, tmpDevs, common.GenReason(reason, len(devices))
}

// GetResourceNames reports the Cambricon MLU resource names. The values are
// read from the package-level MLUResourceCount, MLUResourceMemory and
// MLUResourceCores variables, not from the receiver.
func (dev *CambriconDevices) GetResourceNames() device.ResourceNames {
	var names device.ResourceNames
	names.ResourceCountName = MLUResourceCount
	names.ResourceMemoryName = MLUResourceMemory
	names.ResourceCoreName = MLUResourceCores
	return names
}
1 change: 1 addition & 0 deletions pkg/device/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const (
AllocatedCardsInsufficientRequest = "AllocatedCardsInsufficientRequest"
NodeUnfitPod = "NodeUnfitPod"
NodeFitPod = "NodeFitPod"
ResourceQuotaNotFit = "ResourceQuotaNotFit"
)

func GenReason(reasons map[string]int, cards int) string {
Expand Down
7 changes: 4 additions & 3 deletions pkg/device/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type Devices interface {
MutateAdmission(ctr *corev1.Container, pod *corev1.Pod) (bool, error)
CheckHealth(devType string, n *corev1.Node) (bool, bool)
NodeCleanUp(nn string) error
GetResourceNames() ResourceNames
GetNodeDevices(n corev1.Node) ([]*DeviceInfo, error)
LockNode(n *corev1.Node, p *corev1.Pod) error
ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error
Expand Down Expand Up @@ -117,7 +118,7 @@ type NodeInfo struct {
Devices []DeviceInfo
}

type ResoureNames struct {
type ResourceNames struct {
ResourceCountName string
ResourceMemoryName string
ResourceCoreName string
Expand Down Expand Up @@ -352,7 +353,7 @@ func DecodeContainerDevices(str string) (ContainerDevices, error) {
}

func DecodePodDevices(checklist map[string]string, annos map[string]string) (PodDevices, error) {
klog.V(5).Infof("checklist is [%+v], annos is [%+v]", checklist, annos)
klog.Infof("=-=-=-=-=---=-=-=-=checklist is [%+v], annos is [%+v]", checklist, annos)
if len(annos) == 0 {
return PodDevices{}, nil
}
Expand All @@ -374,7 +375,7 @@ func DecodePodDevices(checklist map[string]string, annos map[string]string) (Pod
pd[devID] = append(pd[devID], cd)
}
}
klog.InfoS("Decoded pod annos", "poddevices", pd)
klog.V(5).InfoS("Decoded pod annos", "poddevices", pd)
return pd, nil
}

Expand Down
29 changes: 20 additions & 9 deletions pkg/device/enflame/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ type EnflameDevices struct {
}

const (
EnflameGPUDevice = "Enflame"
EnflameGPUCommonWord = "Enflame"
EnflameVGCUDevice = "Enflame"
EnflameVGCUCommonWord = "Enflame"
// EnflameUseUUID lets a user specify which Enflame devices to use, by UUID.
EnflameUseUUID = "enflame.com/use-gpuuuid"
// IluvatarNoUseUUID is user can not use specify Iluvatar device for set Iluvatar UUID.
Expand All @@ -54,14 +54,17 @@ const (
func InitEnflameDevice(config EnflameConfig) *EnflameDevices {
EnflameResourceNameVGCU = config.ResourceNameVGCU
EnflameResourceNameVGCUPercentage = config.ResourceNameVGCUPercentage
device.SupportDevices[EnflameGPUDevice] = "hami.io/enflame-vgpu-devices-allocated"
_, ok := device.SupportDevices[EnflameVGCUDevice]
if !ok {
device.SupportDevices[EnflameVGCUDevice] = "hami.io/enflame-vgpu-devices-allocated"
}
return &EnflameDevices{
factor: 0,
}
}

func (dev *EnflameDevices) CommonWord() string {
return EnflameGPUCommonWord
return EnflameVGCUCommonWord
}

func (dev *EnflameDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
Expand Down Expand Up @@ -111,7 +114,7 @@ func (dev *EnflameDevices) GetNodeDevices(n corev1.Node) ([]*device.DeviceInfo,
Count: 100,
Devmem: 100,
Devcore: 100,
Type: EnflameGPUDevice,
Type: EnflameVGCUDevice,
Numa: 0,
Health: true,
})
Expand All @@ -121,9 +124,9 @@ func (dev *EnflameDevices) GetNodeDevices(n corev1.Node) ([]*device.DeviceInfo,
}

func (dev *EnflameDevices) PatchAnnotations(pod *corev1.Pod, annoinput *map[string]string, pd device.PodDevices) map[string]string {
devlist, ok := pd[EnflameGPUDevice]
devlist, ok := pd[EnflameVGCUDevice]
if ok && len(devlist) > 0 {
(*annoinput)[device.SupportDevices[EnflameGPUDevice]] = device.EncodePodSingleDevice(devlist)
(*annoinput)[device.SupportDevices[EnflameVGCUDevice]] = device.EncodePodSingleDevice(devlist)
(*annoinput)[PodHasAssignedGCU] = "false"
(*annoinput)[PodAssignedGCUTime] = strconv.FormatInt(time.Now().UnixNano(), 10)
annoKey := PodAssignedGCUID
Expand Down Expand Up @@ -151,7 +154,7 @@ func (dev *EnflameDevices) NodeCleanUp(nn string) error {
}

func (dev *EnflameDevices) checkType(annos map[string]string, d device.DeviceUsage, n device.ContainerDeviceRequest) (bool, bool, bool) {
if strings.Compare(n.Type, EnflameGPUDevice) == 0 {
if strings.Compare(n.Type, EnflameVGCUDevice) == 0 {
return true, true, false
}
return false, false, false
Expand Down Expand Up @@ -209,7 +212,7 @@ func (dev *EnflameDevices) GenerateResourceRequests(ctr *corev1.Container) devic
}
return device.ContainerDeviceRequest{
Nums: int32(n),
Type: EnflameGPUDevice,
Type: EnflameVGCUDevice,
Memreq: int32(memnum),
MemPercentagereq: 0,
Coresreq: 0,
Expand Down Expand Up @@ -327,3 +330,11 @@ func (enf *EnflameDevices) Fit(devices []*device.DeviceUsage, request device.Con
}
return false, tmpDevs, common.GenReason(reason, len(devices))
}

// GetResourceNames reports the Enflame resource names. The count name is the
// package-level EnflameResourceNameVGCU and the memory name is
// EnflameResourceNameVGCUPercentage; ResourceCoreName is returned as the
// empty string.
func (dev *EnflameDevices) GetResourceNames() device.ResourceNames {
	var names device.ResourceNames
	names.ResourceCountName = EnflameResourceNameVGCU
	names.ResourceMemoryName = EnflameResourceNameVGCUPercentage
	// ResourceCoreName stays at its zero value ("").
	return names
}
Loading