Skip to content

Commit 47f0a21

Browse files
refactor: enhance logging and error handling in scheduler functions
Signed-off-by: haitwang-cloud <[email protected]>
1 parent e377967 commit 47f0a21

File tree

2 files changed

+80
-36
lines changed

2 files changed

+80
-36
lines changed

pkg/scheduler/scheduler.go

Lines changed: 66 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func NewScheduler() *Scheduler {
7070
}
7171
s.nodeManager = newNodeManager()
7272
s.podManager = newPodManager()
73-
klog.V(2).InfoS("Scheduler initialized successfully")
73+
klog.InfoS("Scheduler initialized successfully")
7474
return s
7575
}
7676

@@ -87,7 +87,7 @@ func (s *Scheduler) onAddPod(obj any) {
8787
klog.ErrorS(fmt.Errorf("invalid pod object"), "Failed to process pod addition")
8888
return
8989
}
90-
klog.V(5).InfoS("Pod added", "pod", pod.Name, "namespace", pod.Namespace)
90+
klog.V(4).InfoS("Pod added", "pod", pod.Name, "namespace", pod.Namespace)
9191
nodeID, ok := pod.Annotations[util.AssignedNodeAnnotations]
9292
if !ok {
9393
return
@@ -107,7 +107,7 @@ func (s *Scheduler) onUpdatePod(_, newObj any) {
107107
func (s *Scheduler) onDelPod(obj any) {
108108
pod, ok := obj.(*corev1.Pod)
109109
if !ok {
110-
klog.Errorf("unknown add object type")
110+
klog.ErrorS(fmt.Errorf("unknown object type"), "Failed to process pod deletion")
111111
return
112112
}
113113
_, ok = pod.Annotations[util.AssignedNodeAnnotations]
@@ -156,9 +156,9 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
156156
for {
157157
select {
158158
case <-s.nodeNotify:
159-
klog.V(5).InfoS("Received node notification")
159+
klog.V(4).InfoS("Received node notification")
160160
case <-ticker.C:
161-
klog.InfoS("Ticker triggered")
161+
klog.V(5).InfoS("Ticker triggered")
162162
case <-s.stopCh:
163163
klog.InfoS("Received stop signal, exiting RegisterFromNodeAnnotations")
164164
return
@@ -168,26 +168,36 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
168168
klog.ErrorS(err, "Failed to list nodes with selector", "selector", labelSelector.String())
169169
continue
170170
}
171-
klog.V(5).InfoS("Listed nodes", "nodeCount", len(rawNodes))
171+
172+
klog.V(4).InfoS("Listed nodes", "nodeCount", len(rawNodes))
172173
var nodeNames []string
173174
for _, val := range rawNodes {
174175
nodeNames = append(nodeNames, val.Name)
175-
klog.V(5).InfoS("Processing node", "nodeName", val.Name)
176+
177+
klog.V(4).InfoS("Processing node", "nodeName", val.Name)
176178

177179
for devhandsk, devInstance := range device.GetDevices() {
178-
klog.V(5).InfoS("Checking device health", "nodeName", val.Name, "deviceVendor", devhandsk)
180+
181+
klog.V(4).InfoS("Checking device health", "nodeName", val.Name, "deviceVendor", devhandsk)
179182

180183
nodedevices, err := devInstance.GetNodeDevices(*val)
181184
if err != nil {
182-
klog.V(5).InfoS("Failed to get node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
183-
continue
185+
klog.V(3).InfoS("Failed to update node device status", "nodeName", val.Name, "deviceVendor", devhandsk, "error", err)
184186
}
185187

186188
health, needUpdate := devInstance.CheckHealth(devhandsk, val)
187-
klog.V(5).InfoS("Device health check result", "nodeName", val.Name, "deviceVendor", devhandsk, "health", health, "needUpdate", needUpdate)
189+
190+
klog.V(4).InfoS("Device health check result",
191+
"nodeName", val.Name,
192+
"deviceVendor", devhandsk,
193+
"health", health,
194+
"needUpdate", needUpdate)
188195

189196
if !health {
190-
klog.Warning("Device is unhealthy, cleaning up node", "nodeName", val.Name, "deviceVendor", devhandsk)
197+
198+
klog.Warning("Device is unhealthy, cleaning up node",
199+
"nodeName", val.Name,
200+
"deviceVendor", devhandsk)
191201
err := devInstance.NodeCleanUp(val.Name)
192202
if err != nil {
193203
klog.ErrorS(err, "Node cleanup failed", "nodeName", val.Name, "deviceVendor", devhandsk)
@@ -197,40 +207,47 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
197207
continue
198208
}
199209
if !needUpdate {
200-
klog.V(5).InfoS("No update needed for device", "nodeName", val.Name, "deviceVendor", devhandsk)
210+
klog.V(4).InfoS("No update needed for device", "nodeName", val.Name, "deviceVendor", devhandsk)
201211
continue
202212
}
203213
_, ok := util.HandshakeAnnos[devhandsk]
204214
if ok {
205215
tmppat := make(map[string]string)
206216
tmppat[util.HandshakeAnnos[devhandsk]] = "Requesting_" + time.Now().Format(time.DateTime)
207-
klog.InfoS("New timestamp for annotation", "nodeName", val.Name, "annotationKey", util.HandshakeAnnos[devhandsk], "annotationValue", tmppat[util.HandshakeAnnos[devhandsk]])
217+
klog.V(3).InfoS("New timestamp for annotation",
218+
"nodeName", val.Name,
219+
"annotationKey", util.HandshakeAnnos[devhandsk],
220+
"annotationValue", tmppat[util.HandshakeAnnos[devhandsk]])
208221
n, err := util.GetNode(val.Name)
209222
if err != nil {
210223
klog.ErrorS(err, "Failed to get node", "nodeName", val.Name)
211224
continue
212225
}
213-
klog.V(5).InfoS("Patching node annotations", "nodeName", val.Name, "annotations", tmppat)
226+
klog.V(4).InfoS("Patching node annotations", "nodeName", val.Name, "annotations", tmppat)
214227
if err := util.PatchNodeAnnotations(n, tmppat); err != nil {
215228
klog.ErrorS(err, "Failed to patch node annotations", "nodeName", val.Name)
216229
}
217230
}
218231
nodeInfo := &util.NodeInfo{}
219232
nodeInfo.ID = val.Name
220233
nodeInfo.Node = val
221-
klog.V(5).InfoS("Fetching node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
234+
klog.V(4).InfoS("Fetching node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
222235
nodeInfo.Devices = make([]util.DeviceInfo, 0)
223236
for _, deviceinfo := range nodedevices {
224237
nodeInfo.Devices = append(nodeInfo.Devices, *deviceinfo)
225238
}
226239
s.addNode(val.Name, nodeInfo)
227240
if s.nodes[val.Name] != nil && len(nodeInfo.Devices) > 0 {
228241
if printedLog[val.Name] {
229-
klog.V(5).InfoS("Node device updated", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
242+
klog.InfoS("Node device updated", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
230243
} else {
231244
klog.InfoS("Node device added", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
232245
printedLog[val.Name] = true
233246
}
247+
klog.InfoS("Node device inventory changed",
248+
"nodeName", val.Name,
249+
"deviceVendor", devhandsk,
250+
"deviceCount", len(nodeInfo.Devices))
234251
}
235252
}
236253
}
@@ -319,7 +336,10 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
319336
d.Device.Usedcores += udevice.Usedcores
320337
if strings.Contains(udevice.UUID, "[") {
321338
if strings.Compare(d.Device.Mode, "hami-core") == 0 {
322-
klog.Errorf("found a mig task running on a hami-core GPU\n")
339+
klog.ErrorS(fmt.Errorf("invalid configuration"), "MIG task assigned to non-MIG GPU",
340+
"deviceID", d.Device.ID,
341+
"deviceMode", d.Device.Mode,
342+
"taskUUID", udevice.UUID)
323343
d.Device.Health = false
324344
continue
325345
}
@@ -328,21 +348,28 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
328348
util.PlatternMIG(&d.Device.MigUsage, d.Device.MigTemplate, tmpIdx)
329349
}
330350
d.Device.MigUsage.UsageList[Instance].InUse = true
331-
klog.V(5).Infoln("add mig usage", d.Device.MigUsage, "template=", d.Device.MigTemplate, "uuid=", d.Device.ID)
351+
klog.V(5).InfoS("MIG device allocated",
352+
"deviceID", d.Device.ID,
353+
"instanceID", Instance,
354+
"template", d.Device.MigTemplate)
332355
}
333356
}
334357
}
335358
}
336359
}
337360
}
338-
klog.V(5).Infof("usage: pod %v assigned %v %v", p.Name, p.NodeID, p.Devices)
361+
klog.V(4).InfoS("Pod resource assignment",
362+
"podName", p.Name,
363+
"nodeName", p.NodeID,
364+
"deviceCount", len(p.Devices))
339365
}
340366
s.overviewstatus = overallnodeMap
341367
for _, nodeID := range *nodes {
342368
node, err := s.GetNode(nodeID)
343369
if err != nil {
344-
// The identified node does not have a gpu device, so the log here has no practical meaning,increase log priority.
345-
klog.V(5).InfoS("node unregistered", "node", nodeID, "error", err)
370+
klog.V(3).InfoS("Node unregistered or no GPU devices found",
371+
"nodeName", nodeID,
372+
"error", err)
346373
failedNodes[nodeID] = "node unregistered"
347374
continue
348375
}
@@ -396,7 +423,7 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
396423
klog.ErrorS(err, "Failed to get pod", "pod", args.PodName, "namespace", args.PodNamespace)
397424
return &extenderv1.ExtenderBindingResult{Error: err.Error()}, err
398425
}
399-
klog.InfoS("Trying to get the target node for pod", "pod", args.PodName, "namespace", args.PodNamespace, "node", args.Node)
426+
klog.V(3).InfoS("Retrieving target node for binding", "pod", args.PodName, "namespace", args.PodNamespace, "node", args.Node)
400427
node, err := s.kubeClient.CoreV1().Nodes().Get(context.Background(), args.Node, metav1.GetOptions{})
401428
if err != nil {
402429
klog.ErrorS(err, "Failed to get node", "node", args.Node)
@@ -413,14 +440,14 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
413440
for _, val := range device.GetDevices() {
414441
err = val.LockNode(node, current)
415442
if err != nil {
416-
klog.ErrorS(err, "Failed to lock node", "node", args.Node, "device", val)
443+
klog.ErrorS(err, "Failed to lock node", "node", args.Node, "error", err)
417444
goto ReleaseNodeLocks
418445
}
419446
}
420447

421448
err = util.PatchPodAnnotations(current, tmppatch)
422449
if err != nil {
423-
klog.ErrorS(err, "Failed to patch pod annotations", "pod", klog.KObj(current))
450+
klog.ErrorS(err, "Failed to patch pod annotations", "pod", klog.KObj(current), "annotations", tmppatch)
424451
return &extenderv1.ExtenderBindingResult{Error: err.Error()}, err
425452
}
426453

@@ -453,8 +480,10 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
453480
}
454481
}
455482
if total == 0 {
456-
klog.V(1).InfoS("Pod does not request any resources",
457-
"pod", args.Pod.Name)
483+
klog.InfoS("Pod does not request any resources",
484+
"podName", args.Pod.Name,
485+
"podNamespace", args.Pod.Namespace,
486+
"podUID", args.Pod.UID)
458487
s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", fmt.Errorf("does not request any resource"))
459488
return &extenderv1.ExtenderFilterResult{
460489
NodeNames: args.NodeNames,
@@ -470,8 +499,9 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
470499
return nil, err
471500
}
472501
if len(failedNodes) != 0 {
473-
klog.V(5).InfoS("Nodes failed during usage retrieval",
474-
"nodes", failedNodes)
502+
klog.InfoS("Failed to retrieve usage for some nodes",
503+
"failedNodeCount", len(failedNodes),
504+
"failedNodes", failedNodes)
475505
}
476506
nodeScores, err := s.calcScore(nodeUsage, nums, annos, args.Pod, failedNodes)
477507
if err != nil {
@@ -480,14 +510,18 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
480510
return nil, err
481511
}
482512
if len((*nodeScores).NodeList) == 0 {
483-
klog.V(4).InfoS("No available nodes meet the required scores",
484-
"pod", args.Pod.Name)
513+
klog.InfoS("No available nodes meet the required scores",
514+
"podName", args.Pod.Name,
515+
"podNamespace", args.Pod.Namespace,
516+
"requestedResources", k8sutil.Resourcereqs(args.Pod),
517+
"totalNodesChecked", len(*args.NodeNames),
518+
"failedNodesCount", len(failedNodes))
485519
s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", fmt.Errorf("no available node, %d nodes do not meet", len(*args.NodeNames)))
486520
return &extenderv1.ExtenderFilterResult{
487521
FailedNodes: failedNodes,
488522
}, nil
489523
}
490-
klog.V(4).Infoln("nodeScores_len=", len((*nodeScores).NodeList))
524+
klog.InfoS("Calculated node scores", "pod", args.Pod.Name, "nodeScoresLen", len(nodeScores.NodeList))
491525
sort.Sort(nodeScores)
492526
m := (*nodeScores).NodeList[len((*nodeScores).NodeList)-1]
493527
klog.InfoS("Scheduling pod to node",

pkg/scheduler/scheduler_test.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -670,8 +670,8 @@ func Test_RegisterFromNodeAnnotations(t *testing.T) {
670670
t.Errorf("missing annotation: hami.io/node-handshake-dcu")
671671
return false
672672
}
673-
_, errHami := time.Parse(time.DateTime, strings.TrimPrefix(handshakeTimeStr, "Requesting_"))
674-
_, errDcu := time.Parse(time.DateTime, strings.TrimPrefix(dcuTimeStr, "Requesting_"))
673+
_, errHami := time.Parse(time.DateTime, trimHandshakePrefix(handshakeTimeStr))
674+
_, errDcu := time.Parse(time.DateTime, trimHandshakePrefix(dcuTimeStr))
675675
if errHami != nil {
676676
t.Errorf("invalid time format in annotation 'hami.io/node-handshake': %v", errHami)
677677
return false
@@ -786,8 +786,8 @@ func Test_RegisterFromNodeAnnotations_NIL(t *testing.T) {
786786
}
787787

788788
// Verify time format in annotations if they exist
789-
_, errHami := time.Parse(time.DateTime, strings.TrimPrefix(handshakeTimeStr, "Requesting_"))
790-
_, errDcu := time.Parse(time.DateTime, strings.TrimPrefix(dcuTimeStr, "Requesting_"))
789+
_, errHami := time.Parse(time.DateTime, trimHandshakePrefix(handshakeTimeStr))
790+
_, errDcu := time.Parse(time.DateTime, trimHandshakePrefix(dcuTimeStr))
791791

792792
if errHami != nil {
793793
t.Errorf("invalid time format in annotation 'hami.io/node-handshake': %v", errHami)
@@ -837,3 +837,13 @@ func Test_RegisterFromNodeAnnotations_NIL(t *testing.T) {
837837
})
838838
}
839839
}
840+
841+
// trimHandshakePrefix strips a known handshake-state prefix
// ("Requesting_" or "Deleted_") from the annotation value s,
// leaving the bare timestamp so it can be parsed with time.DateTime.
// If s carries no known prefix it is returned unchanged.
func trimHandshakePrefix(s string) string {
	// CutPrefix checks and trims in a single pass (Go 1.20+),
	// replacing the redundant HasPrefix + TrimPrefix pair.
	for _, p := range []string{"Requesting_", "Deleted_"} {
		if rest, ok := strings.CutPrefix(s, p); ok {
			return rest
		}
	}
	return s
}

0 commit comments

Comments
 (0)