@@ -70,7 +70,7 @@ func NewScheduler() *Scheduler {
70
70
}
71
71
s .nodeManager = newNodeManager ()
72
72
s .podManager = newPodManager ()
73
- klog .V ( 2 ). InfoS ("Scheduler initialized successfully" )
73
+ klog .InfoS ("Scheduler initialized successfully" )
74
74
return s
75
75
}
76
76
@@ -87,7 +87,7 @@ func (s *Scheduler) onAddPod(obj any) {
87
87
klog .ErrorS (fmt .Errorf ("invalid pod object" ), "Failed to process pod addition" )
88
88
return
89
89
}
90
- klog .V (5 ).InfoS ("Pod added" , "pod" , pod .Name , "namespace" , pod .Namespace )
90
+ klog .V (4 ).InfoS ("Pod added" , "pod" , pod .Name , "namespace" , pod .Namespace )
91
91
nodeID , ok := pod .Annotations [util .AssignedNodeAnnotations ]
92
92
if ! ok {
93
93
return
@@ -107,7 +107,7 @@ func (s *Scheduler) onUpdatePod(_, newObj any) {
107
107
func (s * Scheduler ) onDelPod (obj any ) {
108
108
pod , ok := obj .(* corev1.Pod )
109
109
if ! ok {
110
- klog .Errorf ("unknown add object type" )
110
+ klog .ErrorS ( fmt . Errorf ("unknown object type" ), "Failed to process pod deletion " )
111
111
return
112
112
}
113
113
_ , ok = pod .Annotations [util .AssignedNodeAnnotations ]
@@ -156,9 +156,9 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
156
156
for {
157
157
select {
158
158
case <- s .nodeNotify :
159
- klog .V (5 ).InfoS ("Received node notification" )
159
+ klog .V (4 ).InfoS ("Received node notification" )
160
160
case <- ticker .C :
161
- klog .InfoS ("Ticker triggered" )
161
+ klog .V ( 5 ). InfoS ("Ticker triggered" )
162
162
case <- s .stopCh :
163
163
klog .InfoS ("Received stop signal, exiting RegisterFromNodeAnnotations" )
164
164
return
@@ -168,26 +168,36 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
168
168
klog .ErrorS (err , "Failed to list nodes with selector" , "selector" , labelSelector .String ())
169
169
continue
170
170
}
171
- klog .V (5 ).InfoS ("Listed nodes" , "nodeCount" , len (rawNodes ))
171
+
172
+ klog .V (4 ).InfoS ("Listed nodes" , "nodeCount" , len (rawNodes ))
172
173
var nodeNames []string
173
174
for _ , val := range rawNodes {
174
175
nodeNames = append (nodeNames , val .Name )
175
- klog .V (5 ).InfoS ("Processing node" , "nodeName" , val .Name )
176
+
177
+ klog .V (4 ).InfoS ("Processing node" , "nodeName" , val .Name )
176
178
177
179
for devhandsk , devInstance := range device .GetDevices () {
178
- klog .V (5 ).InfoS ("Checking device health" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
180
+
181
+ klog .V (4 ).InfoS ("Checking device health" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
179
182
180
183
nodedevices , err := devInstance .GetNodeDevices (* val )
181
184
if err != nil {
182
- klog .V (5 ).InfoS ("Failed to get node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
183
- continue
185
+ klog .V (3 ).InfoS ("Failed to update node device status" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "error" , err )
184
186
}
185
187
186
188
health , needUpdate := devInstance .CheckHealth (devhandsk , val )
187
- klog .V (5 ).InfoS ("Device health check result" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "health" , health , "needUpdate" , needUpdate )
189
+
190
+ klog .V (4 ).InfoS ("Device health check result" ,
191
+ "nodeName" , val .Name ,
192
+ "deviceVendor" , devhandsk ,
193
+ "health" , health ,
194
+ "needUpdate" , needUpdate )
188
195
189
196
if ! health {
190
- klog .Warning ("Device is unhealthy, cleaning up node" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
197
+
198
+ klog .Warning ("Device is unhealthy, cleaning up node" ,
199
+ "nodeName" , val .Name ,
200
+ "deviceVendor" , devhandsk )
191
201
err := devInstance .NodeCleanUp (val .Name )
192
202
if err != nil {
193
203
klog .ErrorS (err , "Node cleanup failed" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
@@ -197,40 +207,47 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
197
207
continue
198
208
}
199
209
if ! needUpdate {
200
- klog .V (5 ).InfoS ("No update needed for device" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
210
+ klog .V (4 ).InfoS ("No update needed for device" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
201
211
continue
202
212
}
203
213
_ , ok := util .HandshakeAnnos [devhandsk ]
204
214
if ok {
205
215
tmppat := make (map [string ]string )
206
216
tmppat [util.HandshakeAnnos [devhandsk ]] = "Requesting_" + time .Now ().Format (time .DateTime )
207
- klog .InfoS ("New timestamp for annotation" , "nodeName" , val .Name , "annotationKey" , util .HandshakeAnnos [devhandsk ], "annotationValue" , tmppat [util.HandshakeAnnos [devhandsk ]])
217
+ klog .V (3 ).InfoS ("New timestamp for annotation" ,
218
+ "nodeName" , val .Name ,
219
+ "annotationKey" , util .HandshakeAnnos [devhandsk ],
220
+ "annotationValue" , tmppat [util.HandshakeAnnos [devhandsk ]])
208
221
n , err := util .GetNode (val .Name )
209
222
if err != nil {
210
223
klog .ErrorS (err , "Failed to get node" , "nodeName" , val .Name )
211
224
continue
212
225
}
213
- klog .V (5 ).InfoS ("Patching node annotations" , "nodeName" , val .Name , "annotations" , tmppat )
226
+ klog .V (4 ).InfoS ("Patching node annotations" , "nodeName" , val .Name , "annotations" , tmppat )
214
227
if err := util .PatchNodeAnnotations (n , tmppat ); err != nil {
215
228
klog .ErrorS (err , "Failed to patch node annotations" , "nodeName" , val .Name )
216
229
}
217
230
}
218
231
nodeInfo := & util.NodeInfo {}
219
232
nodeInfo .ID = val .Name
220
233
nodeInfo .Node = val
221
- klog .V (5 ).InfoS ("Fetching node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
234
+ klog .V (4 ).InfoS ("Fetching node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
222
235
nodeInfo .Devices = make ([]util.DeviceInfo , 0 )
223
236
for _ , deviceinfo := range nodedevices {
224
237
nodeInfo .Devices = append (nodeInfo .Devices , * deviceinfo )
225
238
}
226
239
s .addNode (val .Name , nodeInfo )
227
240
if s .nodes [val .Name ] != nil && len (nodeInfo .Devices ) > 0 {
228
241
if printedLog [val .Name ] {
229
- klog .V ( 5 ). InfoS ("Node device updated" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
242
+ klog .InfoS ("Node device updated" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
230
243
} else {
231
244
klog .InfoS ("Node device added" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
232
245
printedLog [val .Name ] = true
233
246
}
247
+ klog .InfoS ("Node device inventory changed" ,
248
+ "nodeName" , val .Name ,
249
+ "deviceVendor" , devhandsk ,
250
+ "deviceCount" , len (nodeInfo .Devices ))
234
251
}
235
252
}
236
253
}
@@ -319,7 +336,10 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
319
336
d .Device .Usedcores += udevice .Usedcores
320
337
if strings .Contains (udevice .UUID , "[" ) {
321
338
if strings .Compare (d .Device .Mode , "hami-core" ) == 0 {
322
- klog .Errorf ("found a mig task running on a hami-core GPU\n " )
339
+ klog .ErrorS (fmt .Errorf ("invalid configuration" ), "MIG task assigned to non-MIG GPU" ,
340
+ "deviceID" , d .Device .ID ,
341
+ "deviceMode" , d .Device .Mode ,
342
+ "taskUUID" , udevice .UUID )
323
343
d .Device .Health = false
324
344
continue
325
345
}
@@ -328,21 +348,28 @@ func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[strin
328
348
util .PlatternMIG (& d .Device .MigUsage , d .Device .MigTemplate , tmpIdx )
329
349
}
330
350
d .Device .MigUsage .UsageList [Instance ].InUse = true
331
- klog .V (5 ).Infoln ("add mig usage" , d .Device .MigUsage , "template=" , d .Device .MigTemplate , "uuid=" , d .Device .ID )
351
+ klog .V (5 ).InfoS ("MIG device allocated" ,
352
+ "deviceID" , d .Device .ID ,
353
+ "instanceID" , Instance ,
354
+ "template" , d .Device .MigTemplate )
332
355
}
333
356
}
334
357
}
335
358
}
336
359
}
337
360
}
338
- klog .V (5 ).Infof ("usage: pod %v assigned %v %v" , p .Name , p .NodeID , p .Devices )
361
+ klog .V (4 ).InfoS ("Pod resource assignment" ,
362
+ "podName" , p .Name ,
363
+ "nodeName" , p .NodeID ,
364
+ "deviceCount" , len (p .Devices ))
339
365
}
340
366
s .overviewstatus = overallnodeMap
341
367
for _ , nodeID := range * nodes {
342
368
node , err := s .GetNode (nodeID )
343
369
if err != nil {
344
- // The identified node does not have a gpu device, so the log here has no practical meaning,increase log priority.
345
- klog .V (5 ).InfoS ("node unregistered" , "node" , nodeID , "error" , err )
370
+ klog .V (3 ).InfoS ("Node unregistered or no GPU devices found" ,
371
+ "nodeName" , nodeID ,
372
+ "error" , err )
346
373
failedNodes [nodeID ] = "node unregistered"
347
374
continue
348
375
}
@@ -396,7 +423,7 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
396
423
klog .ErrorS (err , "Failed to get pod" , "pod" , args .PodName , "namespace" , args .PodNamespace )
397
424
return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
398
425
}
399
- klog .InfoS ("Trying to get the target node for pod " , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
426
+ klog .V ( 3 ). InfoS ("Retrieving target node for binding " , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
400
427
node , err := s .kubeClient .CoreV1 ().Nodes ().Get (context .Background (), args .Node , metav1.GetOptions {})
401
428
if err != nil {
402
429
klog .ErrorS (err , "Failed to get node" , "node" , args .Node )
@@ -413,14 +440,14 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
413
440
for _ , val := range device .GetDevices () {
414
441
err = val .LockNode (node , current )
415
442
if err != nil {
416
- klog .ErrorS (err , "Failed to lock node" , "node" , args .Node , "device " , val )
443
+ klog .ErrorS (err , "Failed to lock node" , "node" , args .Node , "error " , err )
417
444
goto ReleaseNodeLocks
418
445
}
419
446
}
420
447
421
448
err = util .PatchPodAnnotations (current , tmppatch )
422
449
if err != nil {
423
- klog .ErrorS (err , "Failed to patch pod annotations" , "pod" , klog .KObj (current ))
450
+ klog .ErrorS (err , "Failed to patch pod annotations" , "pod" , klog .KObj (current ), "annotations" , tmppatch )
424
451
return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
425
452
}
426
453
@@ -453,8 +480,10 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
453
480
}
454
481
}
455
482
if total == 0 {
456
- klog .V (1 ).InfoS ("Pod does not request any resources" ,
457
- "pod" , args .Pod .Name )
483
+ klog .InfoS ("Pod does not request any resources" ,
484
+ "podName" , args .Pod .Name ,
485
+ "podNamespace" , args .Pod .Namespace ,
486
+ "podUID" , args .Pod .UID )
458
487
s .recordScheduleFilterResultEvent (args .Pod , EventReasonFilteringFailed , "" , fmt .Errorf ("does not request any resource" ))
459
488
return & extenderv1.ExtenderFilterResult {
460
489
NodeNames : args .NodeNames ,
@@ -470,8 +499,9 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
470
499
return nil , err
471
500
}
472
501
if len (failedNodes ) != 0 {
473
- klog .V (5 ).InfoS ("Nodes failed during usage retrieval" ,
474
- "nodes" , failedNodes )
502
+ klog .InfoS ("Failed to retrieve usage for some nodes" ,
503
+ "failedNodeCount" , len (failedNodes ),
504
+ "failedNodes" , failedNodes )
475
505
}
476
506
nodeScores , err := s .calcScore (nodeUsage , nums , annos , args .Pod , failedNodes )
477
507
if err != nil {
@@ -480,14 +510,18 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
480
510
return nil , err
481
511
}
482
512
if len ((* nodeScores ).NodeList ) == 0 {
483
- klog .V (4 ).InfoS ("No available nodes meet the required scores" ,
484
- "pod" , args .Pod .Name )
513
+ klog .InfoS ("No available nodes meet the required scores" ,
514
+ "podName" , args .Pod .Name ,
515
+ "podNamespace" , args .Pod .Namespace ,
516
+ "requestedResources" , k8sutil .Resourcereqs (args .Pod ),
517
+ "totalNodesChecked" , len (* args .NodeNames ),
518
+ "failedNodesCount" , len (failedNodes ))
485
519
s .recordScheduleFilterResultEvent (args .Pod , EventReasonFilteringFailed , "" , fmt .Errorf ("no available node, %d nodes do not meet" , len (* args .NodeNames )))
486
520
return & extenderv1.ExtenderFilterResult {
487
521
FailedNodes : failedNodes ,
488
522
}, nil
489
523
}
490
- klog .V ( 4 ). Infoln ( "nodeScores_len= " , len (( * nodeScores ) .NodeList ))
524
+ klog .InfoS ( "Calculated node scores" , "pod" , args . Pod . Name , "nodeScoresLen " , len (nodeScores .NodeList ))
491
525
sort .Sort (nodeScores )
492
526
m := (* nodeScores ).NodeList [len ((* nodeScores ).NodeList )- 1 ]
493
527
klog .InfoS ("Scheduling pod to node" ,
0 commit comments