@@ -156,66 +156,83 @@ func (s *Scheduler) Stop() {
156156}
157157
158158func (s * Scheduler ) RegisterFromNodeAnnotations () {
159- klog .V (5 ).Infoln ("Scheduler into RegisterFromNodeAnnotations" )
159+ klog .InfoS ("Entering RegisterFromNodeAnnotations" )
160+ defer klog .InfoS ("Exiting RegisterFromNodeAnnotations" )
160161 ticker := time .NewTicker (time .Second * 15 )
162+ defer ticker .Stop ()
161163 printedLog := map [string ]bool {}
162164 for {
163165 select {
164166 case <- s .nodeNotify :
167+ klog .InfoS ("Received node notification" )
165168 case <- ticker .C :
169+ klog .InfoS ("Ticker triggered" )
166170 case <- s .stopCh :
171+ klog .InfoS ("Received stop signal, exiting RegisterFromNodeAnnotations" )
167172 return
168173 }
169174 labelSelector := labels .Everything ()
170175 if len (config .NodeLabelSelector ) > 0 {
171176 labelSelector = (labels .Set )(config .NodeLabelSelector ).AsSelector ()
177+ klog .InfoS ("Using label selector" , "selector" , labelSelector .String ())
172178 }
173179 rawNodes , err := s .nodeLister .List (labelSelector )
174180 if err != nil {
175- klog .Errorln ( "nodes list failed " , err . Error ())
181+ klog .ErrorS ( err , "Failed to list nodes with selector " , "selector" , labelSelector . String ())
176182 continue
177183 }
184+ klog .InfoS ("Listed nodes" , "nodeCount" , len (rawNodes ))
178185 var nodeNames []string
179186 for _ , val := range rawNodes {
180187 nodeNames = append (nodeNames , val .Name )
188+ klog .InfoS ("Processing node" , "nodeName" , val .Name )
189+
181190 for devhandsk , devInstance := range device .GetDevices () {
191+ klog .InfoS ("Checking device health" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
192+
182193 health , needUpdate := devInstance .CheckHealth (devhandsk , val )
183- klog .V (5 ).InfoS ("device check health" , "node" , val .Name , "deviceVendor" , devhandsk , "health" , health , "needUpdate" , needUpdate )
194+ klog .InfoS ("Device health check result" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "health" , health , "needUpdate" , needUpdate )
195+
184196 if ! health {
197+ klog .Warning ("Device is unhealthy, cleaning up node" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
185198 err := devInstance .NodeCleanUp (val .Name )
186- // If the device is not healthy, the device is removed from the node.
187- // At the same time, this node needs to be removed from the cache.
188199 if err != nil {
189- klog .Errorln ( "node cleanup failed" , err . Error () )
200+ klog .ErrorS ( err , "Node cleanup failed" , "nodeName" , val . Name , "deviceVendor" , devhandsk )
190201 }
202+
191203 info , ok := s .nodes [val .Name ]
192204 if ok {
193- klog .Infof ( "node %v device %s:%v leave, %v remaining devices:%v" , val .Name , devhandsk , info . ID , err , s .nodes [val .Name ].Devices )
205+ klog .InfoS ( "Removing device from node" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "remainingDevices" , s .nodes [val .Name ].Devices )
194206 s .rmNodeDevice (val .Name , info , devhandsk )
195- continue
196207 }
208+ continue
197209 }
198210 if ! needUpdate {
211+ klog .InfoS ("No update needed for device" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
199212 continue
200213 }
201214 _ , ok := util .HandshakeAnnos [devhandsk ]
202215 if ok {
203216 tmppat := make (map [string ]string )
204217 tmppat [util.HandshakeAnnos [devhandsk ]] = "Requesting_" + time .Now ().Format (time .DateTime )
205- klog .V ( 5 ). InfoS ("New timestamp" , util .HandshakeAnnos [devhandsk ], tmppat [util.HandshakeAnnos [devhandsk ]], "nodeName" , val . Name )
218+ klog .InfoS ("New timestamp for annotation " , "nodeName" , val . Name , "annotationKey" , util .HandshakeAnnos [devhandsk ], "annotationValue" , tmppat [util.HandshakeAnnos [devhandsk ]])
206219 n , err := util .GetNode (val .Name )
207220 if err != nil {
208- klog .Errorln ( " get node failed" , err . Error () )
221+ klog .ErrorS ( err , "Failed to get node" , "nodeName" , val . Name )
209222 continue
210223 }
211- util .PatchNodeAnnotations (n , tmppat )
224+ klog .InfoS ("Patching node annotations" , "nodeName" , val .Name , "annotations" , tmppat )
225+ if err := util .PatchNodeAnnotations (n , tmppat ); err != nil {
226+ klog .ErrorS (err , "Failed to patch node annotations" , "nodeName" , val .Name )
227+ }
212228 }
213-
214229 nodeInfo := & util.NodeInfo {}
215230 nodeInfo .ID = val .Name
216231 nodeInfo .Node = val
232+ klog .InfoS ("Fetching node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
217233 nodedevices , err := devInstance .GetNodeDevices (* val )
218234 if err != nil {
235+ klog .ErrorS (err , "Failed to get node devices" , "nodeName" , val .Name , "deviceVendor" , devhandsk )
219236 continue
220237 }
221238 nodeInfo .Devices = make ([]util.DeviceInfo , 0 )
@@ -225,17 +242,17 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
225242 s .addNode (val .Name , nodeInfo )
226243 if s .nodes [val .Name ] != nil && len (nodeInfo .Devices ) > 0 {
227244 if printedLog [val .Name ] {
228- klog .Infof ("node %v device %s come node info=%s,%v total=%v" , val .Name , devhandsk , nodeInfo .ID , nodeInfo .Devices , s .nodes [val .Name ].Devices )
229- printedLog [val .Name ] = true
245+ klog .InfoS ("Node device updated" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
230246 } else {
231- klog .V (5 ).Infof ("node %v device %s come node info=%s,%v total=%v" , val .Name , devhandsk , nodeInfo .ID , nodeInfo .Devices , s .nodes [val .Name ].Devices )
247+ klog .InfoS ("Node device added" , "nodeName" , val .Name , "deviceVendor" , devhandsk , "nodeInfo" , nodeInfo , "totalDevices" , s .nodes [val .Name ].Devices )
248+ printedLog [val .Name ] = true
232249 }
233250 }
234251 }
235252 }
236253 _ , _ , err = s .getNodesUsage (& nodeNames , nil )
237254 if err != nil {
238- klog .Errorln ( " get node usage failed" , err . Error () )
255+ klog .ErrorS ( err , "Failed to get node usage" , "nodeNames" , nodeNames )
239256 }
240257 }
241258}
@@ -377,62 +394,63 @@ func (s *Scheduler) getPodUsage() (map[string]PodUseDeviceStat, error) {
377394}
378395
379396func (s * Scheduler ) Bind (args extenderv1.ExtenderBindingArgs ) (* extenderv1.ExtenderBindingResult , error ) {
380- klog .InfoS ("Bind" , "pod" , args .PodName , "namespace" , args .PodNamespace , "podUID" , args .PodUID , "node" , args .Node )
381- var err error
397+ klog .InfoS ("Attempting to bind pod to node" , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
382398 var res * extenderv1.ExtenderBindingResult
399+
383400 binding := & corev1.Binding {
384401 ObjectMeta : metav1.ObjectMeta {Name : args .PodName , UID : args .PodUID },
385402 Target : corev1.ObjectReference {Kind : "Node" , Name : args .Node },
386403 }
387404 current , err := s .kubeClient .CoreV1 ().Pods (args .PodNamespace ).Get (context .Background (), args .PodName , metav1.GetOptions {})
388405 if err != nil {
389- klog .ErrorS (err , "Get pod failed" )
406+ klog .ErrorS (err , "Failed to get pod" , "pod" , args .PodName , "namespace" , args .PodNamespace )
407+ return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
390408 }
391-
409+ klog . InfoS ( "Trying to get the target node for pod" , "pod" , args . PodName , "namespace" , args . PodNamespace , "node" , args . Node )
392410 node , err := s .kubeClient .CoreV1 ().Nodes ().Get (context .Background (), args .Node , metav1.GetOptions {})
393411 if err != nil {
394412 klog .ErrorS (err , "Failed to get node" , "node" , args .Node )
395- s .recordScheduleBindingResultEvent (current , EventReasonBindingFailed , []string {}, fmt .Errorf ("failed to get node %v" , args .Node ))
396- res = & extenderv1.ExtenderBindingResult {
397- Error : err .Error (),
398- }
413+ s .recordScheduleBindingResultEvent (current , EventReasonBindingFailed , []string {}, fmt .Errorf ("failed to get node %s" , args .Node ))
414+ res = & extenderv1.ExtenderBindingResult {Error : err .Error ()}
399415 return res , nil
400416 }
401- tmppatch := make (map [string ]string )
417+
418+ tmppatch := map [string ]string {
419+ util .DeviceBindPhase : "allocating" ,
420+ util .BindTimeAnnotations : strconv .FormatInt (time .Now ().Unix (), 10 ),
421+ }
422+
402423 for _ , val := range device .GetDevices () {
403424 err = val .LockNode (node , current )
404425 if err != nil {
426+ klog .ErrorS (err , "Failed to lock node" , "node" , args .Node , "device" , val )
405427 goto ReleaseNodeLocks
406428 }
407429 }
408430
409- tmppatch [util .DeviceBindPhase ] = "allocating"
410- tmppatch [util .BindTimeAnnotations ] = strconv .FormatInt (time .Now ().Unix (), 10 )
411-
412431 err = util .PatchPodAnnotations (current , tmppatch )
413432 if err != nil {
414- klog .ErrorS (err , "patch pod annotation failed" )
415- }
416- if err = s .kubeClient .CoreV1 ().Pods (args .PodNamespace ).Bind (context .Background (), binding , metav1.CreateOptions {}); err != nil {
417- klog .ErrorS (err , "Failed to bind pod" , "pod" , args .PodName , "namespace" , args .PodNamespace , "podUID" , args .PodUID , "node" , args .Node )
433+ klog .ErrorS (err , "Failed to patch pod annotations" , "pod" , klog .KObj (current ))
434+ return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, err
418435 }
419- if err == nil {
420- s .recordScheduleBindingResultEvent (current , EventReasonBindingSucceed , []string {args .Node }, nil )
421- res = & extenderv1.ExtenderBindingResult {
422- Error : "" ,
423- }
424- klog .InfoS ("bind success" , "pod" , args .PodName , "namespace" , args .PodNamespace , "podUID" , args .PodUID , "node" , args .Node )
425- return res , nil
436+
437+ err = s .kubeClient .CoreV1 ().Pods (args .PodNamespace ).Bind (context .Background (), binding , metav1.CreateOptions {})
438+ if err != nil {
439+ klog .ErrorS (err , "Failed to bind pod" , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
440+ goto ReleaseNodeLocks
426441 }
442+
443+ s .recordScheduleBindingResultEvent (current , EventReasonBindingSucceed , []string {args .Node }, nil )
444+ klog .InfoS ("Successfully bound pod to node" , "pod" , args .PodName , "namespace" , args .PodNamespace , "node" , args .Node )
445+ return & extenderv1.ExtenderBindingResult {Error : "" }, nil
446+
427447ReleaseNodeLocks:
428- klog .InfoS ("bind failed " , "err " , err . Error () )
448+ klog .InfoS ("Release node locks " , "node " , args . Node )
429449 for _ , val := range device .GetDevices () {
430450 val .ReleaseNodeLock (node , current )
431451 }
432452 s .recordScheduleBindingResultEvent (current , EventReasonBindingFailed , []string {}, err )
433- return & extenderv1.ExtenderBindingResult {
434- Error : err .Error (),
435- }, nil
453+ return & extenderv1.ExtenderBindingResult {Error : err .Error ()}, nil
436454}
437455
438456func (s * Scheduler ) Filter (args extenderv1.ExtenderArgs ) (* extenderv1.ExtenderFilterResult , error ) {
0 commit comments