from typing import Collection
from typing import Optional

- from kubernetes import watch
+ from kubernetes import watch as kube_watch
from kubernetes.client import V1Affinity
from kubernetes.client import V1Container
from kubernetes.client import V1ContainerPort
@@ -72,13 +72,22 @@ def __init__(
        kubeconfig_path: Optional[str] = None,
        task_configs: Optional[Collection[KubernetesTaskConfig]] = [],
        emit_events_without_state_transitions: bool = False,
+       # kubeconfigs used to continue to watch other clusters
+       # Used when transitioning to a new cluster in the primary kubeconfig_path to continue watching still-running pods on other clusters
+       watcher_kubeconfig_paths: Collection[str] = (),
    ) -> None:
        if not version:
            version = "unknown_task_processing"
        user_agent = f"{namespace}/v{version}"
        self.kube_client = KubeClient(
            kubeconfig_path=kubeconfig_path, user_agent=user_agent
        )
+
+       self.watcher_kube_clients = [
+           KubeClient(kubeconfig_path=watcher_kubeconfig_path, user_agent=user_agent)
+           for watcher_kubeconfig_path in watcher_kubeconfig_paths
+       ]
+
        self.namespace = namespace

        # Pod modified events that did not result in a pod state transition are usually not
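The constructor now accepts `watcher_kubeconfig_paths` and builds one extra `KubeClient` per path. A rough caller-side sketch of a cluster cutover follows; the module path, namespace, and kubeconfig file names are illustrative assumptions, not taken from this commit:

```python
# Hypothetical usage sketch: launch new pods against the new primary cluster
# while continuing to watch pods that are still running on the old one.
from task_processing.plugins.kubernetes.kubernetes_pod_executor import (  # assumed module path
    KubernetesPodExecutor,
)

executor = KubernetesPodExecutor(
    namespace="paasta",                                   # illustrative namespace
    kubeconfig_path="/etc/kubernetes/new-primary.conf",   # cluster where new pods are launched
    watcher_kubeconfig_paths=(
        "/etc/kubernetes/old-primary.conf",               # still-running pods are watched here
    ),
)
```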
@@ -106,17 +115,23 @@ def __init__(

        # TODO(TASKPROC-243): keep track of resourceVersion so that we can continue event processing
        # from where we left off on restarts
-       self.watch = watch.Watch()
-       self.pod_event_watch_thread = threading.Thread(
-           target=self._pod_event_watch_loop,
-           # ideally this wouldn't be a daemon thread, but a watch.Watch() only checks
-           # if it should stop after receiving an event - and it's possible that we
-           # have periods with no events so instead we'll attempt to stop the watch
-           # and then join() with a small timeout to make sure that, if we shutdown
-           # with the thread alive, we did not drop any events
-           daemon=True,
-       )
-       self.pod_event_watch_thread.start()
+       self.pod_event_watch_threads = []
+       self.watches = []
+       for kube_client in [self.kube_client] + self.watcher_kube_clients:
+           watch = kube_watch.Watch()
+           pod_event_watch_thread = threading.Thread(
+               target=self._pod_event_watch_loop,
+               args=(kube_client, watch),
+               # ideally this wouldn't be a daemon thread, but a watch.Watch() only checks
+               # if it should stop after receiving an event - and it's possible that we
+               # have periods with no events so instead we'll attempt to stop the watch
+               # and then join() with a small timeout to make sure that, if we shutdown
+               # with the thread alive, we did not drop any events
+               daemon=True,
+           )
+           pod_event_watch_thread.start()
+           self.pod_event_watch_threads.append(pod_event_watch_thread)
+           self.watches.append(watch)

        self.pending_event_processing_thread = threading.Thread(
            target=self._pending_event_processing_loop,
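Below is a minimal, self-contained sketch of the same fan-out pattern using the official `kubernetes` client directly: one `Watch` and one daemon thread per cluster, all pushing events onto a shared queue. The kubeconfig paths, namespace, and queue handoff are illustrative and not part of this change:

```python
import queue
import threading

from kubernetes import client, config, watch

pod_events: queue.Queue = queue.Queue()

def watch_cluster(core: client.CoreV1Api, w: watch.Watch, namespace: str) -> None:
    # stream() blocks until the next event arrives, so each cluster gets its own thread
    for event in w.stream(core.list_namespaced_pod, namespace):
        pod_events.put(event)

threads, watches = [], []
for kubeconfig in ("/etc/kubernetes/new-primary.conf", "/etc/kubernetes/old-primary.conf"):
    api_client = config.new_client_from_config(config_file=kubeconfig)
    core = client.CoreV1Api(api_client=api_client)
    w = watch.Watch()
    t = threading.Thread(target=watch_cluster, args=(core, w, "paasta"), daemon=True)
    t.start()
    threads.append(t)
    watches.append(w)
```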
@@ -143,7 +158,9 @@ def _initialize_existing_task(self, task_config: KubernetesTaskConfig) -> None:
            ),
        )

-   def _pod_event_watch_loop(self) -> None:
+   def _pod_event_watch_loop(
+       self, kube_client: KubeClient, watch: kube_watch.Watch
+   ) -> None:
        logger.debug(f"Starting watching Pod events for namespace={self.namespace}.")
        # TODO(TASKPROC-243): we'll need to correctly handle resourceVersion expiration for the case
        # where the gap between task_proc shutting down and coming back up is long enough for data
@@ -155,8 +172,8 @@ def _pod_event_watch_loop(self) -> None:
        # see: https://github.com/kubernetes/kubernetes/issues/74022
        while not self.stopping:
            try:
-               for pod_event in self.watch.stream(
-                   self.kube_client.core.list_namespaced_pod, self.namespace
+               for pod_event in watch.stream(
+                   kube_client.core.list_namespaced_pod, self.namespace
                ):
                    # it's possible that we've received an event after we've already set the stop
                    # flag since Watch streams block forever, so re-check if we've stopped before
@@ -168,7 +185,7 @@ def _pod_event_watch_loop(self) -> None:
                        break
            except ApiException as e:
                if not self.stopping:
-                   if not self.kube_client.maybe_reload_on_exception(exception=e):
+                   if not kube_client.maybe_reload_on_exception(exception=e):
                        logger.exception(
                            "Unhandled API exception while watching Pod events - restarting watch!"
                        )
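For reference, each item yielded by `Watch.stream()` over `list_namespaced_pod` is a dict containing the event type and a deserialized `V1Pod`, regardless of which cluster's client produced it. A small helper (not part of the diff) showing the shape the loop consumes:

```python
from kubernetes.client import V1Pod

def summarize_pod_event(pod_event: dict) -> str:
    # event types from the watch stream are ADDED, MODIFIED, or DELETED
    pod: V1Pod = pod_event["object"]
    return f"{pod_event['type']}: pod {pod.metadata.name} is in phase {pod.status.phase}"
```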
@@ -589,11 +606,18 @@ def run(self, task_config: KubernetesTaskConfig) -> Optional[str]:

    def reconcile(self, task_config: KubernetesTaskConfig) -> None:
        pod_name = task_config.pod_name
-       try:
-           pod = self.kube_client.get_pod(namespace=self.namespace, pod_name=pod_name)
-       except Exception:
-           logger.exception(f"Hit an exception attempting to fetch pod {pod_name}")
-           pod = None
+       pod = None
+       for kube_client in [self.kube_client] + self.watcher_kube_clients:
+           try:
+               pod = kube_client.get_pod(namespace=self.namespace, pod_name=pod_name)
+           except Exception:
+               logger.exception(
+                   f"Hit an exception attempting to fetch pod {pod_name} from {kube_client.kubeconfig_path}"
+               )
+           else:
+               # kube_client.get_pod will return None with no exception if it sees a 404 from API
+               if pod:
+                   break

        if pod_name not in self.task_metadata:
            self._initialize_existing_task(task_config)
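The new `reconcile` logic reduces to a first-found-wins lookup across the primary client and the watcher clients. A standalone sketch of that pattern (the helper itself is hypothetical; `get_pod` returning `None` on a 404 mirrors the comment in the diff):

```python
from typing import Optional, Sequence

def find_pod_across_clusters(
    kube_clients: Sequence["KubeClient"],  # primary client first, then watcher clients
    namespace: str,
    pod_name: str,
) -> Optional["V1Pod"]:
    for kube_client in kube_clients:
        try:
            pod = kube_client.get_pod(namespace=namespace, pod_name=pod_name)
        except Exception:
            continue  # an API error on one cluster should not stop the search
        if pod:  # get_pod returns None (without raising) on a 404
            return pod
    return None
```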
@@ -640,9 +664,12 @@ def kill(self, task_id: str) -> bool:
        This function will request that Kubernetes delete the named Pod and will return
        True if the Pod termination request was succesfully emitted or False otherwise.
        """
-       terminated = self.kube_client.terminate_pod(
-           namespace=self.namespace,
-           pod_name=task_id,
+       terminated = any(
+           kube_client.terminate_pod(
+               namespace=self.namespace,
+               pod_name=task_id,
+           )
+           for kube_client in [self.kube_client] + self.watcher_kube_clients
        )
        if terminated:
            logger.info(
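Because `any()` consumes a lazy generator, termination requests are issued cluster by cluster and stop at the first client that reports success. The hypothetical expansion below is behaviourally equivalent to the new `kill` body:

```python
def kill_across_clusters(kube_clients, namespace: str, task_id: str) -> bool:
    # try the primary cluster first, then each watcher cluster; short-circuit on success
    for kube_client in kube_clients:
        if kube_client.terminate_pod(namespace=namespace, pod_name=task_id):
            return True
    return False
```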
@@ -678,12 +705,14 @@ def stop(self) -> None:
        logger.debug("Signaling Pod event Watch to stop streaming events...")
        # make sure that we've stopped watching for events before calling join() - otherwise,
        # join() will block until we hit the configured timeout (or forever with no timeout).
-       self.watch.stop()
+       for watch in self.watches:
+           watch.stop()
        # timeout arbitrarily chosen - we mostly just want to make sure that we have a small
        # grace period to flush the current event to the pending_events queue as well as
        # any other clean-up - it's possible that after this join() the thread is still alive
        # but in that case we can be reasonably sure that we're not dropping any data.
-       self.pod_event_watch_thread.join(timeout=POD_WATCH_THREAD_JOIN_TIMEOUT_S)
+       for pod_event_watch_thread in self.pod_event_watch_threads:
+           pod_event_watch_thread.join(timeout=POD_WATCH_THREAD_JOIN_TIMEOUT_S)

        logger.debug("Waiting for all pending PodEvents to be processed...")
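Shutdown follows the same one-per-cluster shape: every `Watch` is signalled to stop before any thread is joined, and each join uses a bounded timeout so a stream waiting on its next event cannot hold up shutdown indefinitely. A condensed sketch (the 1.0s timeout is illustrative; the executor uses `POD_WATCH_THREAD_JOIN_TIMEOUT_S`):

```python
import threading
from typing import Iterable

from kubernetes import watch

def stop_watchers(
    watches: Iterable[watch.Watch],
    threads: Iterable[threading.Thread],
    join_timeout_s: float = 1.0,
) -> None:
    # signal every Watch first so no join() waits on a stream we have not asked to stop
    for w in watches:
        w.stop()
    for t in threads:
        t.join(timeout=join_timeout_s)
```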
0 commit comments