Skip to content

Commit ae69060

Browse files
Handle existing, locked machines at start up. Use thread-agnostic machine deletion on reimage failures.
1 parent a14027a commit ae69060

File tree

1 file changed

+37
-12
lines changed

1 file changed

+37
-12
lines changed

modules/machinery/az.py

Lines changed: 37 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,10 @@
6969
# This is a hard cap of 4, given the maximum preemption chain length of 4
7070
MAX_CONCURRENT_VMSS_OPERATIONS = 4
7171

72+
# These global lists will be used for maintaining lists of machines that failed during reimaging
73+
vms_absent_from_vmss = []
74+
vms_timed_out_being_reimaged = []
75+
7276
# These global lists will be used for maintaining lists of ongoing operations on specific machines
7377
vms_currently_being_reimaged = []
7478
vms_currently_being_deleted = []
@@ -104,10 +108,6 @@ class Azure(Machinery):
104108
WINDOWS_PLATFORM = "windows"
105109
LINUX_PLATFORM = "linux"
106110

107-
# Statuses for machines
108-
ABSENT = "absent_vm"
109-
REIMAGE_FAILING = "failed_reimaging"
110-
111111
def set_options(self, options: dict) -> None:
112112
"""Set machine manager options.
113113
@param options: machine manager options dict.
@@ -118,11 +118,20 @@ def set_options(self, options: dict) -> None:
118118
if not isinstance(mmanager_opts["scale_sets"], list):
119119
mmanager_opts["scale_sets"] = str(mmanager_opts["scale_sets"]).strip().split(",")
120120

121+
def initialize(self):
122+
"""
123+
Overloading abstracts.py:initialize()
124+
"""
125+
# Load.
126+
self._initialize()
127+
128+
# Run initialization checks.
129+
self._initialize_check()
130+
121131
def _initialize(self):
122132
"""
123133
Overloading abstracts.py:_initialize()
124134
Read configuration.
125-
@param module_name: module name
126135
@raise CuckooDependencyError: if there is a problem with the dependencies call
127136
"""
128137
mmanager_opts = self.options.get(self.module_name)
@@ -285,7 +294,8 @@ def _set_vmss_stage(self):
285294

286295
self._process_pre_existing_vmsss()
287296
self._check_cpu_cores()
288-
self._get_or_upsert_vmsss(self.required_vmsss)
297+
self._update_or_create_vmsss(self.required_vmsss)
298+
self._check_locked_machines()
289299
self._create_batch_threads()
290300

291301
def _process_pre_existing_vmsss(self):
@@ -403,7 +413,7 @@ def _check_cpu_cores(self):
403413
else:
404414
self.instance_type_cpus = self.options.az.instance_type_cores
405415

406-
def _get_or_upsert_vmsss(self, vmsss_dict):
416+
def _update_or_create_vmsss(self, vmsss_dict):
407417
"""
408418
Reimage or scale up existing VMSSs. Create non-existent required VMSSs.
409419
"""
@@ -432,6 +442,17 @@ def _get_or_upsert_vmsss(self, vmsss_dict):
432442
for thr in vmss_reimage_threads + vmss_creation_threads:
433443
thr.join()
434444

445+
def _check_locked_machines(self):
446+
"""
447+
In the case of CAPE unexpectedly restarting, release any locked machines.
448+
They will have been reimaged and their tasks rescheduled before reaching this code.
449+
"""
450+
running = self.running()
451+
if len(running) > 0:
452+
log.info("%d machines found locked on initialize, unlocking.", len(running))
453+
for machine in running:
454+
self.db.unlock_machine(machine)
455+
435456
def _create_batch_threads(self):
436457
"""
437458
Create batch reimage and delete threads.
@@ -491,9 +512,13 @@ def release(self, machine: Machine):
491512
@param machine: machine to release.
492513
"""
493514
vmss_name = machine.label.split("_")[0]
494-
if machine.status == Azure.ABSENT:
515+
if machine.label in vms_absent_from_vmss:
495516
self.delete_machine(machine.label, delete_from_vmss=False)
496-
elif machine.status == Azure.REIMAGE_FAILING or machine_pools[vmss_name]["is_scaling_down"]:
517+
vms_absent_from_vmss.remove(machine.label)
518+
elif machine.label in vms_timed_out_being_reimaged:
519+
self.delete_machine(machine.label)
520+
vms_timed_out_being_reimaged.remove(machine.label)
521+
elif machine_pools[vmss_name]["is_scaling_down"]:
497522
self.delete_machine(machine.label)
498523
else:
499524
_ = super(Azure, self).release(machine)
@@ -640,7 +665,7 @@ def _add_machines_to_db(self, vmss_name):
640665
):
641666
# VMs not deleted from VMSS yet.
642667
continue
643-
self._get_or_upsert_vmsss(vmsss_dict={vmss_name: self.required_vmsss[vmss_name]})
668+
self._update_or_create_vmsss(vmsss_dict={vmss_name: self.required_vmsss[vmss_name]})
644669
return
645670
log.debug(f"{vmss_name} initialize retry failed. Timed out waiting for VMs to be deleted.")
646671

@@ -1288,7 +1313,7 @@ def _thr_reimage_list_reader(self):
12881313
vms_currently_being_deleted.append(f"{vmss_to_reimage}_{instance_id}")
12891314
with delete_lock:
12901315
delete_vm_list.append({"vmss": vmss_to_reimage, "id": instance_id, "time_added": time.time()})
1291-
self.set_status(f"{vmss_to_reimage}_{instance_id}", Azure.ABSENT)
1316+
vms_absent_from_vmss.append(f"{vmss_to_reimage}_{instance_id}")
12921317
vms_currently_being_reimaged.remove(f"{vmss_to_reimage}_{instance_id}")
12931318
instance_ids.remove(instance_id)
12941319

@@ -1309,7 +1334,7 @@ def _thr_reimage_list_reader(self):
13091334
)
13101335
# That sucks, now we have to mark each one for deletion
13111336
for instance_id in instance_ids:
1312-
self.set_status(f"{vmss_to_reimage}_{instance_id}", Azure.REIMAGE_FAILING)
1337+
vms_timed_out_being_reimaged.append(f"{vmss_to_reimage}_{instance_id}")
13131338
break
13141339
time.sleep(2)
13151340

0 commit comments

Comments
 (0)