Skip to content

Commit ea2ca21

Browse files
Move machine deletions out of any threads.
1 parent 1217470 commit ea2ca21

File tree

1 file changed

+20
-15
lines changed

1 file changed

+20
-15
lines changed

modules/machinery/az.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@ class Azure(Machinery):
104104
WINDOWS_PLATFORM = "windows"
105105
LINUX_PLATFORM = "linux"
106106

107+
# Statuses for machines
108+
ABSENT = "absent_vm"
109+
REIMAGE_FAILING = "failed_reimaging"
110+
107111
def set_options(self, options: dict) -> None:
108112
"""Set machine manager options.
109113
@param options: machine manager options dict.
@@ -454,17 +458,11 @@ def start(self, label=None):
454458
# Something bad happened, we are starting a task on a machine that needs to be deleted
455459
with vms_currently_being_deleted_lock:
456460
if label in vms_currently_being_deleted:
457-
err_msg = (
458-
f"Attempting to start a task with machine {label} while it is scheduled for deletion."
459-
f"Reassigning the task and removing {label} from the database."
460-
)
461-
log.error(err_msg)
462-
raise CuckooMachineError(err_msg)
461+
raise CuckooMachineError(f"Attempting to start a task with machine {label} while it is scheduled for deletion.")
463462

464463
def stop(self, label=None):
465464
"""
466-
If the VMSS is in the "scaling-down" state, delete machine,
467-
otherwise reimage it.
465+
If the VMSS is NOT in the "scaling-down" state, reimage the machine.
468466
@param label: virtual machine label
469467
@return: End method call
470468
"""
@@ -486,8 +484,16 @@ def stop(self, label=None):
486484
label_in_reimage_vm_list = label in [f"{vm['vmss']}_{vm['id']}" for vm in reimage_vm_list]
487485

488486
def release(self, machine: Machine):
487+
"""
488+
Delete machine if its VMSS is in the "scaling-down" state, it was found to be absent from its VMSS during
489+
reimaging, or reimaging timed out.
490+
Otherwise, release the successfully reimaged machine.
491+
@param machine: the machine to delete or release.
492+
"""
489493
vmss_name = machine.label.split("_")[0]
490-
if machine_pools[vmss_name]["is_scaling_down"]:
494+
if machine.status == Azure.ABSENT:
495+
self.delete_machine(machine.label, delete_from_vmss=False)
496+
elif machine.status == Azure.REIMAGE_FAILING or machine_pools[vmss_name]["is_scaling_down"]:
491497
self.delete_machine(machine.label)
492498
else:
493499
_ = super(Azure, self).release(machine)
@@ -1282,8 +1288,7 @@ def _thr_reimage_list_reader(self):
12821288
vms_currently_being_deleted.append(f"{vmss_to_reimage}_{instance_id}")
12831289
with delete_lock:
12841290
delete_vm_list.append({"vmss": vmss_to_reimage, "id": instance_id, "time_added": time.time()})
1285-
1286-
self.delete_machine(f"{vmss_to_reimage}_{instance_id}", delete_from_vmss=False)
1291+
self.set_status(f"{vmss_to_reimage}_{instance_id}", Azure.ABSENT)
12871292
vms_currently_being_reimaged.remove(f"{vmss_to_reimage}_{instance_id}")
12881293
instance_ids.remove(instance_id)
12891294

@@ -1299,12 +1304,12 @@ def _thr_reimage_list_reader(self):
12991304
if (timeit.default_timer() - start_time) > AZURE_TIMEOUT:
13001305
reimaged = False
13011306

1302-
log.debug(
1307+
log.warning(
13031308
f"Reimaging machines {instance_ids} in {vmss_to_reimage} took too long, deleting them from the DB and the VMSS."
13041309
)
1305-
# That sucks, now we have to delete each one
1310+
# That sucks, now we have to mark each one for deletion
13061311
for instance_id in instance_ids:
1307-
self.delete_machine(f"{vmss_to_reimage}_{instance_id}")
1312+
self.set_status(f"{vmss_to_reimage}_{instance_id}", Azure.REIMAGE_FAILING)
13081313
break
13091314
time.sleep(2)
13101315

@@ -1373,7 +1378,7 @@ def _thr_delete_list_reader(self):
13731378
while not async_delete_some_machines.done():
13741379
deleted = True
13751380
if (timeit.default_timer() - start_time) > AZURE_TIMEOUT:
1376-
log.debug(f"Deleting machines {instance_ids} in {vmss_to_delete_from} took too long.")
1381+
log.warning(f"Deleting machines {instance_ids} in {vmss_to_delete_from} took too long.")
13771382
deleted = False
13781383
break
13791384
time.sleep(2)

0 commit comments

Comments
 (0)