Skip to content

Commit abe9f77

Browse files
Protecting against initial total machine failures with initialization retries.
1 parent 78b8ddf commit abe9f77

File tree

2 files changed

+99
-47
lines changed

2 files changed

+99
-47
lines changed

conf/default/az.conf.default

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,9 @@ spot_instances = false
118118
# start pulling tasks off of the stack
119119
wait_for_agent_before_starting = true
120120

121+
# This integer value is used to determine how many times a VMSS that does not initialize properly can retry
122+
init_retries = 2
123+
121124
# These are the value(s) of the DNS server(s) that you want the scale sets to use. (E.g. 1.1.1.1,8.8.8.8)
122125
# NOTE: NO SPACES
123126
dns_server_ips = <dns_server_ip>

modules/machinery/az.py

Lines changed: 96 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ def set_options(self, options: dict) -> None:
110110
@param options: machine manager options dict.
111111
"""
112112
self.options = options
113+
# Using "scale_sets" here instead of "machines" to avoid KeyError
114+
mmanager_opts = self.options.get(self.module_name)
115+
if not isinstance(mmanager_opts["scale_sets"], list):
116+
mmanager_opts["scale_sets"] = str(mmanager_opts["scale_sets"]).strip().split(",")
113117

114118
def _initialize(self):
115119
"""
@@ -118,11 +122,7 @@ def _initialize(self):
118122
@param module_name: module name
119123
@raise CuckooDependencyError: if there is a problem with the dependencies call
120124
"""
121-
# Using "scale_sets" here instead of "machines" to avoid KeyError
122125
mmanager_opts = self.options.get(self.module_name)
123-
if not isinstance(mmanager_opts["scale_sets"], list):
124-
mmanager_opts["scale_sets"] = str(mmanager_opts["scale_sets"]).strip().split(",")
125-
126126
# Replace a list of IDs with dictionary representations
127127
scale_sets = mmanager_opts.pop("scale_sets")
128128
mmanager_opts["scale_sets"] = {}
@@ -150,7 +150,7 @@ def _initialize(self):
150150
# Insert the scale_set_opts into the module.scale_sets attribute
151151
mmanager_opts["scale_sets"][scale_set_id] = scale_set_opts
152152

153-
except (AttributeError, CuckooOperationalError) as e:
153+
except (AttributeError, CuckooCriticalError) as e:
154154
log.warning(f"Configuration details about scale set {scale_set_id.strip()} are missing: {e}")
155155
continue
156156

@@ -168,19 +168,21 @@ def _initialize_check(self):
168168

169169
# We will be using this as a source of truth for the VMSS configs
170170
self.required_vmsss = {
171-
vmss_name: {"exists": False, "image": None, "platform": None, "tag": None, "initial_pool_size": None}
171+
vmss_name: {
172+
"exists": False, "image": None, "platform": None, "tag": None, "initial_pool_size": None, "retries": self.options.az.init_retries
173+
}
172174
for vmss_name in self.options.az.scale_sets
173175
}
174176

175177
# Starting the thread that sets API clients periodically
176178
self._thr_refresh_clients()
177179

178-
# Starting the thread that scales the machine pools periodically
179-
self._thr_machine_pool_monitor()
180-
181180
# Initialize the VMSSs that we will be using and not using
182181
self._set_vmss_stage()
183182

183+
# Starting the thread that scales the machine pools periodically
184+
self._thr_machine_pool_monitor()
185+
184186
# Set the flag that indicates that the system is not initializing
185187
self.initializing = False
186188

@@ -189,7 +191,6 @@ def _get_credentials(self):
189191
Used to instantiate the Azure ClientSecretCredential object.
190192
@return: an Azure ClientSecretCredential object
191193
"""
192-
193194
credentials = None
194195
if self.options.az.secret and self.options.az.secret != "<secret>":
195196
# Instantiates the ClientSecretCredential object using
@@ -279,6 +280,19 @@ def _set_vmss_stage(self):
279280
elif required_vmss_values["initial_pool_size"] is None:
280281
raise CuckooCriticalError(f"The VMSS '{required_vmss_name}' does not have an initial pool size.")
281282

283+
self._process_existing_vmsss()
284+
self._process_pre_existing_vmsss(self.required_vmsss)
285+
self._create_batch_threads()
286+
287+
def _process_pre_existing_vmsss(self):
288+
"""
289+
Delete a VMSS if it does NOT have:
290+
- the expected tag AND has one of the required names for a VMSS we plan to create
291+
- one of the required names AND has the expected tag AND az.config's multiple_capes_in_sandbox_rg is 'false'
292+
Update a VMSS if it:
293+
- does not have the required image reference
294+
- has a capacity (current size) different from its required 'initial_pool_size'
295+
"""
282296
# Get all VMSSs in Resource Group
283297
existing_vmsss = Azure._azure_api_call(
284298
self.options.az.sandbox_resource_group,
@@ -292,7 +306,7 @@ def _set_vmss_stage(self):
292306
# Cuckoo (AUTO_SCALE_CAPE key-value pair), ignore
293307
if not vmss.tags or not vmss.tags.get(Azure.AUTO_SCALE_CAPE_KEY) == Azure.AUTO_SCALE_CAPE_VALUE:
294308

295-
# Unless! They have one of the required names of the VMSSs that we are going to create
309+
# Ignoring... unless! They have one of the required names of the VMSSs that we are going to create
296310
if vmss.name in self.required_vmsss.keys():
297311
async_delete_vmss = Azure._azure_api_call(
298312
self.options.az.sandbox_resource_group,
@@ -352,22 +366,9 @@ def _set_vmss_stage(self):
352366
operation=self.compute_client.virtual_machine_scale_sets.begin_delete,
353367
)
354368

355-
try:
356-
self.subnet_id = Azure._azure_api_call(
357-
self.options.az.vnet_resource_group,
358-
self.options.az.vnet,
359-
self.options.az.subnet,
360-
operation=self.network_client.subnets.get,
361-
).id # note the id attribute here
362-
except CuckooMachineError:
363-
raise CuckooCriticalError(
364-
f"Subnet '{self.options.az.subnet}' does not exist in Virtual Network '{self.options.az.vnet}'"
365-
)
366-
367369
# Initialize the platform scaling state monitor
368370
is_platform_scaling.update({Azure.WINDOWS_PLATFORM: False, Azure.LINUX_PLATFORM: False})
369371

370-
# Let's get the number of CPUs associated with the SKU (instance_type)
371372
# If we want to programmatically determine the number of cores for the sku
372373
if self.options.az.find_number_of_cores_for_sku or self.options.az.instance_type_cores == 0:
373374
resource_skus = Azure._azure_api_call(
@@ -394,10 +395,14 @@ def _set_vmss_stage(self):
394395
else:
395396
self.instance_type_cpus = self.options.az.instance_type_cores
396397

397-
# Create required VMSSs that don't exist yet
398+
def _get_or_upsert_vmsss(self, vmsss_dict):
399+
"""
400+
Reimage or scale up existing VMSSs. Create non-existent required VMSSs.
401+
"""
402+
398403
vmss_creation_threads = []
399404
vmss_reimage_threads = []
400-
for vmss, vals in self.required_vmsss.items():
405+
for vmss, vals in vmsss_dict.items():
401406
if vals["exists"] and not self.options.az.just_start:
402407
if machine_pools[vmss]["size"] == 0:
403408
self._thr_scale_machine_pool(self.options.az.scale_sets[vmss].pool_tag, True if vals["platform"] else False),
@@ -419,6 +424,10 @@ def _set_vmss_stage(self):
419424
for thr in vmss_reimage_threads + vmss_creation_threads:
420425
thr.join()
421426

427+
def _create_batch_threads(self):
428+
"""
429+
Create batch reimage and delete threads.
430+
"""
422431
# Initialize the batch reimage threads. We want at most 4 batch reimaging threads
423432
# so that if no VMSS scaling or batch deleting is taking place (aka we are receiving constant throughput of
424433
# tasks and have the appropriate number of VMs created) then we'll perform batch reimaging at an optimal rate.
@@ -583,8 +592,8 @@ def _add_machines_to_db(self, vmss_name):
583592
resultserver_port=self.options.az.resultserver_port,
584593
reserved=False,
585594
)
586-
# When we aren't initializing the system, the machine will immediately become available in DB
587-
# When we are initializing, we're going to wait for the machine to be have the Cuckoo agent all set up
595+
# We always wait for Cuckoo agent to finish setting up if 'wait_for_agent_before_starting' is true or if we are initializing.
596+
# Else, the machine should become immediately available in DB.
588597
if self.initializing or self.options.az.wait_for_agent_before_starting:
589598
thr = threading.Thread(
590599
target=Azure._thr_wait_for_ready_machine,
@@ -607,6 +616,24 @@ def _add_machines_to_db(self, vmss_name):
607616
except Exception as e:
608617
log.error(repr(e), exc_info=True)
609618

619+
# If no machines on any VMSSs are in the db when we leave this method, CAPE will crash.
620+
if not self.machines() and self.required_vmsss[vmss_name]["retries"] > 0:
621+
log.warning(f"No available VMs after initializing {vmss_name}. Attempting to reinitialize VMSS.")
622+
self.required_vmsss[vmss_name]["retries"] -= 1
623+
start_time = timeit.default_timer()
624+
625+
while ((timeit.default_timer() - start_time) < 120):
626+
with vms_currently_being_deleted_lock:
627+
if any(
628+
failed_vm in vms_currently_being_deleted
629+
for failed_vm in ready_vmss_vm_threads
630+
):
631+
# VMs not deleted from VMSS yet.
632+
continue
633+
self._get_or_upsert_vmsss(vmsss_dict={vmss_name: self.required_vmsss[vmss_name]})
634+
return
635+
log.debug(f"{vmss_name} initialize retry failed. Timed out waiting for VMs to be deleted.")
636+
610637
def _delete_machines_from_db_if_missing(self, vmss_name):
611638
"""
612639
Delete machine from database if it does not exist in the VMSS.
@@ -636,8 +663,8 @@ def delete_machine(self, label, delete_from_vmss=True):
636663
super(Azure, self).delete_machine(label)
637664

638665
if delete_from_vmss:
639-
vmss_name, instance_id = label.split("_")
640666
# Only add vm to the lists if it isn't there already
667+
vmss_name, instance_id = label.split("_")
641668
with vms_currently_being_deleted_lock:
642669
if not label in vms_currently_being_deleted:
643670
vms_currently_being_deleted.append(label)
@@ -665,13 +692,13 @@ def _thr_wait_for_ready_machine(machine_name, machine_ip):
665692
log.debug(f"{machine_name}: Initializing...")
666693
except socket.error:
667694
log.debug(f"{machine_name}: Initializing...")
668-
time.sleep(10)
669-
670-
if timeit.default_timer() - start >= timeout:
671-
# We didn't do it :(
672-
raise CuckooGuestCriticalTimeout(
673-
f"Machine {machine_name}: the guest initialization hit the critical " "timeout, analysis aborted."
674-
)
695+
else:
696+
if (timeit.default_timer() - start) >= timeout:
697+
# We didn't do it :(
698+
raise CuckooGuestCriticalTimeout(
699+
f"Machine {machine_name}: the guest initialization hit the critical timeout, analysis aborted."
700+
)
701+
time.sleep(10)
675702
log.debug(f"Machine {machine_name} was created and available in {round(timeit.default_timer() - start)}s")
676703

677704
@staticmethod
@@ -723,6 +750,18 @@ def _thr_create_vmss(self, vmss_name, vmss_image_ref, vmss_image_os):
723750
@param vmss_tag: the tag used that represents the OS image
724751
"""
725752

753+
try:
754+
self.subnet_id = Azure._azure_api_call(
755+
self.options.az.vnet_resource_group,
756+
self.options.az.vnet,
757+
self.options.az.subnet,
758+
operation=self.network_client.subnets.get,
759+
).id # note the id attribute here
760+
except CuckooMachineError:
761+
raise CuckooCriticalError(
762+
f"Subnet '{self.options.az.subnet}' does not exist in Virtual Network '{self.options.az.vnet}'"
763+
)
764+
726765
vmss_managed_disk = models.VirtualMachineScaleSetManagedDiskParameters(
727766
storage_account_type=self.options.az.storage_account_type
728767
)
@@ -800,6 +839,7 @@ def _thr_create_vmss(self, vmss_name, vmss_image_ref, vmss_image_os):
800839
"is_scaling_down": False,
801840
"wait": False,
802841
}
842+
self.required_vmsss[vmss_name]["exists"] = True
803843
with self.db.session.begin():
804844
self._add_machines_to_db(vmss_name)
805845

@@ -986,7 +1026,7 @@ def _scale_machine_pool(self, tag, per_platform=False):
9861026
)
9871027

9881028
# We don't want to be stuck in this for longer than the timeout specified
989-
if timeit.default_timer() - start_time > AZURE_TIMEOUT:
1029+
if (timeit.default_timer() - start_time) > AZURE_TIMEOUT:
9901030
log.debug(f"Breaking out of the while loop within the scale down section for {vmss_name}.")
9911031
break
9921032
# Get the updated number of relevant machines required
@@ -1250,9 +1290,12 @@ def _thr_reimage_list_reader(self):
12501290
vms_currently_being_reimaged.remove(f"{vmss_to_reimage}_{instance_id}")
12511291
continue
12521292

1293+
reimaged = True
12531294
# We wait because we want the machine to be fresh before another task is assigned to it
12541295
while not async_reimage_some_machines.done():
12551296
if (timeit.default_timer() - start_time) > AZURE_TIMEOUT:
1297+
reimaged = False
1298+
12561299
log.debug(
12571300
f"Reimaging machines {instance_ids} in {vmss_to_reimage} took too long, deleting them from the DB and the VMSS."
12581301
)
@@ -1270,7 +1313,7 @@ def _thr_reimage_list_reader(self):
12701313
with current_operations_lock:
12711314
current_vmss_operations -= 1
12721315
timediff = timeit.default_timer() - start_time
1273-
log.debug(f"Reimaging instances {instance_ids} in {vmss_to_reimage} took {round(timediff)}s")
1316+
log.debug(f"{'S' if reimaged else 'Uns'}uccessfully reimaging instances {instance_ids} in {vmss_to_reimage} took {round(timediff)}s")
12741317
except Exception as e:
12751318
log.error(f"Exception occurred in the reimage thread: {e}. Trying again...")
12761319

@@ -1297,8 +1340,8 @@ def _thr_delete_list_reader(self):
12971340
for vmss_name, count in vmss_vm_delete_counts.items():
12981341
if count > max:
12991342
max = count
1300-
vmss_to_delete = vmss_name
1301-
vms_to_delete_from_same_vmss = [vm for vm in delete_vm_list if vm["vmss"] == vmss_to_delete]
1343+
vmss_to_delete_from = vmss_name
1344+
vms_to_delete_from_same_vmss = [vm for vm in delete_vm_list if vm["vmss"] == vmss_to_delete_from]
13021345

13031346
for vm in vms_to_delete_from_same_vmss:
13041347
delete_vm_list.remove(vm)
@@ -1309,7 +1352,7 @@ def _thr_delete_list_reader(self):
13091352
start_time = timeit.default_timer()
13101353
async_delete_some_machines = Azure._azure_api_call(
13111354
self.options.az.sandbox_resource_group,
1312-
vmss_to_delete,
1355+
vmss_to_delete_from,
13131356
models.VirtualMachineScaleSetVMInstanceIDs(instance_ids=instance_ids),
13141357
polling_interval=1,
13151358
operation=self.compute_client.virtual_machine_scale_sets.begin_delete_instances,
@@ -1320,24 +1363,30 @@ def _thr_delete_list_reader(self):
13201363
current_vmss_operations -= 1
13211364
with vms_currently_being_deleted_lock:
13221365
for instance_id in instance_ids:
1323-
vms_currently_being_deleted.remove(f"{vmss_to_delete}_{instance_id}")
1366+
vms_currently_being_deleted.remove(f"{vmss_to_delete_from}_{instance_id}")
13241367
continue
13251368

13261369
# We wait because we want the machine to be fresh before another task is assigned to it
13271370
while not async_delete_some_machines.done():
1371+
deleted = True
13281372
if (timeit.default_timer() - start_time) > AZURE_TIMEOUT:
1329-
log.debug(f"Deleting machines {instance_ids} in {vmss_to_delete} took too long.")
1373+
log.debug(f"Deleting machines {instance_ids} in {vmss_to_delete_from} took too long.")
1374+
deleted = False
13301375
break
13311376
time.sleep(2)
13321377

1378+
if self.initializing and deleted:
1379+
# All machines should have been removed from the db and the VMSS at this point.
1380+
# To force the VMSS to scale to initial_pool_size, set the size to zero here.
1381+
log.debug(f"Setting size to 0 for VMSS {vmss_to_delete_from} after successful deletion")
1382+
machine_pools[vmss_to_delete_from]["size"] = 0
1383+
13331384
with vms_currently_being_deleted_lock:
13341385
for instance_id in instance_ids:
1335-
vms_currently_being_deleted.remove(f"{vmss_to_delete}_{instance_id}")
1386+
vms_currently_being_deleted.remove(f"{vmss_to_delete_from}_{instance_id}")
13361387

13371388
with current_operations_lock:
13381389
current_vmss_operations -= 1
1339-
log.debug(
1340-
f"Deleting instances {instance_ids} in {vmss_to_delete} took {round(timeit.default_timer() - start_time)}s"
1341-
)
1390+
log.debug(f"{'S' if deleted else 'Uns'}uccessfully deleting instances {instance_ids} in {vmss_to_delete_from} took {round(timeit.default_timer() - start_time)}s")
13421391
except Exception as e:
13431392
log.error(f"Exception occurred in the delete thread: {e}. Trying again...")

0 commit comments

Comments
 (0)