Skip to content

Commit a43836c

Browse files
committed
reformat
1 parent 3c26723 commit a43836c

File tree

1 file changed

+36
-29
lines changed

1 file changed

+36
-29
lines changed

scoring/run_workloads.py

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,16 @@
99
--tuning_search_space <path_to_tuning_search_space_json>
1010
"""
1111

12+
import datetime
1213
import json
1314
import os
1415
import struct
16+
import subprocess
1517
import time
1618

1719
from absl import app
1820
from absl import flags
1921
from absl import logging
20-
import datetime
21-
import subprocess
2222

2323
from algorithmic_efficiency import random_utils as prng
2424
from algorithmic_efficiency.workloads.workloads import get_base_workload_name
@@ -28,10 +28,11 @@
2828
'docker_image_url',
2929
'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev',
3030
'URL to docker image')
31-
# Step-budget scaling: only honoured when --enable_step_budget is set (per the
# help text below). The help string is built from adjacent literals, so the
# first fragment must end with a space to keep the sentences separated.
flags.DEFINE_integer(
    'run_percentage',
    100,
    'Percentage of max num steps to run for. '
    'Must set the flag enable_step_budget to True for this to take effect.')
# Name of the top-level sub directory inside the experiment directory.
flags.DEFINE_string('experiment_name',
                    'my_experiment',
                    'Name of top sub directory in experiment dir.')
@@ -91,21 +92,18 @@
9192
'String representing a comma separated list of workload names.'
9293
'If not None, only run this workload, else run all workloads in workload_metadata_path.'
9394
)
94-
# Optional path to an extra requirements.txt; None means no extra requirements.
flags.DEFINE_string('additional_requirements_path',
                    None,
                    'Path to requirements.txt if any.')
# Absolute step cap. The help string is built from adjacent literals, so the
# first fragment must end with a space to keep the sentences separated.
flags.DEFINE_integer(
    'max_steps',
    None,
    'Maximum number of steps to run. Must set flag enable_step_budget. '
    'This flag takes precedence over the run_percentage flag.')
# Opt-in switch that the two step-budget flags (run_percentage, max_steps)
# state they depend on.
flags.DEFINE_bool(
    'enable_step_budget',
    False,
    'Flag that has to be explicitly set to override time budgets to step budget percentage.'
)
110108

111109
FLAGS = flags.FLAGS
@@ -125,31 +123,39 @@ def container_running():
125123
else:
126124
return True
127125

126+
128127
def kill_containers():
  """Kill every container returned by the Docker client's `containers.list()`.

  The client is built from the environment via `docker.from_env()`.
  """
  client = docker.from_env()
  for listed in client.containers.list():
    listed.kill()
133132

133+
134134
def gpu_is_active():
  """Return True if `nvidia-smi` reports utilization above 0 for any GPU.

  Runs `nvidia-smi` with a header-less, unit-less CSV query for
  `utilization.gpu` and parses each output line as an integer.

  Returns:
    True if at least one parsed utilization value is greater than zero,
    otherwise False.
  """
  query_cmd = [
      'nvidia-smi',
      '--query-gpu=utilization.gpu',
      '--format=csv,noheader,nounits',
  ]
  raw_output = subprocess.check_output(query_cmd)
  for utilization in raw_output.decode().splitlines():
    if int(utilization) > 0:
      return True
  return False
141+
138142

139143
def wait_until_container_not_running(sleep_interval=5 * 60):
  """Poll until `container_running()` is falsy, killing containers on GPU stall.

  On every poll the GPU utilization is sampled with `gpu_is_active()`. If no
  GPU activity has been observed for more than 45 minutes, the reason is
  logged and `kill_containers()` is called. Polling then continues until
  `container_running()` reports that nothing is running.

  Args:
    sleep_interval: Seconds to sleep between polls. Defaults to 5 minutes.
  """
  max_gpu_idle_seconds = 45 * 60
  gpu_last_active = datetime.datetime.now().timestamp()

  while container_running():
    if gpu_is_active():
      gpu_last_active = datetime.datetime.now().timestamp()
    idle_seconds = datetime.datetime.now().timestamp() - gpu_last_active
    if idle_seconds > max_gpu_idle_seconds:
      # kill_containers() accepts no arguments; passing the message to it
      # raised a TypeError, so the reason is logged here instead.
      logging.warning(
          'Killing container: GPUs have been inactive > 45 minutes...')
      kill_containers()
    time.sleep(sleep_interval)
152157

158+
153159
def main(_):
154160
framework = FLAGS.framework
155161
experiment_name = FLAGS.experiment_name
@@ -196,9 +202,10 @@ def main(_):
196202
FLAGS.held_out_workloads_config_path)
197203
workloads = workloads + held_out_workloads
198204

199-
# Filter workloads if explicit workloads specified
205+
# Filter workloads if explicit workloads specified
200206
if FLAGS.workloads is not None:
201-
workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads))
207+
workloads = list(
208+
filter(lambda x: x in FLAGS.workloads.split(','), workloads))
202209
if len(workloads) != len(FLAGS.workloads.split(',')):
203210
unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
204211
raise ValueError(f'Invalid workload name {unmatched_workloads}')
@@ -230,7 +237,7 @@ def main(_):
230237
else:
231238
max_steps = FLAGS.max_steps
232239
max_steps_flag = f'-m {max_steps}'
233-
240+
234241
mount_repo_flag = ''
235242
if FLAGS.local:
236243
mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
@@ -291,4 +298,4 @@ def main(_):
291298

292299
if __name__ == '__main__':
293300
flags.mark_flag_as_required('workload_metadata_path')
294-
app.run(main)
301+
app.run(main)

0 commit comments

Comments
 (0)