99 --tuning_search_space <path_to_tuning_search_space_json>
1010"""
1111
12+ import datetime
1213import json
1314import os
1415import struct
16+ import subprocess
1517import time
1618
1719from absl import app
1820from absl import flags
1921from absl import logging
20- import datetime
21- import subprocess
2222
2323from algorithmic_efficiency import random_utils as prng
2424from algorithmic_efficiency .workloads .workloads import get_base_workload_name
2828 'docker_image_url' ,
2929 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev' ,
3030 'URL to docker image' )
# Percentage of the maximum step count to run; the help text states that it
# only takes effect together with --enable_step_budget.
# Adjacent string literals are concatenated verbatim, so the first piece must
# end with a space or the help text renders as "run for.Must set ...".
flags.DEFINE_integer(
    'run_percentage',
    100,
    'Percentage of max num steps to run for. '
    'Must set the flag enable_step_budget to True for this to take effect.')
# Name of the top-level sub directory inside the experiment dir.
flags.DEFINE_string(
    name='experiment_name',
    default='my_experiment',
    help='Name of top sub directory in experiment dir.')
9192 'String representing a comma separated list of workload names.'
9293 'If not None, only run this workload, else run all workloads in workload_metadata_path.'
9394)
# Optional path to an extra requirements.txt; unset (None) by default.
flags.DEFINE_string(
    name='additional_requirements_path',
    default=None,
    help='Path to requirements.txt if any.')
# Absolute step cap; per its help text it requires --enable_step_budget and
# overrides --run_percentage.
# The first literal ends with a space because adjacent string literals are
# joined verbatim; without it the help reads "enable_step_budget.This flag".
flags.DEFINE_integer(
    'max_steps',
    None,
    'Maximum number of steps to run. Must set flag enable_step_budget. '
    'This flag takes precedence over the run_percentage flag.')
# Opt-in switch: off by default, must be set explicitly for the step-budget
# flags to apply.
flags.DEFINE_bool(
    name='enable_step_budget',
    default=False,
    help='Flag that has to be explicitly set to override time budgets to step budget percentage.'
)

# Module-wide handle to the parsed flag values.
FLAGS = flags.FLAGS
@@ -125,31 +123,39 @@ def container_running():
125123 else :
126124 return True
127125
126+
def kill_containers():
  """Kill every container listed by a Docker client built from the environment."""
  for listed_container in docker.from_env().containers.list():
    listed_container.kill()
133132
133+
def gpu_is_active():
  """Report whether any GPU currently shows non-zero utilization.

  Runs `nvidia-smi`, asking for the per-GPU `utilization.gpu` value as bare
  CSV (no header, no units), and parses each output line as an integer.

  Returns:
    True if at least one line parses to a value greater than zero, else False.

  Raises:
    subprocess.CalledProcessError: if `nvidia-smi` exits with a non-zero status.
    ValueError: if an output line cannot be parsed as an integer.
  """
  query_cmd = [
      'nvidia-smi',
      '--query-gpu=utilization.gpu',
      '--format=csv,noheader,nounits',
  ]
  readings = subprocess.check_output(query_cmd).decode().splitlines()
  for reading in readings:
    if int(reading) > 0:
      return True
  return False
141+
138142
def wait_until_container_not_running(sleep_interval=5 * 60,
                                     gpu_inactivity_timeout=45 * 60):
  """Block until no container is running, killing containers on idle GPUs.

  Polls `container_running()` every `sleep_interval` seconds. On each poll the
  GPU utilization is checked via `gpu_is_active()`; if no GPU activity has been
  observed for more than `gpu_inactivity_timeout` seconds, all containers are
  killed with `kill_containers()`.

  Args:
    sleep_interval: Seconds to sleep between polls.
    gpu_inactivity_timeout: Seconds without observed GPU activity after which
      the containers are killed. Defaults to 45 minutes.
  """
  gpu_last_active = datetime.datetime.now().timestamp()

  while container_running():
    now = datetime.datetime.now().timestamp()
    if gpu_is_active():
      gpu_last_active = now
    elif now - gpu_last_active > gpu_inactivity_timeout:
      # kill_containers() takes no arguments, so the reason is logged here
      # rather than passed to it (passing it raised TypeError).
      logging.info(
          'Killing container: GPUs have been inactive > %d minutes...',
          gpu_inactivity_timeout // 60)
      kill_containers()
    time.sleep(sleep_interval)
152157
158+
153159def main (_ ):
154160 framework = FLAGS .framework
155161 experiment_name = FLAGS .experiment_name
@@ -196,9 +202,10 @@ def main(_):
196202 FLAGS .held_out_workloads_config_path )
197203 workloads = workloads + held_out_workloads
198204
199- # Filter workloads if explicit workloads specified
205+ # Filter workloads if explicit workloads specified
200206 if FLAGS .workloads is not None :
201- workloads = list (filter (lambda x : x in FLAGS .workloads .split (',' ), workloads ))
207+ workloads = list (
208+ filter (lambda x : x in FLAGS .workloads .split (',' ), workloads ))
202209 if len (workloads ) != len (FLAGS .workloads .split (',' )):
203210 unmatched_workloads = set (FLAGS .workloads .split (',' )) - set (workloads )
204211 raise ValueError (f'Invalid workload name { unmatched_workloads } ' )
@@ -230,7 +237,7 @@ def main(_):
230237 else :
231238 max_steps = FLAGS .max_steps
232239 max_steps_flag = f'-m { max_steps } '
233-
240+
234241 mount_repo_flag = ''
235242 if FLAGS .local :
236243 mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
@@ -291,4 +298,4 @@ def main(_):
291298
if __name__ == '__main__':
  # --workload_metadata_path is marked required before the app starts.
  flags.mark_flag_as_required('workload_metadata_path')
  app.run(main)
0 commit comments