99 --tuning_search_space <path_to_tuning_search_space_json>
1010"""
1111
12+ import datetime
1213import json
1314import os
1415import struct
16+ import subprocess
1517import time
1618
1719from absl import app
2628 'docker_image_url' ,
2729 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev' ,
2830 'URL to docker image' )
29- flags .DEFINE_integer ('run_percentage' ,
30- 100 ,
31- 'Percentage of max num steps to run for.' )
# Adjacent string literals are concatenated verbatim, so the first help
# fragment must end with a space to keep the two sentences apart.
flags.DEFINE_integer(
    'run_percentage',
    100,
    'Percentage of max num steps to run for. '
    'Must set the flag enable_step_budget to True for this to take effect.')
flags.DEFINE_string('experiment_name',
                    'my_experiment',
                    'Name of top sub directory in experiment dir.')
8387 'If your algorithm has a smaller per step time than our baselines '
8488 'you may want to increase the number of steps per workload.' )
# The flag is read as FLAGS.workloads in main(), so the registered name must be
# exactly 'workloads' (no surrounding whitespace). The help fragments are
# concatenated verbatim and therefore need an explicit separating space.
flags.DEFINE_string(
    'workloads',
    None,
    'String representing a comma separated list of workload names. '
    'If not None, only run these workloads, else run all workloads in '
    'workload_metadata_path.')
flags.DEFINE_string('additional_requirements_path',
                    None,
                    'Path to requirements.txt if any.')
# Adjacent string literals are concatenated verbatim, so the first help
# fragment must end with a space to keep the two sentences apart.
flags.DEFINE_integer(
    'max_steps',
    None,
    'Maximum number of steps to run. Must set flag enable_step_budget. '
    'This flag takes precedence over the run_percentage flag.')
flags.DEFINE_bool(
    'enable_step_budget',
    False,
    'Flag that has to be explicitly set to override time budgets to step '
    'budget percentage.')
90108
91109FLAGS = flags .FLAGS
92110
@@ -106,15 +124,40 @@ def container_running():
106124 return True
107125
108126
def kill_containers(message=None):
  """Kill every container returned by the Docker client's container listing.

  Args:
    message: Optional text printed before any container is killed, e.g. the
      reason for the shutdown. Nothing is printed when it is None. The
      parameter is optional so that both `kill_containers()` and
      `kill_containers('reason')` are valid calls.
  """
  if message is not None:
    print(message)
  docker_client = docker.from_env()
  # NOTE(review): the listing is not filtered, so containers that were not
  # started by this script are killed too -- confirm this is intended on
  # shared machines.
  for container in docker_client.containers.list():
    container.kill()
132+
133+
def gpu_is_active():
  """Return True if nvidia-smi reports non-zero utilization for any GPU.

  Runs `nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits`
  and inspects one reading per output line.

  Returns:
    True if at least one reading is greater than zero, or if a reading cannot
    be parsed as an integer (for example `[N/A]`): an unknown reading is not
    evidence of an idle GPU, and callers use this result to decide whether to
    kill running work. False if every reading is zero or there is no output.

  Raises:
    Whatever `subprocess.check_output` raises, e.g. when nvidia-smi is not
    installed or exits with a non-zero status.
  """
  output = subprocess.check_output([
      'nvidia-smi',
      '--query-gpu=utilization.gpu',
      '--format=csv,noheader,nounits',
  ])
  for line in output.decode().splitlines():
    reading = line.strip()
    if not reading:
      # Blank lines carry no reading; int('') would raise.
      continue
    try:
      utilization = int(reading)
    except ValueError:
      # Non-numeric placeholder: treat as active rather than crash the caller.
      return True
    if utilization > 0:
      return True
  return False
141+
142+
def wait_until_container_not_running(sleep_interval=5 * 60,
                                     gpu_idle_timeout=45 * 60):
  """Block until no container is running, killing containers on idle GPUs.

  Polls `container_running()` every `sleep_interval` seconds. On each poll the
  GPU utilization is sampled with `gpu_is_active()`; if no sample has shown
  activity for more than `gpu_idle_timeout` seconds, the containers are
  killed so that a hung run does not block the remaining work.

  Args:
    sleep_interval: Seconds to sleep between polls. GPU activity is only
      sampled once per poll, so idleness is detected at this granularity.
    gpu_idle_timeout: Seconds without observed GPU activity after which the
      containers are killed. Defaults to 45 minutes.
  """
  # Use a monotonic clock: wall-clock time can jump (NTP, DST) and would
  # distort the elapsed-time comparison.
  gpu_last_active = time.monotonic()

  while container_running():
    if gpu_is_active():
      gpu_last_active = time.monotonic()
    elif time.monotonic() - gpu_last_active > gpu_idle_timeout:
      print('Killing container: GPUs have been inactive > '
            f'{gpu_idle_timeout / 60:g} minutes...')
      # Called without arguments: the message is reported here.
      kill_containers()
    time.sleep(sleep_interval)
113157
114158
115159def main (_ ):
116160 framework = FLAGS .framework
117- run_fraction = FLAGS .run_percentage / 100.
118161 experiment_name = FLAGS .experiment_name
119162 docker_image_url = FLAGS .docker_image_url
120163 submission_path = FLAGS .submission_path
@@ -132,7 +175,13 @@ def main(_):
132175 study_end_index = FLAGS .study_end_index
133176 else :
134177 study_end_index = num_studies - 1
178+
179+ additional_requirements_path_flag = ''
180+ if FLAGS .additional_requirements_path :
181+ additional_requirements_path_flag = f'--additional_requirements_path { FLAGS .additional_requirements_path } '
182+
135183 submission_id = FLAGS .submission_id
184+
136185 rng_seed = FLAGS .seed
137186
138187 if not rng_seed :
@@ -144,17 +193,22 @@ def main(_):
144193 with open (FLAGS .workload_metadata_path ) as f :
145194 workload_metadata = json .load (f )
146195
196+ # Get list of all possible workloads
147197 workloads = [w for w in workload_metadata .keys ()]
148198
149- # Read held-out workloads
199+ # Read heldout workloads
150200 if FLAGS .held_out_workloads_config_path :
151201 held_out_workloads = read_held_out_workloads (
152202 FLAGS .held_out_workloads_config_path )
153203 workloads = workloads + held_out_workloads
154204
155- # Filter for single workload
156- if FLAGS .workload and (FLAGS .workload in workloads ):
157- workloads = [FLAGS .workload ]
205+ # Filter workloads if explicit workloads specified
206+ if FLAGS .workloads is not None :
207+ workloads = list (
208+ filter (lambda x : x in FLAGS .workloads .split (',' ), workloads ))
209+ if len (workloads ) != len (FLAGS .workloads .split (',' )):
210+ unmatched_workloads = set (FLAGS .workloads .split (',' )) - set (workloads )
211+ raise ValueError (f'Invalid workload name { unmatched_workloads } ' )
158212
159213 rng_subkeys = prng .split (rng_key , num_studies )
160214
@@ -174,14 +228,22 @@ def main(_):
174228 "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'" ) # clear caches
175229 print ('=' * 100 )
176230 dataset = workload_metadata [base_workload_name ]['dataset' ]
177- max_steps = int (workload_metadata [base_workload_name ]['max_steps' ] *
178- run_fraction )
231+ max_steps_flag = ''
232+ if FLAGS .enable_step_budget :
233+ run_fraction = FLAGS .run_percentage / 100.
234+ if FLAGS .max_steps is None :
235+ max_steps = int (workload_metadata [base_workload_name ]['max_steps' ] *
236+ run_fraction )
237+ else :
238+ max_steps = FLAGS .max_steps
239+ max_steps_flag = f'-m { max_steps } '
240+
179241 mount_repo_flag = ''
180242 if FLAGS .local :
181- mount_repo_flag = '-v $HOME /algorithmic-efficiency:/algorithmic-efficiency '
182- command = ('docker run -t -d -v $HOME /data/:/data/ '
183- '-v $HOME /experiment_runs/:/experiment_runs '
184- '-v $HOME /experiment_runs/logs:/logs '
243+ mount_repo_flag = '-v /home/kasimbeg /algorithmic-efficiency:/algorithmic-efficiency '
244+ command = ('docker run -t -d -v /home/kasimbeg /data/:/data/ '
245+ '-v /home/kasimbeg /experiment_runs/:/experiment_runs '
246+ '-v /home/kasimbeg /experiment_runs/logs:/logs '
185247 f'{ mount_repo_flag } '
186248 '--gpus all --ipc=host '
187249 f'{ docker_image_url } '
@@ -190,9 +252,10 @@ def main(_):
190252 f'-s { submission_path } '
191253 f'-w { workload } '
192254 f'-e { study_dir } '
193- f'-m { max_steps } '
255+ f'{ max_steps_flag } '
194256 f'--num_tuning_trials { num_tuning_trials } '
195257 f'--rng_seed { run_seed } '
258+ f'{ additional_requirements_path_flag } '
196259 '-c false '
197260 '-o true '
198261 '-i true ' )
@@ -235,4 +298,4 @@ def main(_):
235298
236299if __name__ == '__main__' :
237300 flags .mark_flag_as_required ('workload_metadata_path' )
238- app .run (main )
301+ app .run (main )
0 commit comments