Commit 5190494

Add support for v4.1 (#381)
1 parent 7f581e3 commit 5190494

10 files changed: +877 -9 lines changed

mlperf_logging/compliance_checker/mlp_compliance.py

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ def get_parser():
     parser.add_argument('--usage', type=str, default='training',
                         choices=usage_choices(),
                         help='what WG do the benchmarks come from')
-    parser.add_argument('--ruleset', type=str, default='3.1.0',
+    parser.add_argument('--ruleset', type=str, default='4.1.0',
                         choices=rule_choices(),
                         help='what version of rules to check the log against')
     parser.add_argument('--config', type=str,
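
With this change the compliance checker defaults to the 4.1.0 rules when no --ruleset is passed. A minimal sketch of how the new default could be confirmed, assuming the package imports cleanly and that rule_choices() in this version includes '4.1.0' (nothing here parses an actual log):

# Sketch only: check the defaults exposed by get_parser().
from mlperf_logging.compliance_checker import mlp_compliance

parser = mlp_compliance.get_parser()
assert parser.get_default('usage') == 'training'
assert parser.get_default('ruleset') == '4.1.0'   # was '3.1.0' before this commit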

mlperf_logging/package_checker/package_checker.py

Lines changed: 3 additions & 3 deletions
@@ -175,13 +175,13 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
         logging.error(" %d files do not comply, directory cannot be accepted", errors_found)

     # Check if each run use unique seeds.
-    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'} and division == 'closed':
+    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0'} and division == 'closed':
         if not seed_checker.check_seeds(result_files, source_files):
             too_many_errors = True
             logging.error('Seed checker failed')

     # Run RCP checker for >= 1.0.0
-    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'} and division == 'closed' and benchmark != 'minigo':
+    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0'} and division == 'closed' and benchmark != 'minigo':
         # Now go again through result files to do RCP checks
         rcp_bypass = (global_rcp_bypass or system_rcp_bypass or result_rcp_bypass)
         rcp_pass, rcp_msg, _ = rcp_checker.check_directory(

@@ -235,7 +235,7 @@ def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rc
         ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, etc.
     """
     too_many_errors = False
-    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'}:
+    if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0'}:
         logging.info(' Checking System Description Files')
         system_description_pass = check_systems(folder, usage, ruleset)
         too_many_errors = too_many_errors or not system_description_pass
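
These gates are plain set-membership tests, so a 4.1.0 closed-division package is now routed through the seed checker and the RCP checker, while open-division packages are still skipped and minigo remains exempt from RCP checks. An illustrative restatement of that gating logic (not the checker's own code):

# Illustrative only: restates the version/division gates from the diff above.
CHECKED_RULESETS = {'1.0.0', '1.1.0', '2.0.0', '2.1.0',
                    '3.0.0', '3.1.0', '4.0.0', '4.1.0'}

def needs_seed_check(ruleset: str, division: str) -> bool:
    return ruleset in CHECKED_RULESETS and division == 'closed'

def needs_rcp_check(ruleset: str, division: str, benchmark: str) -> bool:
    # Same gate as the seed check, with minigo additionally excluded.
    return needs_seed_check(ruleset, division) and benchmark != 'minigo'

assert needs_seed_check('4.1.0', 'closed')
assert needs_rcp_check('4.1.0', 'closed', 'bert')
assert not needs_rcp_check('4.1.0', 'closed', 'minigo')
assert not needs_rcp_check('4.1.0', 'open', 'bert')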

mlperf_logging/rcp_checker/rcp_checker.py

Lines changed: 3 additions & 3 deletions
@@ -161,8 +161,8 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples):
 class RCP_Checker:

     def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None):
-        if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0"}:
-            raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0 and 4.0.0')
+        if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0"}:
+            raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0 and 4.1.0')
         self.usage = usage
         self.ruleset = ruleset
         self.benchmark = benchmark
@@ -530,7 +530,7 @@ def get_parser():
     parser.add_argument('--rcp_usage', type=str, default='training',
                         choices=['training', 'hpc'],
                         help='what WG does the benchmark come from to check the log against')
-    parser.add_argument('--rcp_version', type=str, default='4.0.0',
+    parser.add_argument('--rcp_version', type=str, default='4.1.0',
                         help='what version of rules to check the log against')
     parser.add_argument('--verbose', action='store_true')
     parser.add_argument('--bert_train_samples', action='store_true',
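
As with the compliance checker, the standalone RCP checker now defaults to 4.1.0 and accepts it in RCP_Checker's version guard. A small sketch of the new behaviour, assuming the module imports cleanly; the unsupported-version path still raises before any RCP reference files are read:

# Sketch only: exercise the updated default and the version guard.
from mlperf_logging.rcp_checker import rcp_checker

parser = rcp_checker.get_parser()
assert parser.get_default('rcp_version') == '4.1.0'   # was '4.0.0' before this commit

try:
    # '0.7.0' is outside the supported set, so the constructor raises.
    rcp_checker.RCP_Checker('training', '0.7.0', 'bert', verbose=False)
except Exception as err:
    print(err)   # lists the supported rulesets, now including 4.1.0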
Lines changed: 303 additions & 0 deletions
@@ -0,0 +1,303 @@
{

"bert_ref_256":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "Prior to 1.0 submission",
    "Platform": "TPU-v4-16 / TF1, TF version ~2.4",
    "BS": 256,
    "Hyperparams": {
        "opt_base_learning_rate": 0.00035,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 13700,
        "num_warmup_steps": 0,
        "start_warmup_step": 0,
        "opt_lamb_beta_1": 0.9,
        "opt_lamb_beta_2": 0.999,
        "opt_lamb_weight_decay_rate": 0.01,
        "gradient_accumulation_steps": 1
    },
    "Epochs to converge": [
        2834944, 2508800, 2709504, 2609152, 2383360, 2308096, 2910208, 2333184, 2283008, 2935296,
        2483712, 2558976, 2709504, 2232832, 2333184, 2533888, 2709504, 2257920, 2609152, 2809856]
},

"bert_ref_448":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "Prior to 2.1 submission, with Habana's HP set",
    "Platform": "TPU-v4-32 / TF1, TF version ~2.10",
    "BS": 448,
    "Hyperparams": {
        "opt_base_learning_rate": 0.000425,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 6700,
        "num_warmup_steps": 0,
        "start_warmup_step": 0,
        "opt_lamb_beta_1": 0.9,
        "opt_lamb_beta_2": 0.999,
        "opt_lamb_weight_decay_rate": 0.01,
        "gradient_accumulation_steps": 1
    },
    "Epochs to converge": [
        2132480, 2333184, 2408448, 2483712, 2684416, 2107392, 2157568, 2709504, 2533888, 2584064,
        1981952, 2182656, 2408448, 2433536, 2333184, 2533888, 2458624, 2558976, 2584064, 2358272,
        2358272, 2358272, 2759680]
},

"bert_ref_1536":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "At 1.0 submission",
    "Platform": "TPU-v4-128 / TF1, TF version ~2.4",
    "BS": 1536,
    "Hyperparams": {
        "opt_base_learning_rate": 0.002,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 2254,
        "num_warmup_steps": 0,
        "start_warmup_step": 0,
        "opt_lamb_beta_1": 0.66,
        "opt_lamb_beta_2": 0.996,
        "opt_lamb_weight_decay_rate": 0.01,
        "gradient_accumulation_steps": 1
    },
    "Epochs to converge": [
        2836240, 2801664, 2801664, 2727936, 2801664, 2875392, 2899968, 2727936, 2777088, 2875392,
        2777088, 2801664, 2678784, 2801664, 2703360, 2629632, 2727936, 2703360, 2654208, 2949120]
},

"bert_ref_4096":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "Prior to 1.1 submission",
    "Platform": "TPU-v4-128 / TF1, TF version ~2.4",
    "BS": 4096,
    "Hyperparams": {
        "opt_base_learning_rate": 0.0024,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 855,
        "num_warmup_steps": 0,
        "start_warmup_step": 0,
        "opt_lamb_beta_1": 0.66,
        "opt_lamb_beta_2": 0.998,
        "opt_lamb_weight_decay_rate": 0.01,
        "gradient_accumulation_steps": 16
    },
    "Epochs to converge": [
        2801664, 3022848, 2801664, 3022848, 3047424, 2727936, 2973696, 2703360, 2924544, 2629632,
        2678784, 2850816, 2777088, 2826240, 2801664, 2850816, 2924544, 2924544, 2727936, 2850816]
},

"bert_ref_3072":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "Prior to 1.0 submission",
    "Platform": "TPU-v4-128 / TF1, TF version ~2.4",
    "BS": 3072,
    "Hyperparams": {
        "opt_base_learning_rate": 0.002,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 1141,
        "num_warmup_steps": 100,
        "start_warmup_step": 0,
        "opt_lamb_beta_1": 0.66,
        "opt_lamb_beta_2": 0.998,
        "opt_lamb_weight_decay_rate": 0.01,
        "gradient_accumulation_steps": 96
    },
    "Epochs to converge": [
        2703360, 2482176, 3072000, 2654208, 2580480, 2727936, 2605056, 2801664, 2777088, 2580480,
        2875392, 2826240, 2973696, 2850816, 2678784, 2919120, 3121152, 2605056, 2678784, 2850816]
},

"bert_ref_4608":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "Prior to 2.0 submission",
    "Platform": "TPU-v4-16 / TF1, TF version ~2.8",
    "BS": 4608,
    "Hyperparams": {
        "opt_base_learning_rate": 0.0035,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 700,
        "num_warmup_steps": 0,
        "start_warmup_step": 0,
        "opt_lamb_beta_1": 0.62,
        "opt_lamb_beta_2": 0.9,
        "opt_lamb_weight_decay_rate": 0.01,
        "gradient_accumulation_steps": 144
    },
    "Epochs to converge": [
        2626560, 2833920, 2787840, 2949120, 2880000, 2810880, 2880000, 3041280, 2787840, 2833920,
        2741760, 2810880, 2649600, 2718720, 2488320, 2603520, 2833920, 2787840, 2810880, 3018240]
},

"bert_ref_6144":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "At 1.0 submission",
    "Platform": "TPU-v4-128 / TF1, TF version ~2.4",
    "BS": 6144,
    "Hyperparams": {
        "opt_base_learning_rate": 0.0029293,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 700,
        "num_warmup_steps": 0,
        "start_warmup_step": -700,
        "opt_lamb_beta_1": 0.7206,
        "opt_lamb_beta_2": 0.78921,
        "opt_lamb_weight_decay_rate": 0.001,
        "gradient_accumulation_steps": 24
    },
    "Epochs to converge": [
        3366912, 3244032, 3219456, 3686400, 3317760, 3293184, 3416064, 3317760, 3391488, 2998272,
        3317760, 3072000, 3416064, 3293184, 3391488, 3514368, 3194880, 3465216, 3244032, 3268608]
},

"bert_ref_6912":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "At 1.0 submission",
    "Platform": "TPU-v4-128 / TF1, TF version ~2.4",
    "BS": 6912,
    "Hyperparams": {
        "opt_base_learning_rate": 0.0029293,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 700,
        "num_warmup_steps": 0,
        "start_warmup_step": -700,
        "opt_lamb_beta_1": 0.7206,
        "opt_lamb_beta_2": 0.78921,
        "opt_lamb_weight_decay_rate": 0.001,
        "gradient_accumulation_steps": 27
    },
    "Epochs to converge": [
        3621888, 3677184, 3400704, 3594240, 3483648, 3732480, 3677184, 3797776, 3621888, 3760128,
        3649536, 3483648, 3566592, 3649536, 3621888, 3483648, 3290112, 3704832, 3594240, 3511296]
},

"bert_ref_8192":
{
    "Benchmark": "bert",
    "Creator": "Google",
    "When": "Prior to 1.0 submission",
    "Platform": "TPU-v4-128 / TF1, TF version ~2.4",
    "BS": 8192,
    "Hyperparams": {
        "opt_base_learning_rate": 0.00288293,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 600,
        "num_warmup_steps": 287,
        "start_warmup_step": -76,
        "opt_lamb_beta_1": 0.88,
        "opt_lamb_beta_2": 0.88,
        "opt_lamb_weight_decay_rate": 0.0166629,
        "gradient_accumulation_steps": 16
    },
    "Epochs to converge": [
        4251648, 4153344, 4055040, 4177920, 4177920, 4079616, 4276224, 4128768, 4177920, 4153344,
        4177920, 4079616, 4300800, 4153344, 4276224, 4423680, 4276224, 4104192, 4251648, 4153344]
},

"bert_ref_8704":
{
    "Benchmark": "bert",
    "Creator": "NVIDIA",
    "When": "At 1.1 submission",
    "Platform": "TBD",
    "BS": 8704,
    "Hyperparams": {
        "opt_base_learning_rate": 0.002971656225,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 600,
        "num_warmup_steps": 287,
        "start_warmup_step": -76,
        "opt_lamb_beta_1": 0.88,
        "opt_lamb_beta_2": 0.88,
        "opt_lamb_weight_decay_rate": 0.0166629,
        "gradient_accumulation_steps": 34
    },
    "Epochs to converge": [
        4343040, 4143360, 4143360, 4442880, 4392960, 4243200, 4193280, 4542720, 4492800, 4243200,
        4243200, 4392960, 4243200, 4193280, 4093440, 4392960, 4093440, 4243200, 4093440, 4392960]
},

"bert_ref_12288":
{
    "Benchmark": "bert",
    "Creator": "NVIDIA",
    "When": "At 1.1 submission",
    "Platform": "TBD",
    "BS": 12288,
    "Hyperparams": {
        "opt_base_learning_rate": 0.0031,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 500,
        "num_warmup_steps": 300,
        "start_warmup_step": -100,
        "opt_lamb_beta_1": 0.80,
        "opt_lamb_beta_2": 0.925,
        "opt_lamb_weight_decay_rate": 0.0166629,
        "gradient_accumulation_steps": 32
    },
    "Epochs to converge": [
        4542720, 4392960, 4642560, 4542720, 4542720, 4492800, 4343040, 4343040, 4442880, 4442880,
        4442880, 4442880, 4442880, 4692480, 4492800, 4442880, 4442880, 4442880, 4492800, 4343040]
},

"bert_ref_13056":
{
    "Benchmark": "bert",
    "Creator": "NVIDIA",
    "When": "At 1.1 submission",
    "Platform": "TBD",
    "BS": 13056,
    "Hyperparams": {
        "opt_base_learning_rate": 0.00319540686,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 500,
        "num_warmup_steps": 300,
        "start_warmup_step": -100,
        "opt_lamb_beta_1": 0.80,
        "opt_lamb_beta_2": 0.925,
        "opt_lamb_weight_decay_rate": 0.0166629,
        "gradient_accumulation_steps": 34
    },
    "Epochs to converge": [
        4442880, 4592640, 4642560, 4842240, 4742400, 4592640, 4642560, 4692480, 4942080, 4542720,
        4592640, 4093440, 4442880, 4792320, 4642560, 4592640, 4592640, 4892160, 4742400, 4592640]
},

"bert_ref_16384":
{
    "Benchmark": "bert",
    "Creator": "NVIDIA",
    "When": "At 2.0 submission",
    "Platform": "TPU-v3-128",
    "BS": 16384,
    "Hyperparams": {
        "opt_base_learning_rate": 0.0033,
        "opt_epsilon": 1e-6,
        "opt_learning_rate_training_steps": 600,
        "num_warmup_steps": 290,
        "start_warmup_step": -100,
        "opt_lamb_beta_1": 0.75,
        "opt_lamb_beta_2": 0.9,
        "opt_lamb_weight_decay_rate": 0.0166629,
        "gradient_accumulation_steps": 32
    },
    "Epochs to converge": [
        5619712, 5770240, 5720064, 5419008, 5519360, 5569536, 5218304, 5469184, 5419008, 5218304,
        5669888, 5669888, 5519360, 5569536, 5368832, 5469184, 5569536, 5469184, 5368832, 5469184]
}
}
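
Each record pairs a reference batch size and hyperparameter set with a list of convergence samples; for BERT, "Epochs to converge" holds trained samples rather than epochs. A hedged sketch of how one record could be summarised, roughly mirroring the mean/standard-deviation statistics the RCP checker derives from these lists; the path 'rcps_bert.json' is a placeholder, since the new file's name is not visible in this extract:

import json
import statistics

# Placeholder path -- the added file's name is not shown in this diff extract.
with open('rcps_bert.json') as f:
    rcps = json.load(f)

ref = rcps['bert_ref_256']
samples = ref['Epochs to converge']          # trained samples to convergence
print(ref['BS'], len(samples),
      statistics.mean(samples), statistics.stdev(samples))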
