
Commit e6ba2a9

Merge pull request #398 from mlcommons/add_initial_5.0_files

Add initial 5.0.0 files and configurations

2 parents eb9e1a3 + c16c224

35 files changed: +1582 / -34 lines

mlperf_logging/benchmark_meta.py

Lines changed: 11 additions & 1 deletion

@@ -10,12 +10,14 @@
     'minigo': 10,
     'resnet': 5,
     'ssd': 5,
+    'retinanet': 5,
     'stable_diffusion': 10,
     'transformer': 10,
     'ncf': 10,
     'rnnt': 10,
     'unet3d': 40,
-    'gnn' : 10,
+    'gnn' : 10,
+    'rgat': 10,
     'llama2_70b_lora': 10,
 },

@@ -131,6 +133,14 @@
         'stable_diffusion',
         'llama2_70b_lora',
         'gnn'
+    ],
+    '5.0': [
+        'bert',
+        'dlrm_dcnv2',
+        'retinanet',
+        'stable_diffusion',
+        'llama2_70b_lora',
+        'rgat'
     ]
 },
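
For orientation, here is a minimal sketch of how the values added above could be consumed. The dictionary and helper names are hypothetical, not the module's actual API; only the run counts and the new '5.0' benchmark list come from the diff itself:

    # Hypothetical mirror of the data touched by this diff; names are illustrative only.
    REQUIRED_RESULT_COUNT = {
        'retinanet': 5,          # added in this commit
        'rgat': 10,              # added in this commit
        'llama2_70b_lora': 10,
    }

    BENCHMARKS_BY_VERSION = {
        '5.0': ['bert', 'dlrm_dcnv2', 'retinanet', 'stable_diffusion',
                'llama2_70b_lora', 'rgat'],   # the new 5.0 list from this diff
    }

    def required_runs(benchmark, version='5.0'):
        """Return how many result files a benchmark needs, if it is in the given suite."""
        if benchmark not in BENCHMARKS_BY_VERSION[version]:
            raise ValueError(f'{benchmark} is not part of the {version} training suite')
        return REQUIRED_RESULT_COUNT[benchmark]

    print(required_runs('rgat'))   # -> 10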

mlperf_logging/compliance_checker/README.md

Lines changed: 17 additions & 19 deletions

@@ -10,7 +10,7 @@ To check a log file for compliance:
 
     python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME
 
-By default, 4.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`.
+By default, 5.0.0 training edition rules are used and the default config is set to `5.0.0/common.yaml`.
 This config will check all common keys and enqueue benchmark specific config to be checked as well.
 Old training editions, still supported are 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
 

@@ -22,23 +22,21 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
 
 ### Existing config files for training submissions
 
-    4.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
-    4.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
-    4.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
-    4.1.0/closed_ssd.yaml - Per-benchmark rules, closed submissions.
-    4.1.0/closed_bert.yaml
-    4.1.0/closed_dlrm_dcnv2.yaml
-    4.1.0/closed_gpt3.yaml
-    4.1.0/closed_gnn.yaml
-    4.1.0/closed_llama2_70b_lora.yaml
-    4.1.0/closed_stable_diffusion.yaml
-    4.1.0/open_ssd.yaml - Per-benchmark rules, open submissions.
-    4.1.0/open_bert.yaml
-    4.1.0/open_dlrm_dcnv2.yaml
-    4.1.0/open_gpt3.yaml
-    4.1.0/open_gnn.yaml
-    4.1.0/open_llama2_70b_lora.yaml
-    4.1.0/open_stable_diffusion.yaml
+    5.0.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
+    5.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
+    5.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
+    5.0.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions.
+    5.0.0/closed_bert.yaml
+    5.0.0/closed_dlrm_dcnv2.yaml
+    5.0.0/closed_rgat.yaml
+    5.0.0/closed_llama2_70b_lora.yaml
+    5.0.0/closed_stable_diffusion.yaml
+    5.0.0/open_retinanet.yaml - Per-benchmark rules, open submissions.
+    5.0.0/open_bert.yaml
+    5.0.0/open_dlrm_dcnv2.yaml
+    5.0.0/open_rgat.yaml
+    5.0.0/open_llama2_70b_lora.yaml
+    5.0.0/open_stable_diffusion.yaml
 
 ### Existing config files for HPC submissions
 

@@ -173,7 +171,7 @@ Tested and confirmed working using the following software versions:
 - Python 2.7.12 + PyYAML 3.11
 - Python 3.6.8 + PyYAML 5.1
 - Python 2.9.2 + PyYAML 5.3.1
-- Python 3.9.10 + PyYAML 5.4.1
+- Python 3.9.10 + PyYAML 5.5.0
 
 ### How to install PyYaML
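
As a hedged usage example of the command documented above, driven from Python: the result-file path below is a placeholder, and only the module name and the --usage/--ruleset flags come from the README itself.

    import subprocess
    import sys

    # Check a (placeholder) training log against the new default 5.0.0 ruleset.
    result = subprocess.run(
        [sys.executable, "-m", "mlperf_logging.compliance_checker",
         "--usage", "training",
         "--ruleset", "5.0.0",
         "result_0.txt"],          # hypothetical path to an MLPerf training log
    )
    print("checker exit code:", result.returncode)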

mlperf_logging/compliance_checker/mlp_compliance.py

Lines changed: 1 addition & 1 deletion

@@ -315,7 +315,7 @@ def get_parser():
     parser.add_argument('--usage', type=str, default='training',
                         choices=usage_choices(),
                         help='what WG do the benchmarks come from')
-    parser.add_argument('--ruleset', type=str, default='4.1.0',
+    parser.add_argument('--ruleset', type=str, default='5.0.0',
                         choices=rule_choices(),
                         help='what version of rules to check the log against')
     parser.add_argument('--config', type=str,

mlperf_logging/compliance_checker/mlp_parser/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -8,6 +8,7 @@
 from .ruleset_310 import parse_file as parse_file_310
 from .ruleset_400 import parse_file as parse_file_400
 from .ruleset_410 import parse_file as parse_file_410
+from .ruleset_500 import parse_file as parse_file_500
 
 
 def parse_file(filename, ruleset='0.6.0'):

@@ -31,5 +32,7 @@ def parse_file(filename, ruleset='0.6.0'):
         return parse_file_400(filename)
     elif ruleset == '4.1.0':
         return parse_file_410(filename)
+    elif ruleset == '5.0.0':
+        return parse_file_500(filename)
     else:
         raise Exception(f'Ruleset "{ruleset}" is not supported')
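
A short example of the dispatch added here, assuming an MLPerf training log sits at a placeholder path:

    from mlperf_logging.compliance_checker.mlp_parser import parse_file

    # 'result_0.txt' is a placeholder; parse_file routes '5.0.0' to the new ruleset_500 parser.
    loglines, errors = parse_file('result_0.txt', ruleset='5.0.0')
    print(f'parsed {len(loglines)} log lines, {len(errors)} lines failed to parse')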

mlperf_logging/compliance_checker/mlp_parser/ruleset_500.py

Lines changed: 105 additions & 0 deletions

@@ -0,0 +1,105 @@
+'''
+Parses a text MLPerf log into a structured format.
+'''
+
+from __future__ import print_function
+
+import collections
+import json
+import re
+import sys
+from dataclasses import dataclass
+
+from io import open
+
+@dataclass
+class LogLine:
+    """Class for keeping track of an item in inventory."""
+    full_string: str
+    timestamp: float
+    key: str
+    value: str
+    lineno: int
+
+TOKEN = ':::MLLOG '
+
+
+def parse_line(line):
+    if not line.startswith(TOKEN):
+        return None
+
+    return json.loads(line[len(TOKEN):])
+
+
+def string_to_logline(lineno, string):
+    ''' Returns a LogLine or raises a ValueError '''
+    m = parse_line(string)
+
+    if m is None:
+        raise ValueError('does not match regex')
+
+    args = []
+    args.append(string)  # full string
+
+    ts = float(m['time_ms'])  # may raise error, e.g. "1.2.3"
+    # TODO check for weird values
+    args.append(ts)
+
+    args.append(m['key'])  # key
+
+    j = { 'value': m['value'], 'metadata': m['metadata'] }
+    args.append(j)
+
+    args.append(lineno)
+    return LogLine(*args)
+
+
+def parse_file(filename):
+    ''' Reads a file by name and returns list of loglines and list of errors'''
+    with open(filename, encoding='latin-1') as f:
+        return parse_generator(f)
+
+
+def strip_and_dedup(gen):
+    lines = []
+    for l in gen:
+        if TOKEN not in l:
+            continue
+        lines.append(re.sub(".*"+TOKEN, TOKEN, l))
+    return lines
+
+
+
+def parse_generator(gen):
+    ''' Reads a generator of lines and returns (loglines, errors)
+    The list of errors are any parsing issues as a tuple (str_line, error_msg)
+    '''
+    loglines = []
+    failed = []
+    for lineno, line in enumerate(strip_and_dedup(gen)):
+        line = line.strip()
+        try:
+            ll = string_to_logline(lineno, line)
+            loglines.append(ll)
+        except ValueError as e:
+            failed.append((line, str(e)))
+    return loglines, failed
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print('usage: mlp_parser.py FILENAME')
+        print('       tests parsing on the file.')
+        sys.exit(1)
+
+    filename = sys.argv[1]
+    lines, errors = parse_file(filename)
+
+    print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))
+
+    if len(errors) > 0:
+        print('Lines which failed to parse:')
+        for line, error in errors:
+            print('  Following line failed: {}'.format(error))
+            print(line)
+
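
As a quick illustration (not part of the commit), the new parser can be exercised on an in-memory line. The import path assumes the file above is mlp_parser/ruleset_500.py, as the __init__.py change indicates, and the JSON payload is a made-up but structurally valid MLLOG record:

    from mlperf_logging.compliance_checker.mlp_parser.ruleset_500 import parse_generator

    # The parser only needs the time_ms, key, value and metadata fields of each record.
    sample = (':::MLLOG {"time_ms": 1730000000000, "key": "global_batch_size", '
              '"value": 8192, "metadata": {"file": "run.py", "lineno": 42}}\n')

    loglines, errors = parse_generator([sample, 'unrelated stdout noise is skipped\n'])
    assert not errors
    # LogLine.value holds the {'value': ..., 'metadata': ...} dict built by string_to_logline.
    print(loglines[0].key, loglines[0].value)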

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    POST:  >
+        s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_epsilon
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_learning_rate_training_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_learning_rate_warmup_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  num_warmup_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  start_warmup_step
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_beta_1
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_beta_2
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_weight_decay_rate
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0"
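
These rules read like the closed-division BERT config (LAMB hyperparameters, a 0.720 eval_accuracy target). Purely as a sketch of the rule semantics — assuming, as the v[...] and s[...] references suggest, that the checker evaluates CHECK strings with v bound to the parsed value/metadata dict of a log line — the accuracy rule behaves like this:

    # Illustration only; the real evaluation lives in mlp_compliance.py.
    # 'v' mimics the dict the parser builds for one eval_accuracy log line.
    v = {'value': 0.7203, 'metadata': {'epoch_num': 3.0}}

    check = "'epoch_num' in v['metadata']"
    at_least_one_check = "(v['value'] >= 0.720) and v['value'] < 1.0"

    print(eval(check))                # True: the metadata names the epoch
    print(eval(at_least_one_check))   # True: 0.7203 falls inside [0.720, 1.0)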

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+
+- KEY:
+    NAME:  submission_benchmark
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora'] "
+    POST:  " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) "
+
+- KEY:
+    NAME:  gradient_accumulation_steps
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0 "

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_name
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 'adagrad' "
+
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_adagrad_learning_rate_decay
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_weight_decay
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_adagrad_initial_accumulator_value
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_adagrad_epsilon
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 1e-8 "
+
+- KEY:
+    NAME:  opt_learning_rate_warmup_steps
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_learning_rate_decay_start_step
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_learning_rate_decay_steps
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0"
+
+- KEY:
+    NAME:  eval_samples
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 89137319 "

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    POST:  >
+        s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+
+
+- KEY:
+    NAME:  opt_learning_rate_training_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_gradient_clip_norm
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_adamw_weight_decay
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  gradient_accumulation_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  lora_alpha
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  lora_rank
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 16"
+
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'samples_count' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0"
