
Commit 9b5f1b4

Add support for compliance for 4.1 (#382)
Some changes slipped through and were not updated in #381. This PR fixes that.
1 parent 5190494 commit 9b5f1b4

26 files changed, +828 -30 lines

mlperf_logging/benchmark_meta.py

Lines changed: 10 additions & 1 deletion
@@ -122,7 +122,16 @@
             'llama2_70b_lora',
             'stable_diffusion',
             'gnn'
-        ]
+        ],
+        '4.1': [
+            'bert',
+            'dlrm_dcnv2',
+            'gpt3',
+            'ssd',
+            'stable_diffusion',
+            'llama2_70b_lora',
+            'gnn'
+        ]
     },
 
     'hpc': {
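
For orientation, a minimal sketch of how a caller might read the new '4.1' entry added above. The dict and function names here are hypothetical placeholders for illustration; only the '4.1' list itself comes from the hunk.

    # Hypothetical names; benchmark_meta.py defines the real mapping.
    TRAINING_BENCHMARKS_BY_RULESET = {
        '4.1': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'ssd',
            'stable_diffusion',
            'llama2_70b_lora',
            'gnn',
        ],
    }

    def benchmarks_for(ruleset):
        """Return the allowed training benchmarks for a ruleset, e.g. '4.1'."""
        return TRAINING_BENCHMARKS_BY_RULESET[ruleset]

    print(benchmarks_for('4.1'))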

mlperf_logging/compliance_checker/README.md

Lines changed: 19 additions & 23 deletions
@@ -10,9 +10,9 @@ To check a log file for compliance:
 
     python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME
 
-By default, 3.1.0 training edition rules are used and the default config is set to `3.1.0/common.yaml`.
+By default, 4.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`.
 This config will check all common keys and enqueue benchmark specific config to be checked as well.
-Old training editions, still supported are 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
+Older training editions still supported: 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0.
 
 To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.

@@ -22,27 +22,23 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
 
 ### Existing config files for training submissions
 
-    3.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
-    3.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
-    3.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
-    3.1.0/closed_resnet.yaml - Per-benchmark rules, closed submissions.
-    3.1.0/closed_ssd.yaml
-    3.1.0/closed_maskrcnn.yaml
-    3.1.0/closed_rnnt.yaml
-    3.1.0/closed_unet3d.yaml
-    3.1.0/closed_bert.yaml
-    3.1.0/closed_dlrm_dcnv2.yaml
-    3.1.0/closed_gpt3.yaml
-    3.1.0/closed_stable_diffusion.yaml
-    3.1.0/open_resnet.yaml - Per-benchmark rules, closed submissions.
-    3.1.0/open_ssd.yaml
-    3.1.0/open_maskrcnn.yaml
-    3.1.0/open_rnnt.yaml
-    3.1.0/open_unet3d.yaml
-    3.1.0/open_bert.yaml
-    3.1.0/open_dlrm_dcnv2.yaml
-    3.1.0/open_gpt3.yaml
-    3.1.0/open_stable_diffusion.yaml
+    4.1.0/common.yaml - currently the default config file, checks common fields compliance and enqueues the benchmark-specific config file
+    4.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
+    4.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
+    4.1.0/closed_ssd.yaml - Per-benchmark rules, closed submissions.
+    4.1.0/closed_bert.yaml
+    4.1.0/closed_dlrm_dcnv2.yaml
+    4.1.0/closed_gpt3.yaml
+    4.1.0/closed_gnn.yaml
+    4.1.0/closed_llama2_70b_lora.yaml
+    4.1.0/closed_stable_diffusion.yaml
+    4.1.0/open_ssd.yaml - Per-benchmark rules, open submissions.
+    4.1.0/open_bert.yaml
+    4.1.0/open_dlrm_dcnv2.yaml
+    4.1.0/open_gpt3.yaml
+    4.1.0/open_gnn.yaml
+    4.1.0/open_llama2_70b_lora.yaml
+    4.1.0/open_stable_diffusion.yaml
 
 ### Existing config files for HPC submissions
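
As a concrete example of the usage documented above, a small sketch that shells out to the checker with the new 4.1.0 ruleset. The log path is a placeholder; the flags come straight from the usage line.

    import subprocess
    import sys

    # 'result_0.txt' is a placeholder; point it at a real MLPerf training log.
    cmd = [
        sys.executable, '-m', 'mlperf_logging.compliance_checker',
        '--usage', 'training',
        '--ruleset', '4.1.0',
        'result_0.txt',
    ]
    # check=True raises CalledProcessError if the checker exits with a non-zero status.
    subprocess.run(cmd, check=True)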

mlperf_logging/compliance_checker/mlp_parser/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -7,6 +7,7 @@
 from .ruleset_300 import parse_file as parse_file_300
 from .ruleset_310 import parse_file as parse_file_310
 from .ruleset_400 import parse_file as parse_file_400
+from .ruleset_410 import parse_file as parse_file_410
 
 
 def parse_file(filename, ruleset='0.6.0'):
@@ -28,5 +29,7 @@ def parse_file(filename, ruleset='0.6.0'):
         return parse_file_310(filename)
     elif ruleset == '4.0.0':
         return parse_file_400(filename)
+    elif ruleset == '4.1.0':
+        return parse_file_410(filename)
     else:
-        raise Exception(f'Ruleset "{ruleset}" is not supported')
+        raise Exception(f'Ruleset "{ruleset}" is not supported')
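
For reference, a minimal sketch of calling the dispatcher with the new ruleset; the log path is a placeholder, and the (loglines, errors) return shape matches parse_generator in the 4.1.0 parser below.

    from mlperf_logging.compliance_checker.mlp_parser import parse_file

    # 'result_0.txt' is a placeholder path for an MLPerf training log.
    loglines, errors = parse_file('result_0.txt', ruleset='4.1.0')

    print('{} log lines parsed, {} parse errors'.format(len(loglines), len(errors)))
    for line, msg in errors:
        print('failed: {}: {}'.format(msg, line))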
mlperf_logging/compliance_checker/mlp_parser/ruleset_410.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
'''
Parses a text MLPerf log into a structured format.
'''

from __future__ import print_function

import collections
import json
import re
import sys
from dataclasses import dataclass

from io import open

@dataclass
class LogLine:
    """A single parsed MLPerf log line."""
    full_string: str
    timestamp: float
    key: str
    value: str
    lineno: int

TOKEN = ':::MLLOG '


def parse_line(line):
    if not line.startswith(TOKEN):
        return None

    return json.loads(line[len(TOKEN):])


def string_to_logline(lineno, string):
    ''' Returns a LogLine or raises a ValueError '''
    m = parse_line(string)

    if m is None:
        raise ValueError('does not match regex')

    args = []
    args.append(string)  # full string

    ts = float(m['time_ms'])  # may raise error, e.g. "1.2.3"
    # TODO check for weird values
    args.append(ts)

    args.append(m['key'])  # key

    j = {'value': m['value'], 'metadata': m['metadata']}
    args.append(j)

    args.append(lineno)
    return LogLine(*args)


def parse_file(filename):
    ''' Reads a file by name and returns list of loglines and list of errors'''
    with open(filename, encoding='latin-1') as f:
        return parse_generator(f)


def strip_and_dedup(gen):
    lines = []
    for l in gen:
        if TOKEN not in l:
            continue
        lines.append(re.sub(".*" + TOKEN, TOKEN, l))
    return lines



def parse_generator(gen):
    ''' Reads a generator of lines and returns (loglines, errors)
    The list of errors are any parsing issues as a tuple (str_line, error_msg)
    '''
    loglines = []
    failed = []
    for lineno, line in enumerate(strip_and_dedup(gen)):
        line = line.strip()
        try:
            ll = string_to_logline(lineno, line)
            loglines.append(ll)
        except ValueError as e:
            failed.append((line, str(e)))
    return loglines, failed


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('usage: mlp_parser.py FILENAME')
        print('       tests parsing on the file.')
        sys.exit(1)

    filename = sys.argv[1]
    lines, errors = parse_file(filename)

    print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))

    if len(errors) > 0:
        print('Lines which failed to parse:')
        for line, error in errors:
            print('  Following line failed: {}'.format(error))
            print(line)
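
Assuming the new module lands at mlp_parser/ruleset_410.py (as the import added to __init__.py indicates), a small sketch exercising it on a fabricated log line in the `:::MLLOG <json>` shape that parse_line expects:

    from mlperf_logging.compliance_checker.mlp_parser.ruleset_410 import parse_generator

    # Fabricated example lines; only lines containing the :::MLLOG token are parsed.
    sample = [
        ':::MLLOG {"time_ms": 1700000000000, "key": "eval_accuracy", '
        '"value": 0.725, "metadata": {"epoch_num": 3}}',
        'unrelated stdout noise that strip_and_dedup drops',
    ]

    loglines, errors = parse_generator(sample)
    print(loglines[0].key)    # eval_accuracy
    print(loglines[0].value)  # {'value': 0.725, 'metadata': {'epoch_num': 3}}
    print(len(errors))        # 0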
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
- KEY:
    NAME: global_batch_size
    REQ: EXACTLY_ONE
    POST: >
      s['global_batch_size'] = v['value']

- KEY:
    NAME: opt_base_learning_rate
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_lamb_epsilon
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_learning_rate_training_steps
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_learning_rate_warmup_steps
    REQ: EXACTLY_ONE

- KEY:
    NAME: num_warmup_steps
    REQ: EXACTLY_ONE

- KEY:
    NAME: start_warmup_step
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_lamb_beta_1
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_lamb_beta_2
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_lamb_weight_decay_rate
    REQ: EXACTLY_ONE

- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0"
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@

- KEY:
    NAME: submission_benchmark
    REQ: EXACTLY_ONE
    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn','llama2_70b_lora'] "
    POST: " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
    NAME: gradient_accumulation_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] > 0 "
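
Loosely speaking, each CHECK (and POST) string above is a Python expression evaluated with the parsed log value bound to `v` and a shared state dict bound to `s`. The snippet below is an illustrative approximation of that evaluation, not the checker's actual implementation.

    # Illustrative approximation only.
    check_expr = ("v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', "
                  "'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn', 'llama2_70b_lora']")

    v = {'value': 'bert', 'metadata': {}}  # shape produced by the log parser
    s = {}                                 # state that POST expressions may update

    print(eval(check_expr, {'v': v, 's': s}))  # True -> the submission_benchmark rule passes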
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
- KEY:
    NAME: global_batch_size
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_name
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 'adagrad' "

- KEY:
    NAME: opt_base_learning_rate
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_adagrad_learning_rate_decay
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_weight_decay
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_adagrad_initial_accumulator_value
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_adagrad_epsilon
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 1e-8 "

- KEY:
    NAME: opt_learning_rate_warmup_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_learning_rate_decay_start_step
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_learning_rate_decay_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0"

- KEY:
    NAME: eval_samples
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 89137319 "
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
- KEY:
    NAME: global_batch_size
    REQ: EXACTLY_ONE
    CHECK: " v['value'] > 0"

- KEY:
    NAME: opt_name
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 'adam' "

- KEY:
    NAME: opt_base_learning_rate
    REQ: EXACTLY_ONE
    CHECK: " v['value'] >= 0.0"

- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0"
