Skip to content

Commit 170c9d2

Browse files
Add power submission checker (#402)
* Add power submission checker * Fix minor issues --------- Co-authored-by: Hiwot Tadese Kassa <[email protected]>
1 parent 814feda commit 170c9d2

File tree

2 files changed

+110
-5
lines changed

2 files changed

+110
-5
lines changed

mlperf_logging/package_checker/package_checker.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from ..compliance_checker.mlp_compliance import usage_choices, rule_choices
1515
from ..rcp_checker import rcp_checker
1616
from .seed_checker import find_source_files_under, SeedChecker
17+
from .power_checker import PowerChecker
1718
from ..system_desc_checker import system_desc_checker
1819

1920
from ..benchmark_meta import get_allowed_benchmarks, get_result_file_counts
@@ -47,6 +48,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
4748
global_seed_checker_bypass = seed_checker_bypass
4849

4950
seed_checker = SeedChecker(ruleset)
51+
power_checker = PowerChecker(ruleset)
5052
too_many_errors = False
5153
result_folder = os.path.join(folder, 'results')
5254
for system_folder in _get_sub_folders(result_folder):
@@ -146,6 +148,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
146148
logging.warning('Unknown files in result directory: %s', benchmark_folder)
147149

148150
errors_found = 0
151+
error_set = set({})
149152
result_files.sort()
150153
for result_file in result_files:
151154
result_basename = os.path.basename(result_file)
@@ -173,13 +176,19 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
173176
)
174177
if not valid:
175178
errors_found += 1
176-
if errors_found == 1 and benchmark != 'unet3d':
179+
error_set.add(result_name)
180+
power_folder = os.path.join(benchmark_folder, "power")
181+
if os.path.exists(power_folder):
182+
power_valid, power_errors = power_checker.check_power(power_folder, result_files)
183+
error_set = error_set | power_errors
184+
error_list = list(error_set)
185+
if len(error_list) == 1 and benchmark != 'unet3d':
177186
logging.warning(" 1 file does not comply, accepting this under olympic scoring")
178-
elif errors_found > 0 and errors_found <= 4 and benchmark == 'unet3d':
179-
logging.warning(" %d files do not comply, accepting this under olympic scoring", errors_found)
180-
elif errors_found > 0:
187+
elif len(error_list) > 0 and len(error_list) <= 4 and benchmark == 'unet3d':
188+
logging.warning(" %d files do not comply, accepting this under olympic scoring", len(error_list))
189+
elif len(error_list) > 0:
181190
too_many_errors = True
182-
logging.error(" %d files do not comply, directory cannot be accepted", errors_found)
191+
logging.error(" %d files do not comply, directory cannot be accepted", len(error_list))
183192

184193
# Check if each run use unique seeds.
185194
if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0'} and division == 'closed':
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import warnings
2+
import os
3+
import logging
4+
5+
6+
class PowerChecker:
    """Check for errors in the MLPerf Power submissions.

    Current checks are:

    1. Check there is a power folder for each result
    2. Check the power file names
    3. Check there are the same number of nodes and switches in each run
       (No file is missing)

    Unsatisfying any of the above checks results in failure.
    """

    def __init__(self, ruleset):
        """Store the ruleset; none of the current checks reads it."""
        self._ruleset = ruleset

    def check_range(self, l, n):
        """Return True if every index in *l* is unique and within [0, n-1].

        Args:
            l: iterable of integer indices parsed from power file names.
            n: number of files the indices were parsed from.

        Returns:
            False as soon as an index is negative, greater than ``n - 1``
            or repeated; True otherwise (including for an empty *l*).
        """
        seen = set()
        for e in l:
            if e < 0 or e > (n - 1) or e in seen:
                return False
            # Remember the index so that a later duplicate is rejected.
            seen.add(e)

        return True

    def check_equals(self, l):
        """Find the entries of *l* that differ from its most common value.

        Args:
            l: list of hashable values (file counts, one per run).

        Returns:
            A tuple ``(all_equal, errors)`` where ``errors`` lists the
            positions in *l* whose value differs from the most frequent
            one. On a tie the value seen first wins. An empty *l* yields
            ``(True, [])``.
        """
        if not l:
            # Nothing to compare; max() on an empty mapping would raise.
            return True, []

        counter = {}
        for e in l:
            counter[e] = counter.get(e, 0) + 1
        max_equals = max(counter, key=counter.get)
        errors = [i for i, e in enumerate(l) if e != max_equals]

        return len(errors) == 0, errors

    @staticmethod
    def _parse_indices(file_names):
        """Return the trailing ``_<int>`` index of each name, or None if any is malformed."""
        indices = []
        for file_name in file_names:
            stem = os.path.splitext(os.path.basename(file_name))[0]
            try:
                indices.append(int(stem.split('_')[-1]))
            except ValueError:
                return None
        return indices

    def check_power(self, power_folder, result_files):
        """Check the power logs stored under *power_folder*.

        For every entry of *result_files* a sub-folder of *power_folder*
        named after the result file (without extension) is expected. Files
        in it starting with ``node`` or ``sw`` must end in ``_<index>`` with
        indices forming 0..count-1, and every run must have the same number
        of node files and of sw files as the majority of runs.

        Args:
            power_folder: path of the ``power`` directory of one benchmark.
            result_files: paths of the result files of that benchmark.

        Returns:
            A tuple ``(valid, errors_set)``; ``errors_set`` holds the names
            of the results that failed any check and ``valid`` is True only
            when that set is empty.
        """
        # The two path components above the power folder are used in log
        # messages only; pad so that short paths do not break unpacking.
        parts = os.path.normpath(power_folder).split(os.sep)
        system, benchmark = (["", ""] + parts[:-1])[-2:]
        errors_set = set()

        # checked_names[i] is the result whose counts are node_lens[i] and
        # sw_lens[i]; results without a power folder are not in these lists.
        checked_names = []
        node_lens = []
        sw_lens = []
        for result_file in result_files:
            result_name, _ = os.path.splitext(os.path.basename(result_file))
            power_result_folder = os.path.join(power_folder, result_name)
            if not os.path.isdir(power_result_folder):
                logging.warning("Package does not contain power result for %s/%s: %s", system, benchmark, result_name)
                errors_set.add(result_name)
                continue

            power_files = os.listdir(power_result_folder)
            node_results = [file for file in power_files if file.startswith("node")]
            sw_results = [file for file in power_files if file.startswith("sw")]

            if len(power_files) > len(node_results) + len(sw_results):
                logging.warning("Detected %d total files in directory %s, but some do not conform", len(power_files), power_result_folder)

            for prefix, files in (("node", node_results), ("sw", sw_results)):
                indices = self._parse_indices(files)
                # A name whose suffix is not an integer is a naming error,
                # not a reason to abort the whole package check.
                if indices is None or not self.check_range(indices, len(files)):
                    logging.warning("Bad naming of %s power files in directory %s, expected to be %s_x with x in range [0, %d]", prefix, power_result_folder, prefix, len(files) - 1)
                    errors_set.add(result_name)

            checked_names.append(result_name)
            node_lens.append(len(node_results))
            sw_lens.append(len(sw_results))

        for label, lens in (("nodes", node_lens), ("sw", sw_lens)):
            _, errors = self.check_equals(lens)
            for error in errors:
                error_result = checked_names[error]
                logging.warning("Inconsistent number of %s in directory %s/%s", label, power_folder, error_result)
                logging.warning("Directory %s/%s does not comply", power_folder, error_result)
                errors_set.add(error_result)

        return not errors_set, errors_set

0 commit comments

Comments
 (0)