Commit 32e2eec

Fixes: Reset scaling + output individual results + remove unnecessary argument
1 parent: 072d88d

3 files changed: +131, -40 lines

mlperf_logging/result_summarizer/compute_score/README.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ python3 -m mlperf_logging.result_summarizer.compute_score --benchmark BENCHMARK
 **BENCHMARK:** Name of the benchmark to compute the score such as rgat, llama31_8b, etc.
-**SYSTEM_NAME:** The name of the system, it can be set to None.
+**SYSTEM_NAME:** Optional system name.
 **BENCHMARK_FOLDER:** Folder containing all the results files of the benchmark.
 **USAGE:** Either "training" or "hpc",
 **RULESET:** Version of the rules that applies one of "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0".
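With the --benchmark flag removed in __main__.py below, the benchmark name is now detected from the result files themselves. A hypothetical invocation after this commit could look like the following; the folder path and system name are placeholders, and the --benchmark_folder, --usage, and --ruleset spellings are inferred from the args.* attribute names rather than shown in this diff:

    # Hypothetical example; the folder and system name are placeholders.
    python3 -m mlperf_logging.result_summarizer.compute_score \
        --benchmark_folder results/llama31_8b \
        --system my_system \
        --usage training \
        --ruleset 5.1.0 \
        --has_power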

mlperf_logging/result_summarizer/compute_score/__main__.py

Lines changed: 84 additions & 20 deletions
@@ -1,20 +1,19 @@
 from .. import result_summarizer
 from ...rcp_checker import rcp_checker
 from ...compliance_checker.mlp_compliance import usage_choices, rule_choices
+from ...compliance_checker.mlp_parser import parse_file
+from ...benchmark_meta import get_result_file_counts
 import argparse
+import glob
+import json
+import os


 def get_compute_args():
     parser = argparse.ArgumentParser(
         prog="mlperf_logging.result_summarizer.compute_score",
         description="Compute the score of a single benchmark",
     )
-    parser.add_argument(
-        "--benchmark",
-        type=str,
-        help="Benchmark to compute the score such as rgat, llama31_8b, etc.",
-        required=True,
-    )
     parser.add_argument("--system", type=str, help="System name", default=None)
     parser.add_argument(
         "--has_power", action="store_true", help="Compute power score as well"
@@ -50,15 +49,55 @@ def get_compute_args():
     return parser.parse_args()


-def print_benchmark_info(args):
+def print_benchmark_info(args, benchmark):
+    print("INFO -------------------------------------------------------")
     print(f"MLPerf {args.usage}")
     print(f"Folder: {args.benchmark_folder}")
     print(f"Version: {args.ruleset}")
     print(f"System: {args.system}")
-    print(f"Benchmark: {args.benchmark}")
+    print(f"Benchmark: {benchmark}")
+    print("-------------------------------------------------------------")
+
+
+def _reset_scaling(results_dir):
+    filepath = results_dir + "/scaling.json"
+    if os.path.exists(filepath):
+        os.remove(filepath)
+
+
+def _get_scaling_factor(results_dir):
+    scaling_factor = 1.0
+    scaling_file = results_dir + "/scaling.json"
+    if os.path.exists(scaling_file):
+        with open(scaling_file, "r") as f:
+            contents = json.load(f)
+        scaling_factor = contents["scaling_factor"]
+    return scaling_factor
+
+
+def _find_benchmark(result_file, ruleset):
+    loglines, _ = parse_file(result_file, ruleset)
+    benchmark = None
+    for logline in loglines:
+        if logline.key == "submission_benchmark":
+            benchmark = logline.value["value"]
+            break
+    if benchmark is None:
+        raise ValueError("Benchmark not specified in result file")
+    return benchmark


 args = get_compute_args()
+_reset_scaling(args.benchmark_folder)
+pattern = "{folder}/result_*.txt".format(folder=args.benchmark_folder)
+result_files = glob.glob(pattern, recursive=True)
+benchmark = _find_benchmark(result_files[0], args.ruleset)
+required_runs = get_result_file_counts(args.usage)[benchmark]
+if required_runs > len(result_files):
+    print(
+        f"WARNING: Not enough runs found for an official submission."
+        f" Found: {len(result_files)}, required: {required_runs}"
+    )

 if args.scale:
     rcp_checker.check_directory(
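The new _reset_scaling and _get_scaling_factor helpers above revolve around a small scaling.json file in the benchmark folder (presumably produced by the RCP checker when --scale / set_scaling=True is used). A minimal sketch of the expected file contents and the fallback behaviour, using a throwaway directory and a made-up scaling factor:

    import json
    import os
    import tempfile

    # Throwaway stand-in for the benchmark folder.
    folder = tempfile.mkdtemp()
    scaling_file = os.path.join(folder, "scaling.json")

    # With no scaling.json present, _get_scaling_factor falls back to 1.0
    # and _reset_scaling is a no-op.
    factor = 1.0
    if os.path.exists(scaling_file):
        with open(scaling_file) as f:
            factor = json.load(f)["scaling_factor"]
    print(factor)  # 1.0

    # Once a file with a "scaling_factor" key exists, that value is used instead.
    with open(scaling_file, "w") as f:
        json.dump({"scaling_factor": 1.25}, f)
    with open(scaling_file) as f:
        print(json.load(f)["scaling_factor"])  # 1.25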
@@ -73,29 +112,54 @@ def print_benchmark_info(args):
         set_scaling=True,
     )

+scaling_factor = _get_scaling_factor(args.benchmark_folder)
+
 if args.is_weak_scaling:
     scores, power_scores = result_summarizer._compute_weak_score_standalone(
-        args.benchmark,
+        benchmark,
         args.system,
         args.has_power,
         args.benchmark_folder,
         args.usage,
         args.ruleset,
     )
-    print_benchmark_info(args)
+    print_benchmark_info(args, benchmark)
     print(f"Scores: {scores}")
     if power_scores:
         print(f"Power Scores - Energy (kJ): {power_scores}")
 else:
-    score, power_score = result_summarizer._compute_strong_score_standalone(
-        args.benchmark,
-        args.system,
-        args.has_power,
-        args.benchmark_folder,
-        args.usage,
-        args.ruleset,
+    scores_track, power_scores_track, score, power_score = (
+        result_summarizer._compute_strong_score_standalone(
+            benchmark,
+            args.system,
+            args.has_power,
+            args.benchmark_folder,
+            args.usage,
+            args.ruleset,
+            return_full_scores=True,
+        )
     )
-    print_benchmark_info(args)
-    print(f"Score - Time to Train (minutes): {score}")
+    print_benchmark_info(args, benchmark)
+    mean_score = 0
+    for file, s in scores_track.items():
+        print(f"Score - Time to Train (minutes) for {file}: {s}")
+        mean_score += s
+    mean_score /= len(result_files)
+    mean_score *= scaling_factor
+    if required_runs > len(result_files):
+        print("WARNING: Olympic scoring skipped")
+        print(f"Final score - Time to Train (minutes): {mean_score}")
+    else:
+        print(f"Final score - Time to Train (minutes): {score}")
     if power_score:
-        print(f"Power Score - Energy (kJ): {power_score}")
+        mean_power = 0
+        for file, ps in power_scores_track.items():
+            print(f"Power Score - Energy (kJ) for {file}: {ps}")
+            mean_power += ps
+        mean_power /= len(result_files)
+        mean_power *= scaling_factor
+        if required_runs > len(result_files):
+            print("WARNING: Olympic scoring skipped")
+            print(f"Final score - Time to Train (minutes): {mean_power}")
+        else:
+            print(f"Power Score - Energy (kJ): {power_score}")

mlperf_logging/result_summarizer/result_summarizer.py

Lines changed: 46 additions & 19 deletions
@@ -324,58 +324,85 @@ def _get_scaling_factor(folder):
     return scaling_factor


-def _compute_strong_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc = {"submitter": None}):
-    pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder)
+def _compute_strong_score_standalone(
+    benchmark,
+    system,
+    has_power,
+    benchmark_folder,
+    usage,
+    ruleset,
+    desc={"submitter": None},
+    return_full_scores=False,
+):
+    pattern = "{folder}/result_*.txt".format(folder=benchmark_folder)
     result_files = glob.glob(pattern, recursive=True)
     scores = []
+    scores_track = {}
     power_scores = []
+    power_scores_track = {}
     dropped_scores = 0
     for result_file in result_files:
         try:
             loglines = _read_result_file(result_file, usage, ruleset)
             start, stop = _query_run_start_stop(loglines)
             time_to_train_ms = stop - start
             scores.append(time_to_train_ms / 60 / 1000)
+            scores_track[result_file] = scores[-1]
         except ValueError as e:
-            print('{} in {}'.format(e, result_file))
+            print("{} in {}".format(e, result_file))
             dropped_scores += 1
             continue
         if has_power:
-            power_scores.append(_compute_total_power(benchmark_folder, result_file, time_to_train_ms, ruleset))
-    max_dropped_scores = 4 if benchmark == 'unet3d' else 1
+            power_scores.append(
+                _compute_total_power(
+                    benchmark_folder, result_file, time_to_train_ms, ruleset
+                )
+            )
+            power_scores_track[result_file] = power_scores[-1]
+    max_dropped_scores = 4 if benchmark == "unet3d" else 1
     if dropped_scores > max_dropped_scores:
-        print('CRITICAL ERROR: Too many non-converging runs '
-              'for {} {}/{}'.format(desc['submitter'], system, benchmark))
-        print('** CRITICAL ERROR ** Results in the table for {} {}/{} are '
-              'NOT correct'.format(desc['submitter'], system, benchmark))
+        print(
+            "CRITICAL ERROR: Too many non-converging runs "
+            "for {} {}/{}".format(desc["submitter"], system, benchmark)
+        )
+        print(
+            "** CRITICAL ERROR ** Results in the table for {} {}/{} are "
+            "NOT correct".format(desc["submitter"], system, benchmark)
+        )
     elif dropped_scores >= 1:
-        print('NOTICE: Dropping non-converged run(s) for {} {}/{} using '
-              'olympic scoring.'.format(
-                  desc['submitter'],
-                  system,
-                  benchmark,
-              ))
-
+        print(
+            "NOTICE: Dropping non-converged run(s) for {} {}/{} using "
+            "olympic scoring.".format(
+                desc["submitter"],
+                system,
+                benchmark,
+            )
+        )
+
     if has_power:
         unsorted_scores = scores.copy()

     score = None
     scaling_factor = _get_scaling_factor(benchmark_folder)
     if dropped_scores <= max_dropped_scores:
         olympic_avg = _compute_olympic_average(
-            scores, dropped_scores, max_dropped_scores)
+            scores, dropped_scores, max_dropped_scores
+        )
         if olympic_avg is not None:
             score = olympic_avg
             score *= scaling_factor

     power_score = None
     if has_power and dropped_scores <= max_dropped_scores:
-        index = [i[0] for i in sorted(enumerate(unsorted_scores), key=lambda x:x[1])]
+        index = [i[0] for i in sorted(enumerate(unsorted_scores), key=lambda x: x[1])]
         olympic_avg = _index_olympic_average(
-            power_scores, index, dropped_scores, max_dropped_scores)
+            power_scores, index, dropped_scores, max_dropped_scores
+        )
         if olympic_avg is not None:
             power_score = olympic_avg
             power_score *= scaling_factor
+    if return_full_scores:
+        return scores_track, power_scores_track, score, power_score
     return score, power_score
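For reference, a sketch of the two call shapes of _compute_strong_score_standalone after this change; the benchmark name, system, folder, usage, and ruleset values below are placeholders:

    from mlperf_logging.result_summarizer import result_summarizer

    # Default behaviour is unchanged: a (score, power_score) pair.
    score, power_score = result_summarizer._compute_strong_score_standalone(
        "llama31_8b", "my_system", False, "results/llama31_8b", "training", "5.1.0"
    )

    # With return_full_scores=True, the per-file score dictionaries come back too,
    # which is what compute_score/__main__.py now uses to print individual runs.
    scores_track, power_scores_track, score, power_score = (
        result_summarizer._compute_strong_score_standalone(
            "llama31_8b",
            "my_system",
            False,
            "results/llama31_8b",
            "training",
            "5.1.0",
            return_full_scores=True,
        )
    )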
