Commit 32e2eec

Fixes: Reset scaling + output individual results + remove unnecessary argument
1 parent: 072d88d

3 files changed: +131, -40 lines

mlperf_logging/result_summarizer/compute_score/README.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ python3 -m mlperf_logging.result_summarizer.compute_score --benchmark BENCHMARK
 **BENCHMARK:** Name of the benchmark to compute the score such as rgat, llama31_8b, etc.
-**SYSTEM_NAME:** The name of the system, it can be set to None.
+**SYSTEM_NAME:** Optional system name.
 **BENCHMARK_FOLDER:** Folder containing all the results files of the benchmark.
 **USAGE:** Either "training" or "hpc",
 **RULESET:** Version of the rules that applies one of "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0".
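With the --benchmark flag removed in __main__.py below, the benchmark name is now detected from the result files themselves. A hypothetical invocation after this commit could look like the following; the folder path and system name are placeholders, and the --benchmark_folder, --usage, and --ruleset spellings are inferred from the args.* attribute names rather than shown in this diff:

    # Hypothetical example; the folder and system name are placeholders.
    python3 -m mlperf_logging.result_summarizer.compute_score \
        --benchmark_folder results/llama31_8b \
        --system my_system \
        --usage training \
        --ruleset 5.1.0 \
        --has_power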

mlperf_logging/result_summarizer/compute_score/__main__.py

Lines changed: 84 additions & 20 deletions
@@ -1,20 +1,19 @@
 from .. import result_summarizer
 from ...rcp_checker import rcp_checker
 from ...compliance_checker.mlp_compliance import usage_choices, rule_choices
+from ...compliance_checker.mlp_parser import parse_file
+from ...benchmark_meta import get_result_file_counts
 import argparse
+import glob
+import json
+import os


 def get_compute_args():
     parser = argparse.ArgumentParser(
         prog="mlperf_logging.result_summarizer.compute_score",
         description="Compute the score of a single benchmark",
     )
-    parser.add_argument(
-        "--benchmark",
-        type=str,
-        help="Benchmark to compute the score such as rgat, llama31_8b, etc.",
-        required=True,
-    )
     parser.add_argument("--system", type=str, help="System name", default=None)
     parser.add_argument(
         "--has_power", action="store_true", help="Compute power score as well"
@@ -50,15 +49,55 @@ def get_compute_args():
     return parser.parse_args()


-def print_benchmark_info(args):
+def print_benchmark_info(args, benchmark):
+    print("INFO -------------------------------------------------------")
     print(f"MLPerf {args.usage}")
     print(f"Folder: {args.benchmark_folder}")
     print(f"Version: {args.ruleset}")
     print(f"System: {args.system}")
-    print(f"Benchmark: {args.benchmark}")
+    print(f"Benchmark: {benchmark}")
+    print("-------------------------------------------------------------")
+
+
+def _reset_scaling(results_dir):
+    filepath = results_dir + "/scaling.json"
+    if os.path.exists(filepath):
+        os.remove(filepath)
+
+
+def _get_scaling_factor(results_dir):
+    scaling_factor = 1.0
+    scaling_file = results_dir + "/scaling.json"
+    if os.path.exists(scaling_file):
+        with open(scaling_file, "r") as f:
+            contents = json.load(f)
+        scaling_factor = contents["scaling_factor"]
+    return scaling_factor
+
+
+def _find_benchmark(result_file, ruleset):
+    loglines, _ = parse_file(result_file, ruleset)
+    benchmark = None
+    for logline in loglines:
+        if logline.key == "submission_benchmark":
+            benchmark = logline.value["value"]
+            break
+    if benchmark is None:
+        raise ValueError("Benchmark not specified in result file")
+    return benchmark


 args = get_compute_args()
+_reset_scaling(args.benchmark_folder)
+pattern = "{folder}/result_*.txt".format(folder=args.benchmark_folder)
+result_files = glob.glob(pattern, recursive=True)
+benchmark = _find_benchmark(result_files[0], args.ruleset)
+required_runs = get_result_file_counts(args.usage)[benchmark]
+if required_runs > len(result_files):
+    print(
+        f"WARNING: Not enough runs found for an official submission."
+        f" Found: {len(result_files)}, required: {required_runs}"
+    )

 if args.scale:
     rcp_checker.check_directory(
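The new _reset_scaling and _get_scaling_factor helpers above revolve around a small scaling.json file in the benchmark folder (presumably produced by the RCP checker when --scale / set_scaling=True is used). A minimal sketch of the expected file contents and the fallback behaviour, using a throwaway directory and a made-up scaling factor:

    import json
    import os
    import tempfile

    # Throwaway stand-in for the benchmark folder.
    folder = tempfile.mkdtemp()
    scaling_file = os.path.join(folder, "scaling.json")

    # With no scaling.json present, _get_scaling_factor falls back to 1.0
    # and _reset_scaling is a no-op.
    factor = 1.0
    if os.path.exists(scaling_file):
        with open(scaling_file) as f:
            factor = json.load(f)["scaling_factor"]
    print(factor)  # 1.0

    # Once a file with a "scaling_factor" key exists, that value is used instead.
    with open(scaling_file, "w") as f:
        json.dump({"scaling_factor": 1.25}, f)
    with open(scaling_file) as f:
        print(json.load(f)["scaling_factor"])  # 1.25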
@@ -73,29 +112,54 @@ def print_benchmark_info(args):
         set_scaling=True,
     )

+scaling_factor = _get_scaling_factor(args.benchmark_folder)
+
 if args.is_weak_scaling:
     scores, power_scores = result_summarizer._compute_weak_score_standalone(
-        args.benchmark,
+        benchmark,
         args.system,
         args.has_power,
         args.benchmark_folder,
         args.usage,
         args.ruleset,
     )
-    print_benchmark_info(args)
+    print_benchmark_info(args, benchmark)
     print(f"Scores: {scores}")
     if power_scores:
         print(f"Power Scores - Energy (kJ): {power_scores}")
 else:
-    score, power_score = result_summarizer._compute_strong_score_standalone(
-        args.benchmark,
-        args.system,
-        args.has_power,
-        args.benchmark_folder,
-        args.usage,
-        args.ruleset,
+    scores_track, power_scores_track, score, power_score = (
+        result_summarizer._compute_strong_score_standalone(
+            benchmark,
+            args.system,
+            args.has_power,
+            args.benchmark_folder,
+            args.usage,
+            args.ruleset,
+            return_full_scores=True,
+        )
     )
-    print_benchmark_info(args)
-    print(f"Score - Time to Train (minutes): {score}")
+    print_benchmark_info(args, benchmark)
+    mean_score = 0
+    for file, s in scores_track.items():
+        print(f"Score - Time to Train (minutes) for {file}: {s}")
+        mean_score += s
+    mean_score /= len(result_files)
+    mean_score *= scaling_factor
+    if required_runs > len(result_files):
+        print("WARNING: Olympic scoring skipped")
+        print(f"Final score - Time to Train (minutes): {mean_score}")
+    else:
+        print(f"Final score - Time to Train (minutes): {score}")
     if power_score:
-        print(f"Power Score - Energy (kJ): {power_score}")
+        mean_power = 0
+        for file, ps in power_scores_track.items():
+            print(f"Power Score - Energy (kJ) for {file}: {ps}")
+            mean_power += ps
+        mean_power /= len(result_files)
+        mean_power *= scaling_factor
+        if required_runs > len(result_files):
+            print("WARNING: Olympic scoring skipped")
+            print(f"Final score - Time to Train (minutes): {mean_power}")
+        else:
+            print(f"Power Score - Energy (kJ): {power_score}")

mlperf_logging/result_summarizer/result_summarizer.py

Lines changed: 46 additions & 19 deletions
@@ -324,58 +324,85 @@ def _get_scaling_factor(folder):
     return scaling_factor


-def _compute_strong_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc = {"submitter": None}):
-    pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder)
+def _compute_strong_score_standalone(
+    benchmark,
+    system,
+    has_power,
+    benchmark_folder,
+    usage,
+    ruleset,
+    desc={"submitter": None},
+    return_full_scores=False,
+):
+    pattern = "{folder}/result_*.txt".format(folder=benchmark_folder)
     result_files = glob.glob(pattern, recursive=True)
     scores = []
+    scores_track = {}
     power_scores = []
+    power_scores_track = {}
     dropped_scores = 0
     for result_file in result_files:
         try:
             loglines = _read_result_file(result_file, usage, ruleset)
             start, stop = _query_run_start_stop(loglines)
             time_to_train_ms = stop - start
             scores.append(time_to_train_ms / 60 / 1000)
+            scores_track[result_file] = scores[-1]
         except ValueError as e:
-            print('{} in {}'.format(e, result_file))
+            print("{} in {}".format(e, result_file))
             dropped_scores += 1
             continue
         if has_power:
-            power_scores.append(_compute_total_power(benchmark_folder, result_file, time_to_train_ms, ruleset))
-    max_dropped_scores = 4 if benchmark == 'unet3d' else 1
+            power_scores.append(
+                _compute_total_power(
+                    benchmark_folder, result_file, time_to_train_ms, ruleset
+                )
+            )
+            power_scores_track[result_file] = power_scores[-1]
+    max_dropped_scores = 4 if benchmark == "unet3d" else 1
     if dropped_scores > max_dropped_scores:
-        print('CRITICAL ERROR: Too many non-converging runs '
-              'for {} {}/{}'.format(desc['submitter'], system, benchmark))
-        print('** CRITICAL ERROR ** Results in the table for {} {}/{} are '
-              'NOT correct'.format(desc['submitter'], system, benchmark))
+        print(
+            "CRITICAL ERROR: Too many non-converging runs "
+            "for {} {}/{}".format(desc["submitter"], system, benchmark)
+        )
+        print(
+            "** CRITICAL ERROR ** Results in the table for {} {}/{} are "
+            "NOT correct".format(desc["submitter"], system, benchmark)
+        )
     elif dropped_scores >= 1:
-        print('NOTICE: Dropping non-converged run(s) for {} {}/{} using '
-              'olympic scoring.'.format(
-                  desc['submitter'],
-                  system,
-                  benchmark,
-              ))
-
+        print(
+            "NOTICE: Dropping non-converged run(s) for {} {}/{} using "
+            "olympic scoring.".format(
+                desc["submitter"],
+                system,
+                benchmark,
+            )
+        )
+
     if has_power:
         unsorted_scores = scores.copy()

     score = None
     scaling_factor = _get_scaling_factor(benchmark_folder)
     if dropped_scores <= max_dropped_scores:
         olympic_avg = _compute_olympic_average(
-            scores, dropped_scores, max_dropped_scores)
+            scores, dropped_scores, max_dropped_scores
+        )
         if olympic_avg is not None:
             score = olympic_avg
             score *= scaling_factor

     power_score = None
     if has_power and dropped_scores <= max_dropped_scores:
-        index = [i[0] for i in sorted(enumerate(unsorted_scores), key=lambda x:x[1])]
+        index = [i[0] for i in sorted(enumerate(unsorted_scores), key=lambda x: x[1])]
         olympic_avg = _index_olympic_average(
-            power_scores, index, dropped_scores, max_dropped_scores)
+            power_scores, index, dropped_scores, max_dropped_scores
+        )
         if olympic_avg is not None:
             power_score = olympic_avg
             power_score *= scaling_factor
+    if return_full_scores:
+        return scores_track, power_scores_track, score, power_score
     return score, power_score
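For reference, a sketch of the two call shapes of _compute_strong_score_standalone after this change; the benchmark name, system, folder, usage, and ruleset values below are placeholders:

    from mlperf_logging.result_summarizer import result_summarizer

    # Default behaviour is unchanged: a (score, power_score) pair.
    score, power_score = result_summarizer._compute_strong_score_standalone(
        "llama31_8b", "my_system", False, "results/llama31_8b", "training", "5.1.0"
    )

    # With return_full_scores=True, the per-file score dictionaries come back too,
    # which is what compute_score/__main__.py now uses to print individual runs.
    scores_track, power_scores_track, score, power_score = (
        result_summarizer._compute_strong_score_standalone(
            "llama31_8b",
            "my_system",
            False,
            "results/llama31_8b",
            "training",
            "5.1.0",
            return_full_scores=True,
        )
    )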
