Commit c96ba62

Merge pull request #862 from mlcommons/dev
Dev -> main
2 parents 42d9ae1 + 7638497 commit c96ba62

49 files changed: +650 additions, -259 deletions

README.md

Lines changed: 6 additions & 0 deletions

@@ -22,6 +22,12 @@
 
 ---
 
+Unlike benchmarks that focus on model architecture or hardware, the AlgoPerf benchmark isolates the training algorithm itself, measuring how quickly it can achieve target performance levels on a fixed set of representative deep learning tasks. These tasks span various domains, including image classification, speech recognition, machine translation, and more, all running on standardized hardware (8x NVIDIA V100 GPUs). The benchmark includes 8 fully specified base workloads. In addition, it defines "randomized" workloads, variations of the fixed workloads designed to discourage overfitting. These randomized workloads were used for scoring the AlgoPerf competition but will not be used for future scoring.
+
+Submissions are evaluated based on their "time-to-result", i.e., the wall-clock time it takes to reach predefined validation and test set performance targets on each workload. Submissions are scored under one of two tuning rulesets. The [external tuning ruleset](https://github.com/mlcommons/algorithmic-efficiency/blob/main/docs/DOCUMENTATION.md#external-tuning-ruleset) allows a limited amount of hyperparameter tuning (20 quasirandom trials) for each workload. The [self-tuning ruleset](https://github.com/mlcommons/algorithmic-efficiency/blob/main/docs/DOCUMENTATION.md#self-tuning-ruleset) allows no external tuning, so any tuning is done "on the clock". For each submission, a single overall benchmark score is computed by integrating its "performance profile" across all fixed workloads. The performance profile captures the submission's training time relative to the best submission on each workload, so the score of each submission is a function of the other submissions in the pool. The higher the benchmark score, the better the submission's overall performance.
+
+---
+
 > This is the repository for the *AlgoPerf: Training Algorithms benchmark* measuring neural network training speedups due to algorithmic improvements.
 > It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/).
 > This repository holds the benchmark code, the benchmark's [**technical documentation**](/docs/DOCUMENTATION.md) and [**getting started guides**](/docs/GETTING_STARTED.md). For a detailed description of the benchmark design, see our [**introductory paper**](https://arxiv.org/abs/2306.07179), for the results of the inaugural competition see our [**results paper**](https://openreview.net/forum?id=CtM5xjRSfm).
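To make the scoring rule described above concrete, here is a small, self-contained sketch of computing a performance profile and integrating it. The submission names and per-workload times are made up; the tau range of 1 to 4 mirrors the min_tau/max_tau values used by the scoring script added later in this commit.

# Toy sketch of "integrating the performance profile" (illustrative numbers only).
import numpy as np

# Hypothetical time-to-target per workload, in seconds; np.inf = target not reached.
times = {
    'submission_a': np.array([100., 200., 150., 400.]),
    'submission_b': np.array([120., 180., 300., np.inf]),
}

best = np.min(np.vstack(list(times.values())), axis=0)  # fastest submission per workload
taus = np.linspace(1.0, 4.0, 100)

for name, t in times.items():
  ratios = t / best  # training time relative to the best submission on each workload
  profile = np.array([np.mean(ratios <= tau) for tau in taus])  # performance profile rho(tau)
  score = np.trapz(profile, x=taus) / (taus[-1] - taus[0])  # normalized integral = benchmark score
  print(name, round(float(score), 3))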

docker/build_docker_images.sh

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ do
 done
 
 # Artifact repostiory
-ARTIFACT_REPO="europe-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"
+ARTIFACT_REPO="europe-west-4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"
 
 if [[ -z ${GIT_BRANCH+x} ]]
 then

docker/scripts/startup.sh

Lines changed: 0 additions & 1 deletion

@@ -293,7 +293,6 @@ if [[ ! -z ${SUBMISSION_PATH+x} ]]; then
 --workload=${WORKLOAD} \
 --submission_path=${SUBMISSION_PATH} \
 --data_dir=${DATA_DIR} \
---num_tuning_trials=1 \
 --experiment_dir=${EXPERIMENT_DIR} \
 --experiment_name=${EXPERIMENT_NAME} \
 --overwrite=${OVERWRITE} \
Lines changed: 231 additions & 0 deletions

@@ -0,0 +1,231 @@
"""This script can
1. Summarize the raw submission times for each workload run in a set of studies and trials.
2. Produce the performance profiles and scores of a group of submissions.
Note that for performance profiles and final scores are computed w.r.t. a group of submissions.
If you only have logs for one submission you may group it with some reference submission
to compare the performance.

Example usage:
python3 score_submissions.py \
--submission_directory $HOME/algorithmic-efficiency/prize_qualification_baselines/logs \
--strict True
--compute_performance_profiles
"""

import operator
import os
import pickle

from absl import app
from absl import flags
from absl import logging
import numpy as np
import pandas as pd
import performance_profile
import scoring_utils
from tabulate import tabulate

flags.DEFINE_string(
    'submission_directory',
    None,
    'Path to submission directory containing experiment directories.')
flags.DEFINE_string(
    'output_dir',
    'scoring_results',
    'Path to save performance profile artifacts, submission_summaries and results files.'
)
flags.DEFINE_boolean('compute_performance_profiles',
                     False,
                     'Whether or not to compute the performance profiles.')
flags.DEFINE_boolean(
    'strict',
    False,
    'Whether to enforce scoring criteria on variant performance and on'
    '5-trial median performance. Note that during official scoring this '
    'flag will be set to True.')
flags.DEFINE_boolean(
    'self_tuning_ruleset',
    False,
    'Whether to score on self-tuning ruleset or externally tuned ruleset')
flags.DEFINE_string(
    'save_results_to_filename',
    None,
    'Filename to save the processed results that are fed into the performance profile functions.'
)
flags.DEFINE_string(
    'load_results_from_filename',
    None,
    'Filename to load processed results from that are fed into performance profile functions'
)
flags.DEFINE_string(
    'exclude_submissions',
    '',
    'Optional comma seperated list of names of submissions to exclude from scoring.'
)
FLAGS = flags.FLAGS


def get_summary_df(workload, workload_df, include_test_split=False):
  validation_metric, validation_target = scoring_utils.get_workload_metrics_and_targets(workload, split='validation')

  is_minimized = performance_profile.check_if_minimized(validation_metric)
  target_op = operator.le if is_minimized else operator.ge
  best_op = min if is_minimized else max
  idx_op = np.argmin if is_minimized else np.argmax

  summary_df = pd.DataFrame()
  summary_df['workload'] = workload_df['workload']
  summary_df['trial'] = workload_df['trial'].apply(lambda x: x[0])
  summary_df['val target metric name'] = validation_metric
  summary_df['val target metric value'] = validation_target

  summary_df['val target reached'] = workload_df[validation_metric].apply(
      lambda x: target_op(x, validation_target)).apply(np.any)
  summary_df['best metric value on val'] = workload_df[validation_metric].apply(
      lambda x: best_op(x))
  workload_df['index best eval on val'] = workload_df[validation_metric].apply(
      lambda x: idx_op(x))
  summary_df['time to best eval on val (s)'] = workload_df.apply(
      lambda x: x['accumulated_submission_time'][x['index best eval on val']],
      axis=1)
  workload_df['val target reached'] = workload_df[validation_metric].apply(
      lambda x: target_op(x, validation_target)).apply(np.any)
  workload_df['index to target on val'] = workload_df.apply(
      lambda x: np.argmax(target_op(x[validation_metric], validation_target))
      if x['val target reached'] else np.nan,
      axis=1)
  summary_df['time to target on val (s)'] = workload_df.apply(
      lambda x: x['accumulated_submission_time'][int(x[
          'index to target on val'])] if x['val target reached'] else np.inf,
      axis=1)

  # test metrics
  if include_test_split:
    test_metric, test_target = scoring_utils.get_workload_metrics_and_targets(workload, split='test')

    summary_df['test target metric name'] = test_metric
    summary_df['test target metric value'] = test_target

    summary_df['test target reached'] = workload_df[test_metric].apply(
        lambda x: target_op(x, test_target)).apply(np.any)
    summary_df['best metric value on test'] = workload_df[test_metric].apply(
        lambda x: best_op(x))
    workload_df['index best eval on test'] = workload_df[test_metric].apply(
        lambda x: idx_op(x))
    summary_df['time to best eval on test (s)'] = workload_df.apply(
        lambda x: x['accumulated_submission_time'][x['index best eval on test']],
        axis=1)
    summary_df['time to target on test (s)'] = summary_df.apply(
        lambda x: x['time to best eval on test (s)']
        if x['test target reached'] else np.inf,
        axis=1)

  return summary_df


def get_submission_summary(df, include_test_split=True):
  """Summarizes the submission results into metric and time tables
  organized by workload.
  """

  dfs = []
  print(df)
  for workload, group in df.groupby('workload'):
    summary_df = get_summary_df(
        workload, group, include_test_split=include_test_split)
    dfs.append(summary_df)

  df = pd.concat(dfs)
  logging.info('\n' + tabulate(df, headers='keys', tablefmt='psql'))
  return df


def compute_leaderboard_score(df, normalize=True):
  """Compute leaderboard score by taking integral of performance profile.

  Args:
    df: pd.DataFrame returned from `compute_performance_profiles`.
    normalize: divide by the range of the performance profile's tau.

  Returns:
    pd.DataFrame with one column of scores indexed by submission.
  """
  scores = np.trapz(df, x=df.columns)
  if normalize:
    scores /= df.columns.max() - df.columns.min()
  return pd.DataFrame(scores, columns=['score'], index=df.index)


def main(_):
  results = {}
  os.makedirs(FLAGS.output_dir, exist_ok=True)

  # Optionally read results to filename
  if FLAGS.load_results_from_filename:
    with open(
        os.path.join(FLAGS.output_dir, FLAGS.load_results_from_filename),
        'rb') as f:
      results = pickle.load(f)
  else:
    for team in os.listdir(FLAGS.submission_directory):
      for submission in os.listdir(
          os.path.join(FLAGS.submission_directory, team)):
        print(submission)
        if submission in FLAGS.exclude_submissions.split(','):
          continue
        experiment_path = os.path.join(FLAGS.submission_directory,
                                       team,
                                       submission)
        df = scoring_utils.get_experiment_df(experiment_path)
        results[submission] = df
        summary_df = get_submission_summary(df)
        with open(
            os.path.join(FLAGS.output_dir, f'{submission}_summary.csv'),
            'w') as fout:
          summary_df.to_csv(fout)

  # Optionally save results to filename
  if FLAGS.save_results_to_filename:
    with open(
        os.path.join(FLAGS.output_dir, FLAGS.save_results_to_filename),
        'wb') as f:
      pickle.dump(results, f)

  if not FLAGS.strict:
    logging.warning(
        'You are running with strict=False. This will relax '
        'scoring criteria on the held-out workloads, number of trials and number '
        'of studies. Your score may not be an accurate representation '
        'under competition scoring rules. To enforce the criteria set strict=True.'
    )
  if FLAGS.compute_performance_profiles:
    performance_profile_df = performance_profile.compute_performance_profiles(
        results,
        time_col='score',
        min_tau=1.0,
        max_tau=4.0,
        reference_submission_tag=None,
        num_points=100,
        scale='linear',
        verbosity=0,
        self_tuning_ruleset=FLAGS.self_tuning_ruleset,
        strict=FLAGS.strict,
        output_dir=FLAGS.output_dir,
    )
    if not os.path.exists(FLAGS.output_dir):
      os.mkdir(FLAGS.output_dir)
    performance_profile.plot_performance_profiles(
        performance_profile_df, 'score', save_dir=FLAGS.output_dir)
    performance_profile_str = tabulate(
        performance_profile_df.T, headers='keys', tablefmt='psql')
    logging.info(f'Performance profile:\n {performance_profile_str}')
    scores = compute_leaderboard_score(performance_profile_df)
    scores.to_csv(os.path.join(FLAGS.output_dir, 'scores.csv'))
    scores_str = tabulate(scores, headers='keys', tablefmt='psql')
    logging.info(f'Scores: \n {scores_str}')


if __name__ == '__main__':
  # flags.mark_flag_as_required('submission_directory')
  app.run(main)

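One easily missed detail in get_summary_df above is how the time-to-target is computed: target_op yields a boolean array over evals, and np.argmax on a boolean array returns the index of the first True entry. A minimal sketch of that pattern, with made-up metric values and target:

import operator

import numpy as np

# Hypothetical per-eval validation metric for one trial (metric is minimized here).
metric_values = np.array([0.52, 0.41, 0.33, 0.29, 0.27])
accumulated_submission_time = np.array([0., 600., 1200., 1800., 2400.])  # seconds
validation_target = 0.30
target_op = operator.le  # as selected when check_if_minimized(...) is True

reached = target_op(metric_values, validation_target)  # boolean array, one entry per eval
if reached.any():
  first_idx = np.argmax(reached)  # index of the first eval at or below the target
  time_to_target = accumulated_submission_time[first_idx]  # -> 1800.0
else:
  time_to_target = np.inf  # target never reached
print(time_to_target)
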
scoring/performance_profile.py

Lines changed: 11 additions & 6 deletions

@@ -47,17 +47,21 @@
 WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
 BASE_WORKLOADS_DIR = 'algoperf/workloads/'
 # Open json file to read heldout workloads
-# TODO: This probably shouldn't be hardcoded but passed as an argument.
-with open("held_out_workloads_algoperf_v05.json", "r") as f:
-  HELDOUT_WORKLOADS = json.load(f)
+# TODO: This probably shouldn't be hardcoded but passed as an argument.\
+try:
+  with open("held_out_workloads_algoperf_v05.json", "r") as f:
+    HELDOUT_WORKLOADS = json.load(f)
+except:
+  HELDOUT_WORKLOADS = None
+
 # These global variables have to be set according to the current set of
 # workloads and rules for the scoring to be correct.
 # We do not use the workload registry since it contains test and development
 # workloads as well.
 NUM_BASE_WORKLOADS = 8
-NUM_VARIANT_WORKLOADS = 6
+NUM_VARIANT_WORKLOADS = 0
 NUM_TRIALS = 5
-NUM_STUDIES = 5
+NUM_STUDIES = 3
 
 MIN_EVAL_METRICS = [
     'ce_loss',

@@ -318,7 +322,8 @@ def compute_performance_profiles(submissions,
   # Restrict to base and sampled held-out workloads
   # (ignore the additional workload variants of the baseline
   # as they cause issues when checking for nans in workload variants).
-  df = df[BASE_WORKLOADS + HELDOUT_WORKLOADS]
+  if HELDOUT_WORKLOADS:
+    df = df[BASE_WORKLOADS + HELDOUT_WORKLOADS]
   # Sort workloads alphabetically (for better display)
   df = df.reindex(sorted(df.columns), axis=1)
 
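The TODO in the hunk above notes that the held-out workloads file probably should be passed as an argument rather than hardcoded. A hedged sketch of one way to do that with an absl flag; the flag name and helper function are hypothetical and not part of this commit:

import json
import os

from absl import flags

flags.DEFINE_string(
    'heldout_workloads_path',
    None,
    'Optional path to a JSON file listing the sampled held-out workloads.')


def load_heldout_workloads(path):
  """Returns the held-out workload list, or None if no usable file is given."""
  if not path or not os.path.exists(path):
    return None
  with open(path, 'r') as f:
    return json.load(f)
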
scoring/score_submissions.py

Lines changed: 17 additions & 19 deletions

@@ -7,9 +7,10 @@
 
 Example usage:
 python3 score_submissions.py \
---submission_directory $HOME/algorithmic-efficiency/prize_qualification_baselines/logs \
---strict True
---compute_performance_profiles
+--submission_directory $HOME/algoperf-runs/submissions/rolling_leaderboard/self_tuning \
+--compute_performance_profiles \
+--output_dir scoring_results_self_tuning \
+--self_tuning_ruleset
 """
 
 import operator

@@ -160,6 +161,7 @@ def compute_leaderboard_score(df, normalize=True):
 def main(_):
   results = {}
   os.makedirs(FLAGS.output_dir, exist_ok=True)
+  logging.info(f"Scoring submissions in {FLAGS.submission_directory}")
 
   # Optionally read results to filename
   if FLAGS.load_results_from_filename:

@@ -168,22 +170,18 @@
         'rb') as f:
       results = pickle.load(f)
   else:
-    for team in os.listdir(FLAGS.submission_directory):
-      for submission in os.listdir(
-          os.path.join(FLAGS.submission_directory, team)):
-        print(submission)
-        if submission in FLAGS.exclude_submissions.split(','):
-          continue
-        experiment_path = os.path.join(FLAGS.submission_directory,
-                                       team,
-                                       submission)
-        df = scoring_utils.get_experiment_df(experiment_path)
-        results[submission] = df
-        summary_df = get_submission_summary(df)
-        with open(
-            os.path.join(FLAGS.output_dir, f'{submission}_summary.csv'),
-            'w') as fout:
-          summary_df.to_csv(fout)
+    for submission in os.listdir(FLAGS.submission_directory):
+      print(submission)
+      if submission in FLAGS.exclude_submissions.split(','):
+        continue
+      experiment_path = os.path.join(FLAGS.submission_directory, submission)
+      df = scoring_utils.get_experiment_df(experiment_path)
+      results[submission] = df
+      summary_df = get_submission_summary(df)
+      with open(
+          os.path.join(FLAGS.output_dir, f'{submission}_summary.csv'),
+          'w') as fout:
+        summary_df.to_csv(fout)
 
   # Optionally save results to filename
   if FLAGS.save_results_to_filename:
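
The rewritten loop above removes the intermediate team directory, so the scorer now expects one experiment directory per submission directly under --submission_directory. A minimal sketch of the layout it walks; the directory names are hypothetical:

import os

submission_directory = '/path/to/rolling_leaderboard/self_tuning'  # hypothetical path
# Expected layout:
#   <submission_directory>/
#     submission_a/   <- passed to scoring_utils.get_experiment_df(...)
#     submission_b/
# (previously there was an extra <team>/ level between the two.)
for submission in sorted(os.listdir(submission_directory)):
  experiment_path = os.path.join(submission_directory, submission)
  print(submission, '->', experiment_path)
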
File renamed without changes.
