From 688d2f0a1a2ec1070785fffda6a149d62144e17e Mon Sep 17 00:00:00 2001 From: Su Ann Chong Date: Fri, 5 Sep 2025 10:37:39 -0500 Subject: [PATCH 1/4] remove constraint for opt_learning_rate_decay_steps --- .../compliance_checker/training_5.1.0/closed_llama31_8b.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml index 2aba722..aa882f3 100644 --- a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml @@ -25,7 +25,6 @@ - KEY: NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == 1200000 " - KEY: NAME: opt_learning_rate_warmup_steps From f705313e376fa577bd57d65bfc8e594d32975aba Mon Sep 17 00:00:00 2001 From: Su Ann Chong Date: Fri, 5 Sep 2025 11:01:23 -0500 Subject: [PATCH 2/4] Update rcp_checker.py to handle llama31_8b epochs correctly --- mlperf_logging/rcp_checker/rcp_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 9b7e6e0..55405eb 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -84,7 +84,7 @@ def read_submission_file(result_file, ruleset, use_train_samples): eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"] eval_score = json.loads(eval_accuracy_str)["value"] stable_diffusion_eval_results[eval_step][eval_metric] = eval_score - elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b"} and ("eval_error" in str or "eval_accuracy" in str): + elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b"} and ("eval_error" in str or "eval_accuracy" in str): eval_accuracy_str = str conv_epoch = json.loads(eval_accuracy_str)["metadata"]["samples_count"] eval_score = 
json.loads(eval_accuracy_str)["value"] From a827a87b3aa0915f77cb45ff764d6b0ce3319853 Mon Sep 17 00:00:00 2001 From: Su Ann Chong Date: Thu, 11 Sep 2025 01:42:57 -0500 Subject: [PATCH 3/4] added constraint check for 8b and 405b --- .../training_5.1.0/closed_llama31_405b.yaml | 7 +++++-- .../training_5.1.0/closed_llama31_8b.yaml | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml index c47fd87..f20c150 100644 --- a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml @@ -24,12 +24,15 @@ REQ: EXACTLY_ONE - KEY: - NAME: opt_learning_rate_decay_steps + NAME: opt_learning_rate_warmup_steps REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = math.ceil(8000 * 1152 / s['global_batch_size'] ) - KEY: - NAME: opt_learning_rate_warmup_steps + NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == math.ceil(1_200_000 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] " - KEY: NAME: opt_learning_rate_decay_schedule diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml index aa882f3..38cb96f 100644 --- a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml @@ -23,12 +23,15 @@ REQ: EXACTLY_ONE - KEY: - NAME: opt_learning_rate_decay_steps + NAME: opt_learning_rate_warmup_steps REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] - KEY: - NAME: opt_learning_rate_warmup_steps + NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == math.ceil(1_200_000 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] " - 
KEY: NAME: opt_learning_rate_decay_schedule From 470daf0fc0e6347a8c46ee8071b7dc92310ff81f Mon Sep 17 00:00:00 2001 From: Su Ann Chong Date: Tue, 23 Sep 2025 11:07:23 -0500 Subject: [PATCH 4/4] Add missing 1152 factor to llama31_405b opt_learning_rate_decay_steps check --- .../compliance_checker/training_5.1.0/closed_llama31_405b.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml index f20c150..90e2d45 100644 --- a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml @@ -32,7 +32,7 @@ - KEY: NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == math.ceil(1_200_000 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] " + CHECK: " v['value'] == math.ceil(1_200_000 * 1152 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] " - KEY: NAME: opt_learning_rate_decay_schedule