Merge pull request #228 from shangw-nvidia/shangw-nvidia/rnnt_rcps

emizan76 · web-flow · commit 4272f73c952d · 2022-05-02T11:47:46.000-07:00
[RCP] Update rcps for rnnt.
diff --git a/mlperf_logging/rcp_checker/training_2.0.0/rcps_rnnt.json b/mlperf_logging/rcp_checker/training_2.0.0/rcps_rnnt.json
@@ -1,30 +1,84 @@
 {
 
+  "rnn_t_ref_0.5k":
+  {
+    "Benchmark": "rnnt",
+    "Creator": "NVIDIA",
+    "When": "Prior to 2.0 submission",
+    "Platform": "TBD",
+    "BS": 512,
+    "Hyperparams": {
+      "opt_base_learning_rate": 0.0062,
+      "opt_lamb_learning_rate_hold_epochs": 11,
+      "opt_lamb_learning_rate_decay_poly_power": 0.915,
+      "opt_learning_rate_warmup_epochs": 1,
+      "opt_weight_decay": 0.001,
+      "opt_lamb_beta_1": 0.9,
+      "opt_lamb_beta_2": 0.9985,
+      "opt_lamb_learning_rate_min": 1e-5,
+      "opt_gradient_clip_norm": 1.0,
+      "opt_gradient_accumulation_steps": 2,
+      "model_eva_ema_factor": 0.994,
+      "model_weights_initialization_scale": 0.45,
+      "data_train_num_buckets": 1
+    },
+  "Epochs to converge": [
+    39, 40, 43, 41, 39, 52, 43, 44, 40, 42,
+    40, 40, 42, 37, 40, 41, 46, 51, 41, 40 ]
+  },
+
   "rnn_t_ref_1k":
   {
     "Benchmark": "rnnt",
     "Creator": "NVIDIA",
-    "When": "Prior to 1.1 submission",
+    "When": "Prior to 2.0 submission",
     "Platform": "TBD",
     "BS": 1024,
     "Hyperparams": {
-      "opt_base_learning_rate": 0.004,
-      "opt_lamb_learning_rate_hold_epochs": 40,
-      "opt_lamb_learning_rate_decay_poly_power": 0.935,
-      "opt_learning_rate_warmup_epochs": 6,
+      "opt_base_learning_rate": 0.007447,
+      "opt_lamb_learning_rate_hold_epochs": 17,
+      "opt_lamb_learning_rate_decay_poly_power": 0.9037,
+      "opt_learning_rate_warmup_epochs": 3,
+      "opt_weight_decay": 0.001,
+      "opt_lamb_beta_1": 0.9,
+      "opt_lamb_beta_2": 0.999,
+      "opt_lamb_learning_rate_min": 1e-5,
+      "opt_gradient_clip_norm": 1.0,
+      "opt_gradient_accumulation_steps": 4,
+      "model_eva_ema_factor": 0.992,
+      "model_weights_initialization_scale": 0.5,
+      "data_train_num_buckets": 1
+    },
+  "Epochs to converge": [
+    44, 42, 41, 45, 43, 49, 43, 45, 50, 41,
+    48, 41, 47, 46, 46, 44, 42, 50, 43, 42 ]
+  },
+
+  "rnn_t_ref_1.5k":
+  {
+    "Benchmark": "rnnt",
+    "Creator": "NVIDIA",
+    "When": "Prior to 2.0 submission",
+    "Platform": "TBD",
+    "BS": 1536,
+    "Hyperparams": {
+      "opt_base_learning_rate": 0.0072,
+      "opt_lamb_learning_rate_hold_epochs": 26,
+      "opt_lamb_learning_rate_decay_poly_power": 0.92,
+      "opt_learning_rate_warmup_epochs": 5,
       "opt_weight_decay": 0.001,
       "opt_lamb_beta_1": 0.9,
       "opt_lamb_beta_2": 0.999,
       "opt_lamb_learning_rate_min": 1e-5,
-      "opt_gradient_clip_norm": "none",
-      "opt_gradient_accumulation_steps": 8,
-      "model_eva_ema_factor": 0.999,
+      "opt_gradient_clip_norm": 1.0,
+      "opt_gradient_accumulation_steps": 4,
+      "model_eva_ema_factor": 0.995,
       "model_weights_initialization_scale": 0.5,
       "data_train_num_buckets": 1
     },
   "Epochs to converge": [
-    59, 57, 59, 54, 57, 58, 56, 58, 55, 58,
-    58, 58, 62, 61, 63, 60, 57, 59, 57, 63 ]
+    46, 49, 47, 49, 49, 50, 46, 51, 48, 50,
+    52, 48, 47, 47, 51, 52, 47, 50, 50, 49 ]
   },
 
   "rnn_t_ref_2k":
@@ -112,4 +166,3 @@
   }
 
 }
-