|
1 | 1 | {
|
2 | 2 |
|
| 3 | + "rnn_t_ref_0.5k": |
| 4 | + { |
| 5 | + "Benchmark": "rnnt", |
| 6 | + "Creator": "NVIDIA", |
| 7 | + "When": "Prior to 2.0 submission", |
| 8 | + "Platform": "TBD", |
| 9 | + "BS": 512, |
| 10 | + "Hyperparams": { |
| 11 | + "opt_base_learning_rate": 0.0062, |
| 12 | + "opt_lamb_learning_rate_hold_epochs": 11, |
| 13 | + "opt_lamb_learning_rate_decay_poly_power": 0.915, |
| 14 | + "opt_learning_rate_warmup_epochs": 1, |
| 15 | + "opt_weight_decay": 0.001, |
| 16 | + "opt_lamb_beta_1": 0.9, |
| 17 | + "opt_lamb_beta_2": 0.9985, |
| 18 | + "opt_lamb_learning_rate_min": 1e-5, |
| 19 | + "opt_gradient_clip_norm": 1.0, |
| 20 | + "opt_gradient_accumulation_steps": 2, |
| 21 | + "model_eva_ema_factor": 0.994, |
| 22 | + "model_weights_initialization_scale": 0.45, |
| 23 | + "data_train_num_buckets": 1 |
| 24 | + }, |
| 25 | + "Epochs to converge": [ |
| 26 | + 39, 40, 43, 41, 39, 52, 43, 44, 40, 42, |
| 27 | + 40, 40, 42, 37, 40, 41, 46, 51, 41, 40 ] |
| 28 | + }, |
| 29 | + |
3 | 30 | "rnn_t_ref_1k":
|
4 | 31 | {
|
5 | 32 | "Benchmark": "rnnt",
|
6 | 33 | "Creator": "NVIDIA",
|
7 |
| - "When": "Prior to 1.1 submission", |
| 34 | + "When": "Prior to 2.0 submission", |
8 | 35 | "Platform": "TBD",
|
9 | 36 | "BS": 1024,
|
10 | 37 | "Hyperparams": {
|
11 |
| - "opt_base_learning_rate": 0.004, |
12 |
| - "opt_lamb_learning_rate_hold_epochs": 40, |
13 |
| - "opt_lamb_learning_rate_decay_poly_power": 0.935, |
14 |
| - "opt_learning_rate_warmup_epochs": 6, |
| 38 | + "opt_base_learning_rate": 0.007447, |
| 39 | + "opt_lamb_learning_rate_hold_epochs": 17, |
| 40 | + "opt_lamb_learning_rate_decay_poly_power": 0.9037, |
| 41 | + "opt_learning_rate_warmup_epochs": 3, |
| 42 | + "opt_weight_decay": 0.001, |
| 43 | + "opt_lamb_beta_1": 0.9, |
| 44 | + "opt_lamb_beta_2": 0.999, |
| 45 | + "opt_lamb_learning_rate_min": 1e-5, |
| 46 | + "opt_gradient_clip_norm": 1.0, |
| 47 | + "opt_gradient_accumulation_steps": 4, |
| 48 | + "model_eva_ema_factor": 0.992, |
| 49 | + "model_weights_initialization_scale": 0.5, |
| 50 | + "data_train_num_buckets": 1 |
| 51 | + }, |
| 52 | + "Epochs to converge": [ |
| 53 | + 44, 42, 41, 45, 43, 49, 43, 45, 50, 41, |
| 54 | + 48, 41, 47, 46, 46, 44, 42, 50, 43, 42 ] |
| 55 | + }, |
| 56 | + |
| 57 | + "rnn_t_ref_1.5k": |
| 58 | + { |
| 59 | + "Benchmark": "rnnt", |
| 60 | + "Creator": "NVIDIA", |
| 61 | + "When": "Prior to 2.0 submission", |
| 62 | + "Platform": "TBD", |
| 63 | + "BS": 1536, |
| 64 | + "Hyperparams": { |
| 65 | + "opt_base_learning_rate": 0.0072, |
| 66 | + "opt_lamb_learning_rate_hold_epochs": 26, |
| 67 | + "opt_lamb_learning_rate_decay_poly_power": 0.92, |
| 68 | + "opt_learning_rate_warmup_epochs": 5, |
15 | 69 | "opt_weight_decay": 0.001,
|
16 | 70 | "opt_lamb_beta_1": 0.9,
|
17 | 71 | "opt_lamb_beta_2": 0.999,
|
18 | 72 | "opt_lamb_learning_rate_min": 1e-5,
|
19 |
| - "opt_gradient_clip_norm": "none", |
20 |
| - "opt_gradient_accumulation_steps": 8, |
21 |
| - "model_eva_ema_factor": 0.999, |
| 73 | + "opt_gradient_clip_norm": 1.0, |
| 74 | + "opt_gradient_accumulation_steps": 4, |
| 75 | + "model_eva_ema_factor": 0.995, |
22 | 76 | "model_weights_initialization_scale": 0.5,
|
23 | 77 | "data_train_num_buckets": 1
|
24 | 78 | },
|
25 | 79 | "Epochs to converge": [
|
26 |
| - 59, 57, 59, 54, 57, 58, 56, 58, 55, 58, |
27 |
| - 58, 58, 62, 61, 63, 60, 57, 59, 57, 63 ] |
| 80 | + 46, 49, 47, 49, 49, 50, 46, 51, 48, 50, |
| 81 | + 52, 48, 47, 47, 51, 52, 47, 50, 50, 49 ] |
28 | 82 | },
|
29 | 83 |
|
30 | 84 | "rnn_t_ref_2k":
|
|
112 | 166 | }
|
113 | 167 |
|
114 | 168 | }
|
115 |
| - |
|
0 commit comments