
Commit d3f25d8

add tests
1 parent: ad36a7c

10 files changed: 337 additions, 598 deletions


algoperf/workloads/criteo1tb/criteo1tb_jax/models_ref.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -23,7 +23,7 @@ class DLRMResNet(nn.Module):
   mlp_bottom_dims: Sequence[int] = (256, 256, 256)
   mlp_top_dims: Sequence[int] = (256, 256, 256, 256, 1)
   embed_dim: int = 128
-  dropout_rate: float = 0.0
+  dropout_rate: float = 0.1
   use_layer_norm: bool = False  # Unused.
   embedding_init_multiplier: float = None  # Unused
 
```
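The practical effect is limited to callers that do not pass an explicit rate. A minimal sketch of checking the new default (assuming the module is importable from the path above and that `dropout_rate` remains a plain dataclass default):

```python
# Sketch: the changed default is visible directly on the Flax module dataclass.
from algoperf.workloads.criteo1tb.criteo1tb_jax.models_ref import DLRMResNet

assert DLRMResNet.dropout_rate == 0.1  # was 0.0 before this commit
```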

algoperf/workloads/librispeech_conformer/librispeech_jax/models_ref.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -36,12 +36,12 @@ class ConformerConfig:
   encoder_dim: int = 512
   num_attention_heads: int = 8
   num_encoder_layers: int = 4
-  attention_dropout_rate: float = 0.0
+  attention_dropout_rate: float = 0.1
   # If None, defaults to 0.1.
   attention_residual_dropout_rate: Optional[float] = 0.1
   # If None, defaults to 0.0.
-  conv_residual_dropout_rate: Optional[float] = 0.0
-  feed_forward_dropout_rate: float = 0.0
+  conv_residual_dropout_rate: Optional[float] = 0.1
+  feed_forward_dropout_rate: float = 0.1
   # If None, defaults to 0.1.
   feed_forward_residual_dropout_rate: Optional[float] = 0.1
   convolution_kernel_size: int = 5
```
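The same pattern applies to the Conformer reference config: three rates now default to 0.1 while the residual-dropout fields keep their existing defaults. A quick sanity check, as a sketch (assuming `ConformerConfig` can be instantiated with all fields left at their defaults):

```python
from algoperf.workloads.librispeech_conformer.librispeech_jax.models_ref import \
    ConformerConfig

cfg = ConformerConfig()  # all fields at their defaults
assert cfg.attention_dropout_rate == 0.1
assert cfg.conv_residual_dropout_rate == 0.1
assert cfg.feed_forward_dropout_rate == 0.1

# Any rate can still be overridden per instance, e.g. for evaluation:
eval_cfg = ConformerConfig(attention_dropout_rate=0.0)
```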

algoperf/workloads/ogbg/ogbg_jax/models.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -93,4 +93,4 @@ def __call__(self, graph, train, dropout_rate=DROPOUT_RATE):
     decoder = jraph.GraphMapFeatures(embed_global_fn=nn.Dense(self.num_outputs))
     graph = decoder(graph)
 
-    return graph.globals
+    return graph.globals
```

algoperf/workloads/ogbg/ogbg_jax/models_ref.py

Lines changed: 6 additions & 10 deletions
```diff
@@ -15,7 +15,7 @@ def make_fn(inputs):
   return make_fn
 
 
-def _make_mlp(hidden_dims, dropout, activation_fn):
+def _make_mlp(hidden_dims, activation_fn, train, dropout_rate):
   """Creates a MLP with specified dimensions."""
 
   @jraph.concatenated_args
@@ -25,7 +25,7 @@ def make_fn(inputs):
       x = nn.Dense(features=dim)(x)
       x = nn.LayerNorm()(x)
       x = activation_fn(x)
-      x = dropout(x)
+      x = nn.Dropout(rate=dropout_rate, deterministic=not train)(x)
     return x
 
   return make_fn
@@ -46,11 +46,7 @@ class GNN(nn.Module):
 
   @nn.compact
   def __call__(self, graph, train):
-    if self.dropout_rate is None:
-      dropout_rate = 0.1
-    else:
-      dropout_rate = self.dropout_rate
-    dropout = nn.Dropout(rate=dropout_rate, deterministic=not train)
+    dropout_rate = self.dropout_rate
 
     graph = graph._replace(
         globals=jnp.zeros([graph.n_node.shape[0], self.num_outputs]))
@@ -73,11 +69,11 @@ def __call__(self, graph, train):
     for _ in range(self.num_message_passing_steps):
       net = jraph.GraphNetwork(
           update_edge_fn=_make_mlp(
-              self.hidden_dims, dropout=dropout, activation_fn=activation_fn),
+              self.hidden_dims, activation_fn=activation_fn, train=train, dropout_rate=dropout_rate),
           update_node_fn=_make_mlp(
-              self.hidden_dims, dropout=dropout, activation_fn=activation_fn),
+              self.hidden_dims, activation_fn=activation_fn, train=train, dropout_rate=dropout_rate),
           update_global_fn=_make_mlp(
-              self.hidden_dims, dropout=dropout, activation_fn=activation_fn))
+              self.hidden_dims, activation_fn=activation_fn, train=train, dropout_rate=dropout_rate))
 
       graph = net(graph)
 
```
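The shape of this refactor: instead of building one `nn.Dropout` callable up front (with a None-means-0.1 fallback) and closing over it, `_make_mlp` now receives `train` and `dropout_rate` directly and constructs `nn.Dropout` where it is applied. Below is a self-contained toy sketch of the same call pattern; `TinyNet` and its sizes are hypothetical, only to make the snippet runnable, and the repository wires the returned functions into `jraph.GraphNetwork` instead.

```python
import jax
import jax.numpy as jnp
from flax import linen as nn


def _make_mlp(hidden_dims, activation_fn, train, dropout_rate):
  """Returns a fn applying Dense -> LayerNorm -> activation -> Dropout per dim."""

  def make_fn(x):
    for dim in hidden_dims:
      x = nn.Dense(features=dim)(x)
      x = nn.LayerNorm()(x)
      x = activation_fn(x)
      # Dropout is built inline from the passed-in rate and train flag.
      x = nn.Dropout(rate=dropout_rate, deterministic=not train)(x)
    return x

  return make_fn


class TinyNet(nn.Module):
  """Hypothetical wrapper so the MLP factory runs inside a compact scope."""
  dropout_rate: float = 0.1

  @nn.compact
  def __call__(self, x, train):
    mlp = _make_mlp((32, 32), nn.relu, train=train,
                    dropout_rate=self.dropout_rate)
    return mlp(x)


x = jnp.ones((4, 16))
params = TinyNet().init({'params': jax.random.key(0)}, x, train=False)
y = TinyNet().apply(params, x, train=True, rngs={'dropout': jax.random.key(1)})
```

Passing the rate as an argument rather than baking it into a pre-built layer is what lets the test suite below sweep `dropout_rate` at call time.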

tests/dropout_fix/fastmri_jax/test_model_equivalence.py

Lines changed: 81 additions & 93 deletions
```diff
@@ -6,125 +6,113 @@
 
 import os
 
+
 from absl.testing import absltest
 from absl.testing import parameterized
-import torch
-from torch.testing import assert_close
+import jax
+import jax.numpy as jnp
+# import equinox as eqx
+
 
-from algoperf.workloads.fastmri.fastmri_pytorch.models import \
+from algoperf.workloads.fastmri.fastmri_jax.models_ref import \
     UNet as OriginalUNet
-from algoperf.workloads.fastmri.fastmri_pytorch.models_dropout import \
+from algoperf.workloads.fastmri.fastmri_jax.models import \
     UNet as CustomUNet
 
 BATCH, IN_CHANS, H, W = 4, 1, 256, 256
 OUT_CHANS, C, LAYERS = 1, 32, 4
-DEVICE = 'cuda'
-TORCH_COMPILE = False
 SEED = 1996
 
-os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-torch.backends.cudnn.benchmark = False
-torch.backends.cudnn.deterministic = True
-torch.use_deterministic_algorithms(True)
-
 
-class FastMRIModeEquivalenceTest(parameterized.TestCase):
-
-  def fwd_pass(self, orig, cust, dropout_rate):
-    x = torch.randn(BATCH, IN_CHANS, H, W, device=DEVICE)
-    for mode in ('train', 'eval'):
-      getattr(orig, mode)()
-      getattr(cust, mode)()
-      torch.manual_seed(0)
-      y1 = orig(x)
-      torch.manual_seed(0)
-      y2 = cust(x, dropout_rate)
-      assert_close(y1, y2, atol=0, rtol=0)
-      if mode == 'eval':  # one extra test: omit dropout at eval
-        torch.manual_seed(0)
-        y2 = cust(x)
-        assert_close(y1, y2, atol=0, rtol=0)
+class ModelEquivalenceTest(parameterized.TestCase):
 
   @parameterized.named_parameters(
-      dict(testcase_name='p=0.0', dropout_rate=0.0),
-      dict(testcase_name='p=0.1', dropout_rate=0.1),
-      dict(testcase_name='p=0.7', dropout_rate=0.7),
-      dict(testcase_name='p=1.0', dropout_rate=1.0),
+      dict(
+          testcase_name='UNet, p=0.0',
+          dropout_rate=0.0),
+      dict(
+          testcase_name='UNet, p=0.1',
+          dropout_rate=0.1),
   )
-  def test_dropout_values(self, dropout_rate):
-    """Test different values of dropout_rate."""
+  def test_forward(self, dropout_rate):
+    OrigCls, CustCls = (OriginalUNet, CustomUNet)
 
-    torch.manual_seed(SEED)
-    orig = OriginalUNet(
-        IN_CHANS, OUT_CHANS, C, LAYERS, dropout_rate=dropout_rate).to(DEVICE)
 
-    torch.manual_seed(SEED)
-    cust = CustomUNet(IN_CHANS, OUT_CHANS, C, LAYERS).to(DEVICE)
+    # init model
+    rng, data_rng, dropout_rng = jax.random.split(jax.random.key(SEED), 3)
 
-    cust.load_state_dict(orig.state_dict())  # sync weights
-    if TORCH_COMPILE:
-      orig = torch.compile(orig)
-      cust = torch.compile(cust)
+    kwargs = dict(num_pool_layers = LAYERS, num_channels=IN_CHANS)
+    orig_model = OrigCls(**kwargs)
+    cust_model = CustCls(**kwargs)
 
-    self.fwd_pass(orig, cust, dropout_rate)
+    fake_batch = jnp.ones((BATCH, IN_CHANS, H, W))
 
-  @parameterized.named_parameters(
-      dict(testcase_name='default', use_tanh=False, use_layer_norm=False),
-      dict(testcase_name='tanh', use_tanh=True, use_layer_norm=False),
-      dict(testcase_name='layer_norm', use_tanh=False, use_layer_norm=True),
-      dict(testcase_name='both', use_tanh=True, use_layer_norm=True),
-  )
-  def test_arch_configs(self, use_tanh, use_layer_norm):
-    """Test different architecture configurations, fixed dropout_rate."""
-    dropout_rate = 0.1
-
-    torch.manual_seed(SEED)
-    orig = OriginalUNet(
-        IN_CHANS,
-        OUT_CHANS,
-        C,
-        LAYERS,
-        dropout_rate=dropout_rate,
-        use_tanh=use_tanh,
-        use_layer_norm=use_layer_norm).to(DEVICE)
-
-    torch.manual_seed(SEED)
-    cust = CustomUNet(
-        IN_CHANS,
-        OUT_CHANS,
-        C,
-        LAYERS,
-        use_tanh=use_tanh,
-        use_layer_norm=use_layer_norm).to(DEVICE)
-
-    cust.load_state_dict(orig.state_dict())  # sync weights
-    if TORCH_COMPILE:
-      orig = torch.compile(orig)
-      cust = torch.compile(cust)
-
-    self.fwd_pass(orig, cust, dropout_rate)
+    initial_params_original = orig_model.init({'params': rng},
+                                              fake_batch,
+                                              train=False)
+    initial_params_custom = cust_model.init({'params': rng},
+                                            fake_batch,
+                                            train=False)
+
+    # fwd
+    x = jax.random.normal(data_rng, shape=(BATCH, H, W))
+
+    for mode in ('train', 'eval'):
+      train = mode == 'train'
+      y1 = orig_model.apply(
+          initial_params_original,
+          x,
+          train=train,
+          rngs={'dropout': dropout_rng})
+      y2 = cust_model.apply(
+          initial_params_custom,
+          x,
+          train=train,
+          dropout_rate=dropout_rate,
+          rngs={'dropout': dropout_rng})
+
+      assert jnp.allclose(y1, y2, atol=1e-3, rtol=1e-3)
 
   @parameterized.named_parameters(
-      dict(testcase_name=''),)
+      dict(testcase_name='UNet, default'),
+  )
   def test_default_dropout(self):
     """Test default dropout_rate."""
+    OrigCls, CustCls = (OriginalUNet, CustomUNet)
 
-    torch.manual_seed(SEED)
-    orig = OriginalUNet(IN_CHANS, OUT_CHANS, C, LAYERS).to(DEVICE)
-    torch.manual_seed(SEED)
-    cust = CustomUNet(IN_CHANS, OUT_CHANS, C, LAYERS).to(DEVICE)
-    cust.load_state_dict(orig.state_dict())  # sync weights
 
-    x = torch.randn(BATCH, IN_CHANS, H, W, device=DEVICE)
-    for mode in ('train', 'eval'):
-      getattr(orig, mode)()
-      getattr(cust, mode)()
-      torch.manual_seed(0)
-      y1 = orig(x)
-      torch.manual_seed(0)
-      y2 = cust(x)
-      assert_close(y1, y2, atol=0, rtol=0)
+    # init model
+    rng, data_rng, dropout_rng = jax.random.split(jax.random.key(SEED), 3)
+
+    kwargs = dict(num_pool_layers=LAYERS,
+                  num_channels=IN_CHANS,
+                  )
+    orig_model = OrigCls(**kwargs)
+    cust_model = CustCls(**kwargs)
 
+    fake_batch = jnp.ones((2, IN_CHANS, H, W))
+
+    initial_params_original = orig_model.init({'params': rng},
+                                              fake_batch,
+                                              train=False)
+    initial_params_custom = cust_model.init({'params': rng},
+                                            fake_batch,
+                                            train=False)
+
+    # fwd
+    x = jax.random.normal(data_rng, shape=(BATCH, H, W))
+
+    for mode in ('train', 'eval'):
+      train = mode == 'train'
+      y1 = orig_model.apply(
+          initial_params_original,
+          x,
+          train=train,
+          rngs={'dropout': dropout_rng})
+      y2 = cust_model.apply(
+          initial_params_custom, x, train=train, rngs={'dropout': dropout_rng})
+
+      assert jnp.allclose(y1, y2, atol=0, rtol=0)
 
 if __name__ == '__main__':
   absltest.main()
```
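The new test file keeps the absltest entry point, so it can be executed directly as a script or loaded programmatically. A small sketch using the standard unittest loader (assuming the `tests` directory is importable as a package from the repository root and JAX is installed):

```python
# Sketch: load and run the new JAX equivalence tests without the CLI.
import unittest

from tests.dropout_fix.fastmri_jax import test_model_equivalence

suite = unittest.defaultTestLoader.loadTestsFromModule(test_model_equivalence)
unittest.TextTestRunner(verbosity=2).run(suite)
```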
