Commit 1b88a2e

Merge pull request #827 from init-22/resolve_deprecations
Update Deprecated Functions
2 parents 21a3d03 + 785d82b commit 1b88a2e

38 files changed: +135 additions, -134 deletions


algorithmic_efficiency/data_utils.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def _prepare(x):
     # Assumes that `global_batch_size % local_device_count == 0`.
     return x.reshape((local_device_count, -1, *x.shape[1:]))

-  return jax.tree_map(_prepare, batch)
+  return jax.tree.map(_prepare, batch)


 def pad(tensor: np.ndarray,
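This same one-line change recurs throughout the commit: jax.tree_map is deprecated in recent JAX releases in favor of jax.tree.map. A minimal sketch of the new spelling, assuming a JAX version that exposes the jax.tree namespace (the example batch is illustrative):

    import jax
    import numpy as np

    batch = {'inputs': np.zeros((8, 4)), 'targets': np.zeros((8,))}
    # jax.tree.map applies the function to every leaf of the pytree;
    # it replaces the deprecated jax.tree_map spelling.
    shapes = jax.tree.map(lambda x: x.shape, batch)
    # shapes == {'inputs': (8, 4), 'targets': (8,)}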

algorithmic_efficiency/param_utils.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ def pytorch_param_types(


 def jax_param_shapes(
     params: spec.ParameterContainer) -> spec.ParameterShapeTree:
-  return jax.tree_map(lambda x: spec.ShapeTuple(x.shape), params)
+  return jax.tree.map(lambda x: spec.ShapeTuple(x.shape), params)


 def jax_param_types(param_shapes: spec.ParameterShapeTree,

algorithmic_efficiency/workloads/cifar/cifar_jax/workload.py

Lines changed: 1 addition & 1 deletion
@@ -207,4 +207,4 @@ def _normalize_eval_metrics(
       self, num_examples: int, total_metrics: Dict[str,
                                                    Any]) -> Dict[str, float]:
     """Normalize eval metrics."""
-    return jax.tree_map(lambda x: float(x[0] / num_examples), total_metrics)
+    return jax.tree.map(lambda x: float(x[0] / num_examples), total_metrics)

algorithmic_efficiency/workloads/imagenet_resnet/imagenet_jax/workload.py

Lines changed: 1 addition & 1 deletion
@@ -264,7 +264,7 @@ def _eval_model_on_split(self,
         eval_metrics[metric_name] = 0.0
       eval_metrics[metric_name] += metric_value

-    eval_metrics = jax.tree_map(lambda x: float(x[0] / num_examples),
+    eval_metrics = jax.tree.map(lambda x: float(x[0] / num_examples),
                                 eval_metrics)
     return eval_metrics

algorithmic_efficiency/workloads/imagenet_vit/imagenet_jax/models.py

Lines changed: 2 additions & 2 deletions
@@ -70,7 +70,7 @@ class Encoder1DBlock(nn.Module):
   def __call__(self, x: spec.Tensor, train: bool = True) -> spec.Tensor:
     if not self.use_post_layer_norm:
       y = nn.LayerNorm(name='LayerNorm_0')(x)
-      y = nn.SelfAttention(
+      y = nn.MultiHeadDotProductAttention(
           num_heads=self.num_heads,
           kernel_init=nn.initializers.xavier_uniform(),
           deterministic=train,
@@ -89,7 +89,7 @@ def __call__(self, x: spec.Tensor, train: bool = True) -> spec.Tensor:
       x = x + y
     else:
       y = x
-      y = nn.SelfAttention(
+      y = nn.MultiHeadDotProductAttention(
           num_heads=self.num_heads,
           kernel_init=nn.initializers.xavier_uniform(),
           deterministic=train,
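Flax deprecated nn.SelfAttention in favor of nn.MultiHeadDotProductAttention, which performs self-attention when called with a single input. A minimal sketch of the replacement module under a recent Flax release (sizes and shapes are illustrative):

    import jax
    import jax.numpy as jnp
    import flax.linen as nn

    attn = nn.MultiHeadDotProductAttention(
        num_heads=4,
        kernel_init=nn.initializers.xavier_uniform(),
        deterministic=True)
    x = jnp.ones((2, 16, 32))                  # (batch, sequence, features)
    params = attn.init(jax.random.PRNGKey(0), x)
    y = attn.apply(params, x)                  # queries, keys, values all from x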

algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/models.py

Lines changed: 3 additions & 3 deletions
@@ -396,10 +396,9 @@ def __call__(self, inputs, paddings, train):
         mask_paddings > 0, mask_paddings > 0, dtype=jnp.float32)

     inputs = LayerNorm(dim=config.encoder_dim)(inputs)
-
     attention_fn = functools.partial(
         dot_product_attention, temperature=config.attention_temperature)
-    result = nn.SelfAttention(
+    result = nn.MultiHeadDotProductAttention(
         num_heads=config.num_attention_heads,
         qkv_features=config.encoder_dim,
         decode=False,
@@ -410,7 +409,8 @@ def __call__(self, inputs, paddings, train):
         broadcast_dropout=False,
         attention_fn=attention_fn,
         dropout_rate=config.attention_dropout_rate,
-        deterministic=not train)(inputs, attention_mask)
+        deterministic=not train)(
+            inputs_q=inputs, mask=attention_mask)

     if config.attention_residual_dropout_rate is None:
       attention_residual_dropout_rate = 0.1
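Besides the module rename, the call itself changes: nn.SelfAttention accepted the mask as its second positional argument, whereas in nn.MultiHeadDotProductAttention the second positional slot is the key (or key/value) input, so the mask has to be passed by keyword. A minimal sketch with illustrative shapes:

    import jax
    import jax.numpy as jnp
    import flax.linen as nn

    attn = nn.MultiHeadDotProductAttention(
        num_heads=2, qkv_features=8, deterministic=True)
    x = jnp.ones((1, 5, 8))                    # (batch, time, features)
    mask = nn.make_attention_mask(jnp.ones((1, 5)), jnp.ones((1, 5)))
    params = attn.init(jax.random.PRNGKey(0), x, mask=mask)
    out = attn.apply(params, inputs_q=x, mask=mask)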

algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/workload.py

Lines changed: 6 additions & 5 deletions
@@ -227,11 +227,12 @@ def ctc_loss(self,
                labels: spec.Tensor,
                label_paddings: spec.Tensor,
                blank_id: int = 0) -> spec.Tensor:
-    return optax.ctc_loss(logits,
-                          logit_paddings,
-                          labels,
-                          label_paddings,
-                          blank_id)
+    return optax.ctc_loss(
+        logits=logits,
+        logit_paddings=logit_paddings,
+        labels=labels,
+        label_paddings=label_paddings,
+        blank_id=blank_id)

   # Adapted from lingvo's greedy decoding logic here:
   # https://github.com/tensorflow/lingvo/blob/2ee26814c57b7dcead3f0382170f2f3da006f810/lingvo/jax/layers/ctc_objectives.py#L138.
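Here the same optax.ctc_loss call is simply switched to keyword arguments. A minimal sketch with illustrative shapes (zeros in the padding arrays mark valid frames and label positions):

    import jax.numpy as jnp
    import optax

    batch, time, vocab, max_label_len = 2, 10, 5, 4
    logits = jnp.zeros((batch, time, vocab))
    logit_paddings = jnp.zeros((batch, time))          # 0 = valid frame, 1 = padded
    labels = jnp.ones((batch, max_label_len), dtype=jnp.int32)
    label_paddings = jnp.zeros((batch, max_label_len))
    loss = optax.ctc_loss(
        logits=logits,
        logit_paddings=logit_paddings,
        labels=labels,
        label_paddings=label_paddings,
        blank_id=0)                                    # per-example loss, shape (batch,)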

algorithmic_efficiency/workloads/mnist/mnist_jax/workload.py

Lines changed: 1 addition & 1 deletion
@@ -132,4 +132,4 @@ def _normalize_eval_metrics(
       self, num_examples: int, total_metrics: Dict[str,
                                                    Any]) -> Dict[str, float]:
     """Normalize eval metrics."""
-    return jax.tree_map(lambda x: float(x[0] / num_examples), total_metrics)
+    return jax.tree.map(lambda x: float(x[0] / num_examples), total_metrics)

algorithmic_efficiency/workloads/ogbg/input_pipeline.py

Lines changed: 2 additions & 2 deletions
@@ -51,7 +51,7 @@ def _load_dataset(split, should_shuffle, data_rng, data_dir):


 def _to_jraph(example):
   """Converts an example graph to jraph.GraphsTuple."""
-  example = jax.tree_map(lambda x: x._numpy(), example)  # pylint: disable=protected-access
+  example = jax.tree.map(lambda x: x._numpy(), example)  # pylint: disable=protected-access
   edge_feat = example['edge_feat']
   node_feat = example['node_feat']
   edge_index = example['edge_index']
@@ -150,7 +150,7 @@ def _get_batch_iterator(dataset_iter, global_batch_size, num_shards=None):
     if count == num_shards:

       def f(x):
-        return jax.tree_map(lambda *vals: np.stack(vals, axis=0), x[0], *x[1:])
+        return jax.tree.map(lambda *vals: np.stack(vals, axis=0), x[0], *x[1:])

       graphs_shards = f(graphs_shards)
       labels_shards = f(labels_shards)
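The second call site uses the multi-tree form of jax.tree.map: the mapped function receives one leaf from each tree, which is what lets it stack per-shard arrays leaf by leaf. A minimal sketch with made-up shard contents:

    import jax
    import numpy as np

    shards = [{'a': np.zeros(3), 'b': np.ones(2)} for _ in range(4)]
    # Stack corresponding leaves across all four shard pytrees.
    stacked = jax.tree.map(lambda *vals: np.stack(vals, axis=0),
                           shards[0], *shards[1:])
    # stacked['a'].shape == (4, 3); stacked['b'].shape == (4, 2)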

algorithmic_efficiency/workloads/ogbg/ogbg_pytorch/workload.py

Lines changed: 3 additions & 3 deletions
@@ -20,8 +20,8 @@


 def _pytorch_map(inputs: Any) -> Any:
   if USE_PYTORCH_DDP:
-    return jax.tree_map(lambda a: torch.as_tensor(a, device=DEVICE), inputs)
-  return jax.tree_map(
+    return jax.tree.map(lambda a: torch.as_tensor(a, device=DEVICE), inputs)
+  return jax.tree.map(
       lambda a: torch.as_tensor(a, device=DEVICE).view(-1, a.shape[-1])
       if len(a.shape) == 3 else torch.as_tensor(a, device=DEVICE).view(-1),
       inputs)
@@ -30,7 +30,7 @@ def _pytorch_map(inputs: Any) -> Any:
 def _shard(inputs: Any) -> Any:
   if not USE_PYTORCH_DDP:
     return inputs
-  return jax.tree_map(lambda tensor: tensor[RANK], inputs)
+  return jax.tree.map(lambda tensor: tensor[RANK], inputs)


 def _graph_map(function: Callable, graph: GraphsTuple) -> GraphsTuple:
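jax.tree.map only traverses the pytree structure, so the leaves here can be NumPy arrays or torch tensors rather than JAX arrays. A minimal sketch of the pattern with an illustrative batch; the DEVICE constant below is a stand-in for the device the workload actually selects:

    import jax
    import numpy as np
    import torch

    DEVICE = 'cpu'  # illustrative only
    batch = {'inputs': np.zeros((8, 4)), 'targets': np.zeros((8,))}
    # Convert every leaf of the batch pytree to a torch tensor on DEVICE.
    torch_batch = jax.tree.map(
        lambda a: torch.as_tensor(a, device=DEVICE), batch)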
