@@ -192,24 +192,18 @@ def jax_cosine_warmup(step_hint: int, hyperparameters):
                                    workload.param_shapes)
   optimizer_state = opt_init_fn(params_zeros_like)
 
-  return jax_utils.replicate(optimizer_state), opt_update_fn
-
-
-@functools.partial(
-    jax.pmap,
-    axis_name='batch',
-    in_axes=(None, None, 0, 0, 0, 0, 0, None, None),
-    static_broadcasted_argnums=(0, 1),
-    donate_argnums=(2, 3, 4))
-def pmapped_train_step(workload,
-                       opt_update_fn,
-                       model_state,
-                       optimizer_state,
-                       current_param_container,
-                       batch,
-                       rng,
-                       grad_clip,
-                       label_smoothing):
+  return optimizer_state, opt_update_fn
+
+
+def train_step(workload,
+               opt_update_fn,
+               model_state,
+               optimizer_state,
+               current_param_container,
+               batch,
+               rng,
+               grad_clip,
+               label_smoothing):
 
   def _loss_fn(params):
     """Loss function used for training."""
@@ -232,9 +226,7 @@ def _loss_fn(params):
   grad_fn = jax.value_and_grad(_loss_fn, has_aux=True)
   (summed_loss, (n_valid_examples, new_model_state)), grad = grad_fn(
       current_param_container)
-  # Get correct global mean loss and grad.
-  (summed_loss, n_valid_examples, grad) = lax.psum(
-      (summed_loss, n_valid_examples, grad), axis_name='batch')
+  # Compute mean loss and grad.
   loss = summed_loss / n_valid_examples
   grad = jax.tree.map(lambda x: x / n_valid_examples, grad)
 
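The `lax.psum` goes away because `jit` operates on the global (logical) arrays: when the batch is sharded across devices, a plain `jnp.sum` already yields the global sum and XLA inserts the cross-device reduction, whereas `pmap`'s per-device view required an explicit `psum` over the `'batch'` axis. A small self-contained sketch of that behaviour (my own example, not the repo's code; assumes the device count divides the array length):

```python
import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# 1-D mesh over all visible devices; shard arrays along their leading axis.
mesh = Mesh(np.array(jax.devices()), axis_names=('batch',))
batch_sharding = NamedSharding(mesh, P('batch'))

# A toy per-example loss array, sharded along its leading dimension.
losses = jax.device_put(jnp.arange(16.0), batch_sharding)

@jax.jit
def mean_loss(x):
  # No lax.psum: jit sees the whole array, so the sum is already global.
  return jnp.sum(x) / x.shape[0]

print(mean_loss(losses))  # 7.5
```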
@@ -272,7 +264,6 @@ def update_params(
   del eval_results
 
   optimizer_state, opt_update_fn = optimizer_state
-  per_device_rngs = jax.random.split(rng, jax.local_device_count())
   if hasattr(hyperparameters, 'label_smoothing'):
     label_smoothing = hyperparameters.label_smoothing
   else:
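With `pmap`, the caller had to split the key into one per local device before the call; under `jit` a single key is passed through as a replicated argument, and any further splitting can happen inside the step. A tiny illustration using only standard `jax.random` APIs (nothing repo-specific):

```python
import jax

rng = jax.random.PRNGKey(0)

# pmap style: one key per local device, shape (local_device_count, 2).
per_device_rngs = jax.random.split(rng, jax.local_device_count())

# jit style: hand the single key to the jitted step and split inside it if
# several independent streams (e.g. dropout vs. augmentation) are needed.
step_rng, dropout_rng = jax.random.split(rng)
```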
@@ -281,13 +272,47 @@ def update_params(
     grad_clip = hyperparameters.grad_clip
   else:
     grad_clip = None
-  outputs = pmapped_train_step(workload,
+
+  # Get the device mesh.
+  mesh = jax_sharding_utils.get_mesh()
+  # Create shardings for each argument.
+  replicated = jax_sharding_utils.get_replicated_sharding(mesh)  # No partitioning
+  sharded = jax_sharding_utils.get_batch_sharding(
+      mesh)  # Partition along batch dimension
+
+  # Create the sharding rules for each argument.
+  arg_shardings = (
+      # workload is static
+      # opt_update_fn is static
+      replicated,  # model_state
+      replicated,  # optimizer_state
+      replicated,  # current_param_container
+      sharded,  # batch
+      replicated,  # rng
+      replicated,  # grad_clip
+      replicated  # label_smoothing
+  )
+  out_shardings = (
+      replicated,  # new_optimizer_state
+      replicated,  # updated_params
+      replicated,  # new_model_state
+      replicated,  # loss
+      replicated  # grad_norm
+  )
+  # Jit the train step with these shardings.
+  jitted_train_step = jax.jit(
+      train_step,
+      static_argnums=(0, 1),
+      donate_argnums=(2, 3, 4),
+      in_shardings=arg_shardings,
+      out_shardings=out_shardings)
+
+  new_optimizer_state, new_params, new_model_state, loss, grad_norm = jitted_train_step(workload,
                                opt_update_fn,
                                model_state,
                                optimizer_state,
                                current_param_container,
                                batch,
-                               per_device_rngs,
+                               rng,
                                grad_clip,
                                label_smoothing)
-  new_optimizer_state, new_params, new_model_state, loss, grad_norm = outputs
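The call site now leans on `jax_sharding_utils` for the mesh and the two shardings. As a rough sketch of what such helpers typically look like when built on `jax.sharding` (the actual utility module may differ in names and details):

```python
import numpy as np
import jax
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P


def get_mesh():
  # 1-D mesh over all visible devices with a single 'batch' axis.
  return Mesh(np.array(jax.devices()), axis_names=('batch',))


def get_replicated_sharding(mesh):
  # Empty PartitionSpec: every device holds a full copy of the array.
  return NamedSharding(mesh, P())


def get_batch_sharding(mesh):
  # Split the leading (batch) dimension across the 'batch' mesh axis.
  return NamedSharding(mesh, P('batch'))
```

With these shardings, `jit` both places the inputs and constrains the outputs, so the update returns a single replicated pytree rather than the per-device stacks `pmap` produced.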
@@ -296,8 +321,8 @@ def update_params(
   if global_step % 100 == 0 and workload.metrics_logger is not None:
     workload.metrics_logger.append_scalar_metrics(
         {
-            'loss': loss[0],
-            'grad_norm': grad_norm[0],
+            'loss': loss.item(),
+            'grad_norm': grad_norm.item()
         }, global_step)
   return (new_optimizer_state, opt_update_fn), new_params, new_model_state
 
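The logging change follows from the output layout: `pmap` returned per-device stacks, so `loss[0]` picked one replica, while the jitted step returns a single replicated scalar array and `.item()` converts it to a Python float for the metrics logger. Illustrative only:

```python
import jax.numpy as jnp

loss = jnp.asarray(0.25)   # 0-d array, as a jitted step would return
print(loss.item())         # 0.25 as a plain Python float
print(type(loss.item()))   # <class 'float'>
```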