
Commit c208cc7

sharding deepspeech
1 parent f1db3d3 · commit c208cc7

4 files changed (+29 / −16 lines)


algoperf/workloads/librispeech_deepspeech/librispeech_jax/models.py

Lines changed: 9 additions & 9 deletions

@@ -397,15 +397,15 @@ def __call__(
     seq_lengths_np = np.shape(seq_lengths)

     n = jax.devices()
-    logging.info(f"jax num devices {n}")
-    logging.info(f'inputs shape {inputs_shape}')
-    logging.info(f'h_0 shape {h_0_shape}')
-    logging.info(f'c_0 shape {c_0_shape}')
-    logging.info(f'seq_lengths shape {seq_lengths_np}')
-    logging.info(f'weights_shape {weights_shape}')
-    logging.info(f'input_size {input_size}')
-    logging.info(f'hidden_size {self.features}')
-    logging.info(f'num_layers {self.num_layers}')
+    # logging.info(f"jax num devices {n}")
+    # logging.info(f'inputs shape {inputs_shape}')
+    # logging.info(f'h_0 shape {h_0_shape}')
+    # logging.info(f'c_0 shape {c_0_shape}')
+    # logging.info(f'seq_lengths shape {seq_lengths_np}')
+    # logging.info(f'weights_shape {weights_shape}')
+    # logging.info(f'input_size {input_size}')
+    # logging.info(f'hidden_size {self.features}')
+    # logging.info(f'num_layers {self.num_layers}')

     y, h, c = rnn.lstm(
         x=inputs, h_0=h_0, c_0=c_0, weights=weights,

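The silenced block is per-call debug logging of tensor shapes around the rnn.lstm call. Commenting the lines out works, but a flag-gated variant keeps them toggleable; a minimal sketch, assuming a hypothetical module-level _DEBUG_SHAPES flag that is not part of this commit:

from absl import logging
import jax

_DEBUG_SHAPES = False  # hypothetical toggle, not in the commit

def maybe_log_shapes(inputs, weights):
  # Mirrors the commented-out logging above; runs only when the flag is set.
  if _DEBUG_SHAPES:
    logging.info('jax num devices %d', len(jax.devices()))
    logging.info('inputs shape %s', inputs.shape)
    logging.info('weights shape %s', weights.shape)
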
algoperf/workloads/librispeech_deepspeech/librispeech_jax/workload.py

Lines changed: 17 additions & 0 deletions

@@ -4,6 +4,8 @@
 from flax import jax_utils
 import jax
 import jax.numpy as jnp
+from jax.experimental.shard_map import shard_map
+from jax.sharding import PartitionSpec as P
 import numpy as np

 from algoperf import param_utils
@@ -66,6 +68,21 @@ def model_fn(
       update_batch_norm: bool,
       use_running_average_bn: Optional[bool] = None
   ) -> Tuple[spec.Tensor, spec.ModelAuxiliaryState]:
+
+    model_fn_sharded = shard_map(model_fn_ref,
+                                 self.mesh,
+                                 )
+
+    def model_fn_ref(
+        self,
+        params: spec.ParameterContainer,
+        augmented_and_preprocessed_input_batch: Dict[str, spec.Tensor],
+        model_state: spec.ModelAuxiliaryState,
+        mode: spec.ForwardPassMode,
+        rng: spec.RandomState,
+        update_batch_norm: bool,
+        use_running_average_bn: Optional[bool] = None
+    ) -> Tuple[spec.Tensor, spec.ModelAuxiliaryState]:
     variables = {'params': params, **model_state}
     inputs, input_paddings = augmented_and_preprocessed_input_batch['inputs']
     is_train_mode = mode == spec.ForwardPassMode.TRAIN

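As committed, the shard_map wiring is still a work in progress: shard_map requires in_specs and out_specs arguments that the call omits, model_fn_ref is referenced before the nested def that creates it, and the sharded function is never invoked. A standalone sketch of the intended data-parallel pattern, using hypothetical mesh/forward/params/inputs names that do not appear in the commit:

import jax
import jax.numpy as jnp
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

# 1-D mesh over all local devices; 'batch' is the data-parallel axis.
mesh = Mesh(jax.devices(), axis_names=('batch',))

def forward(params, inputs):
  # Runs once per device, on that device's shard of the batch.
  return jnp.tanh(inputs @ params)

forward_sharded = shard_map(
    forward,
    mesh=mesh,
    in_specs=(P(), P('batch')),  # params replicated, inputs split on batch
    out_specs=P('batch'))        # outputs stay split along the batch axis

params = jnp.ones((16, 8))
inputs = jnp.ones((jax.device_count() * 4, 16))
logits = forward_sharded(params, inputs)  # (device_count * 4, 8), sharded
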
pyproject.toml

Lines changed: 2 additions & 6 deletions

@@ -106,15 +106,11 @@ jax_core_deps = [
   "protobuf==4.25.5",
 ]
 jax_cpu = [
-  "jax==0.4.28",
-  "jaxlib==0.4.28",
+  "jax",
   "algoperf[jax_core_deps]",
 ]
 jax_gpu = [
-  "jax==0.4.28",
-  "jaxlib==0.4.28",
-  "jax-cuda12-plugin[with_cuda]==0.4.28",
-  "jax-cuda12-pjrt==0.4.28",
+  "jax[cuda12]",
   "algoperf[jax_core_deps]",
 ]
 pytorch_cpu = ["torch==2.5.1", "torchvision==0.20.1"]

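Dropping the ==0.4.28 pins means pip now resolves the latest compatible jax, and the jax[cuda12] extra pulls in the CUDA 12 plugin and PJRT wheels that the removed jax-cuda12-plugin/jax-cuda12-pjrt entries listed explicitly. A quick post-install sanity check, assuming a CUDA 12 machine for the jax_gpu extra:

import jax

print(jax.__version__)  # whichever version pip resolved; no longer 0.4.28
print(jax.devices())    # jax_gpu installs should list CudaDevice entries
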
reference_algorithms/paper_baselines/adamw/jax/submission.py

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ def _loss_fn(params):
         model_state,
         spec.ForwardPassMode.TRAIN,
         rng,
-        update_batch_norm=True)
+        update_batch_norm=True,)
     loss_dict = workload.loss_fn(
         label_batch=batch['targets'],
         logits_batch=logits,
