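ogbg debugging: restore jraph.batch batching, halve AVG_EDGES_PER_GRAPH, print batch shapes, pin CUDA_VISIBLE_DEVICES to 0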

priyakasimbeg · priyakasimbeg · commit 93ff95876b33 · 2025-03-20T00:06:20.000Z
diff --git a/algoperf/workloads/ogbg/input_pipeline.py b/algoperf/workloads/ogbg/input_pipeline.py
@@ -11,7 +11,7 @@
 import torch
 
 AVG_NODES_PER_GRAPH = 26
-AVG_EDGES_PER_GRAPH = 56
+AVG_EDGES_PER_GRAPH = 28
 
 TFDS_SPLIT_NAME = {
     'train': 'train',
@@ -148,24 +148,24 @@ def _get_batch_iterator(dataset_iter, global_batch_size, num_shards=None):
     weights_shards.append(weights)
 
     if count == num_shards:
-      # yield {
-      #     'inputs': jraph.batch(graphs_shards),
-      #     'targets': np.vstack(labels_shards),
-      #     'weights': np.vstack(weights_shards)
-      # }
-
-      def f(x):
-        return jax.tree.map(lambda *vals: np.concatenate(vals, axis=0), x[0], *x[1:])
-      
-      graphs_shards = f(graphs_shards)
-      labels_shards = f(labels_shards)
-      weights_shards = f(weights_shards)
       yield {
-          'inputs': graphs_shards,
-          'targets': labels_shards,
-          'weights': weights_shards,
+          'inputs': jraph.batch(graphs_shards),
+          'targets': np.vstack(labels_shards),
+          'weights': np.vstack(weights_shards)
       }
 
+      # def f(x):
+      #   return jax.tree.map(lambda *vals: np.concatenate(vals, axis=0), x[0], *x[1:])
+      
+      # graphs_shards = f(graphs_shards)
+      # labels_shards = f(labels_shards)
+      # weights_shards = f(weights_shards)
+      # yield {
+      #     'inputs': graphs_shards,
+      #     'targets': labels_shards,
+      #     'weights': weights_shards,
+      # }
+
       count = 0
       graphs_shards = []
       labels_shards = []
diff --git a/reference_algorithms/paper_baselines/adamw/jax/submission.py b/reference_algorithms/paper_baselines/adamw/jax/submission.py
@@ -74,7 +74,7 @@ def _loss_fn(params):
         model_state,
         spec.ForwardPassMode.TRAIN,
         rng,
-        update_batch_norm=True,)
+        update_batch_norm=True)
     jax.debug.print("logits: {logits}", logits=logits)
     loss_dict = workload.loss_fn(
         label_batch=batch['targets'],
@@ -136,6 +136,10 @@ def update_params(
   else:
     grad_clip = None
 
+  batch_shapes = jax.tree.map(jnp.shape, batch)
+  print("batch shapes:")
+  print(batch_shapes)
+
   # Set up mesh and sharding
   mesh = sharding_utils.get_mesh()
   replicated = NamedSharding(mesh, P())  # No partitioning
diff --git a/submission_runner.py b/submission_runner.py
@@ -32,6 +32,8 @@
 import jax
 import tensorflow as tf
 
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
 # New PRNG implementation for correct sharding
 jax.config.update('jax_default_prng_impl', 'threefry2x32')
 jax.config.update('jax_threefry_partitionable', True)