Ensure that the suggested COO buffer size does not exceed the theoretical max implied by the max_ids_per_partition limits.

Google-ML-Automation · Google-ML-Automation · commit eb2078991649 · 2025-10-07T07:09:02.000-07:00
PiperOrigin-RevId: 816192724
diff --git a/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing.cc b/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing.cc
@@ -134,7 +134,8 @@ struct TableState {
         stacked_table_metadata(metadata),
         coo_buffer_size_per_device(
             ComputeCooBufferSizePerDevice(num_scs, options.num_sc_per_device,
-                                          metadata, options.batch_number)),
+                                          metadata, options.batch_number,
+                                          options.enable_minibatching)),
         csr_arrays_per_host(options.local_device_count,
                             row_pointers_size_per_bucket *
                                 (options.enable_minibatching
diff --git a/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_test.cc b/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_test.cc
@@ -713,11 +713,9 @@ TEST_F(MinibatchingCountTest, SingleHostMinibatchCountIsCorrectWhenRequired) {
   // Also increase buffer size.
   stacked_table_metadata_[0].max_ids_per_partition = 5;
   stacked_table_metadata_[0].max_unique_ids_per_partition = 2;
-  stacked_table_metadata_[0].suggested_coo_buffer_size_per_device = 2048;
 
   stacked_table_metadata_[1].max_ids_per_partition = 5;
   stacked_table_metadata_[1].max_unique_ids_per_partition = 6;
-  stacked_table_metadata_[1].suggested_coo_buffer_size_per_device = 2048;
 
   auto input_batches =
       CreateInputBatches(/*max_ids_per_partitions=*/{10, 20},
@@ -799,8 +797,6 @@ TEST_F(MinibatchingCountTest, MultiHostMinibatchCountIsCorrectWhenRequired) {
   absl::Mutex mutex;
   std::vector<int> minibatches_per_host(kHosts, -1);
 
-  stacked_table_metadata_[0].suggested_coo_buffer_size_per_device = 8192;
-  stacked_table_metadata_[1].suggested_coo_buffer_size_per_device = 8192;
   absl::flat_hash_map<std::string, std::vector<StackedTableMetadata>>
       stacked_tables({{"table_0", stacked_table_metadata_}});
 
@@ -850,8 +846,6 @@ TEST_F(MinibatchingCountTest, MultiHostMinibatchCountIsCorrectWhenOneRequires) {
   absl::Mutex mutex;
   std::vector<int> minibatches_per_host(kHosts, -1);
 
-  stacked_table_metadata_[0].suggested_coo_buffer_size_per_device = 8192;
-  stacked_table_metadata_[1].suggested_coo_buffer_size_per_device = 8192;
   absl::flat_hash_map<std::string, std::vector<StackedTableMetadata>>
       stacked_tables({{"table_0", stacked_table_metadata_}});
 
diff --git a/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_util.cc b/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_util.cc
@@ -210,10 +210,29 @@ RowCombiner GetRowCombiner(absl::string_view combiner) {
   return RowCombiner::kSum;
 }
 
+int64_t MayBeUpdateBufferSize(
+    int64_t theoretical_max,
+    std::optional<int64_t> suggested_coo_buffer_size_per_device,
+    int num_scs_per_device, absl::string_view stacked_table_name) {
+  // Precondition: the suggestion holds a value. It is rounded up to a multiple
+  // of ALIGNMENT * num_scs_per_device so that each local SC's share of the
+  // per-device buffer stays aligned; exceeding theoretical_max is fatal (CHECK).
+  int64_t suggested_value = RoundUpTo<int64_t>(
+      suggested_coo_buffer_size_per_device.value(),
+      TPU_VECTOR_REGISTER_ALIGMENT_SIZE * num_scs_per_device);
+  CHECK(suggested_value <= theoretical_max)
+      << "Suggested Coo Buffer Size is larger than the theoretical "
+         "max for table "
+      << stacked_table_name << ": " << suggested_value << " vs "
+      << theoretical_max
+      << ". Adjust the suggested size or the max_ids_per_partition values.";
+  return suggested_value;
+}
+
 int ComputeCooBufferSizePerDevice(
     const int num_scs, const int num_scs_per_device,
     absl::Span<const StackedTableMetadata> stacked_table_metadata,
-    const int batch_number) {
+    const int batch_number, bool use_minibatching) {
   const int max_ids_per_partition =
       MaxIdsPerPartitionForStackedTables(stacked_table_metadata);
   const std::optional<int> suggested_coo_buffer_size_per_device =
@@ -222,7 +241,8 @@ int ComputeCooBufferSizePerDevice(
   const int64_t max_ids_rounded_up = RoundUpTo<int64_t>(
       max_ids_per_partition, TPU_VECTOR_REGISTER_ALIGMENT_SIZE);
   const int64_t theoretical_max =
-      max_ids_rounded_up * num_scs_per_device * num_scs;
+      max_ids_rounded_up * num_scs_per_device * num_scs *
+      (use_minibatching ? CooFormat::kMaxMinibatchingBuckets : 1);
   const std::string& stacked_table_name = stacked_table_metadata[0].name;
   LOG_IF(INFO, batch_number % 100 == 0)
       << "Theoretical Max for table " << stacked_table_name << ": "
@@ -234,12 +254,9 @@ int ComputeCooBufferSizePerDevice(
     LOG_IF(INFO, batch_number % 100 == 0)
         << "Suggested Coo Buffer Size for table " << stacked_table_name << ": "
         << suggested_coo_buffer_size_per_device.value();
-    // Since the suggested size corresponds to only current device (local SCs),
-    // Buffer for each SC should be properly aligned, hence ALIGNMENT *
-    // num_scs_per_device
-    result = RoundUpTo<int64_t>(
-        suggested_coo_buffer_size_per_device.value(),
-        TPU_VECTOR_REGISTER_ALIGMENT_SIZE * num_scs_per_device);
+    result = MayBeUpdateBufferSize(
+        theoretical_max, suggested_coo_buffer_size_per_device,
+        num_scs_per_device, stacked_table_name);
   } else {
     LOG_IF(WARNING, batch_number % 100 == 0)
         << "No Coo Buffer Size provided for table " << stacked_table_name
diff --git a/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_util.h b/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_util.h
@@ -293,7 +293,7 @@ struct StackedTableMetadata {
 int ComputeCooBufferSizePerDevice(
     int num_scs, int num_scs_per_device,
     absl::Span<const StackedTableMetadata> stacked_table_metadata,
-    int batch_number = 0);
+    int batch_number = 0, bool use_minibatching = false);
 
 int MaxIdsPerPartitionForStackedTables(
     absl::Span<const StackedTableMetadata> stacked_table_metadata);
diff --git a/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_util_test.cc b/jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_util_test.cc
@@ -139,6 +139,12 @@ TEST(InputPreprocessingUtilTest, ComputeCooBufferSize) {
                                           /*num_scs_per_device=*/4,
                                           stacked_table_metadata),
             96);
+  stacked_table_metadata[0].suggested_coo_buffer_size_per_device = 1024;
+  // The theoretical max is 16 * 4 * 4 = 256. This is less than the suggestion.
+  EXPECT_DEATH(ComputeCooBufferSizePerDevice(/*num_scs=*/4,
+                                             /*num_scs_per_device=*/4,
+                                             stacked_table_metadata),
+               ".*Check failed: suggested_value <= theoretical_max.*");
 }
 
 TEST(SortAndGroupTest, Base) {
diff --git a/jax_tpu_embedding/sparsecore/lib/nn/tests/minibatching_test.py b/jax_tpu_embedding/sparsecore/lib/nn/tests/minibatching_test.py
@@ -137,7 +137,6 @@ def setUp(self):
         optimizer=embedding_spec.SGDOptimizerSpec(),
         combiner="sum",
         name="table_a",
-        suggested_coo_buffer_size_per_device=8192,
     )
     self.feature_spec = embedding_spec.FeatureSpec(
         table_spec=self.table_spec,
@@ -404,7 +403,6 @@ def setUp(self):
         optimizer=embedding_spec.SGDOptimizerSpec(),
         combiner="sum",
         name="table_a",
-        suggested_coo_buffer_size_per_device=16384,
     )
     self.feature_spec = embedding_spec.FeatureSpec(
         table_spec=self.table_spec,
diff --git a/jax_tpu_embedding/sparsecore/lib/nn/tests/preprocess_input_benchmarks.py b/jax_tpu_embedding/sparsecore/lib/nn/tests/preprocess_input_benchmarks.py
@@ -81,6 +81,7 @@ def generate_feature_specs(num_features: int, num_samples: int):
             total_sample_count=num_samples,
             max_ids_per_partition=1024,
             max_unique_ids_per_partition=1024,
+            suggested_coo_buffer_size_per_device=4096,
         ),
     )
     feature_spec = embedding_spec.FeatureSpec(

Original file line number	Diff line number	Diff line change
`@@ -81,6 +81,7 @@ def generate_feature_specs(num_features: int, num_samples: int):`
`81`	`81`	`total_sample_count=num_samples,`
`82`	`82`	`max_ids_per_partition=1024,`
`83`	`83`	`max_unique_ids_per_partition=1024,`
	`84`	`+ suggested_coo_buffer_size_per_device=4096,`
`84`	`85`	`),`
`85`	`86`	`)`
`86`	`87`	`feature_spec = embedding_spec.FeatureSpec(`