@@ -85,22 +85,18 @@ def call_with_cache(
         logits = self.backbone.token_embedding(x, reverse=True)
         return logits, hidden_states, cache

-    def _build_cache(self, token_ids, cache_max_length):
-        """Build an empty cache for use with `call_with_cache()`.
-
-        Args:
-            token_ids: Prompt tokens to seed the cache with
-            cache_max_length: Maximum length for the cache (should be max generation length)
-        """
+    def _build_cache(self, token_ids):
+        """Build an empty cache for use with `call_with_cache()`."""
         batch_size = ops.shape(token_ids)[0]
+        max_length = ops.shape(token_ids)[1]
         num_layers = self.backbone.num_layers
         num_key_value_heads = self.backbone.num_key_value_heads
         head_dim = self.backbone.hidden_dim // self.backbone.num_attention_heads
         shape = [
             batch_size,
             num_layers,
             2,
-            cache_max_length,
+            max_length,
             num_key_value_heads,
             head_dim,
         ]
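# A minimal sketch (not the file's exact code) of the cache the reworked
# _build_cache() allocates, assuming `ops` refers to `keras.ops` and using
# made-up example values for the backbone configuration. The cache length now
# comes from the padded token_ids shape instead of a separate cache_max_length
# argument.
from keras import ops

batch_size, padded_len = 2, 16           # token_ids would have shape (2, 16)
num_layers, num_key_value_heads = 4, 8   # hypothetical backbone settings
hidden_dim, num_attention_heads = 64, 8
head_dim = hidden_dim // num_attention_heads

cache = ops.zeros(
    [batch_size, num_layers, 2, padded_len, num_key_value_heads, head_dim],
    dtype="float32",
)
# Axis 2 holds the key/value pair for each layer; axis 3 is the padded
# sequence length, which now also serves as the maximum generation length.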
@@ -130,17 +126,12 @@ def generate_step(
         """
         token_ids, padding_mask = inputs["token_ids"], inputs["padding_mask"]

+        hidden_states, cache = self._build_cache(token_ids)
         # Compute the lengths of all user inputted tokens ids.
         row_lengths = ops.sum(ops.cast(padding_mask, "int32"), axis=-1)
         # Start at the first index that has no user inputted id.
         index = ops.min(row_lengths)

-        # Only pass actual prompt tokens to _build_cache, not padding
-        # But cache must be sized for the full max_length
-        max_length = ops.shape(token_ids)[1]
-        prompt_token_ids = token_ids[:, :index]
-        hidden_states, cache = self._build_cache(prompt_token_ids, max_length)
-
         def next(prompt, cache, index):
             # The cache index is the index of our previous token.
             cache_update_index = index - 1
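# A hedged sketch of the reordered seeding logic in generate_step(): the cache
# is now built from the full padded token_ids before `index` is computed, so
# no prompt slicing or separate max length is needed. Names mirror the diff;
# `generate_prefix` is a hypothetical helper used here purely for illustration.
def generate_prefix(self, inputs):
    token_ids, padding_mask = inputs["token_ids"], inputs["padding_mask"]
    # Seed the full-length cache up front from the padded prompt.
    hidden_states, cache = self._build_cache(token_ids)
    # Length of each prompt, ignoring padding.
    row_lengths = ops.sum(ops.cast(padding_mask, "int32"), axis=-1)
    # Start generating at the first index past the shortest prompt.
    index = ops.min(row_lengths)
    return hidden_states, cache, index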