     merge_padding_and_attention_mask,
 )
 from keras_hub.src.models.smollm3.smollm3_utils import rope_init
-from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
+from keras_hub.src.models.smollm3.smollm3_utils import apply_rotary_pos_emb
 import math
 
 
@@ -39,6 +39,9 @@ def __init__(
         rope_layer_enabled_list: list[bool],
         layer_types: list[str],
         layer_idx: int,
+        max_position_embeddings: int = 2048,
+        rope_theta: float = 10000.0,
+        partial_rotary_factor: float = 1.0,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -50,19 +53,17 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.rope_layer_enabled_list = rope_layer_enabled_list
         self.layer_types = layer_types
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.partial_rotary_factor = partial_rotary_factor
+
         self._dot_product_equation = "bquh,bkuh->buqk"
         self._combine_equation = "buqk,bkuh->bquh"
 
         self.head_dim = hidden_size // self.num_attention_heads
         self._inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
 
-        self.rotary_embedding = RotaryEmbedding(
-            max_wavelength=5000000.0,
-        )
-
         self.layer_idx = layer_idx
-
-        self.head_dim = self.hidden_size // self.num_attention_heads
         self.num_key_value_groups = (
             self.num_attention_heads // self.num_key_value_heads
         )
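
For context on the `num_key_value_groups` value computed at the end of this hunk: a minimal sketch with illustrative head counts (not values taken from this PR) of the grouped-query attention bookkeeping it supports.

```python
# Illustrative numbers only: 16 query heads share 4 key/value heads,
# so each KV head serves a group of 4 query heads.
num_attention_heads = 16
num_key_value_heads = 4
num_key_value_groups = num_attention_heads // num_key_value_heads  # 4

# Later in call(), K/V of shape (batch, seq, 4, head_dim) are tiled with
# ops.repeat(..., repeats=num_key_value_groups, axis=2) to
# (batch, seq, 16, head_dim) so they line up with the query heads.
```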
@@ -97,6 +98,15 @@ def __init__(
             else True
         )  # Default to True if index out of bounds
 
+        self.rotary_embedding = SmolLM3RotaryEmbedding(
+            hidden_size=self.hidden_size,
+            num_attention_heads=self.num_attention_heads,
+            max_position_embeddings=self.max_position_embeddings,
+            rope_theta=self.rope_theta,
+            partial_rotary_factor=self.partial_rotary_factor,
+            name="rotary_emb",
+        )
+
         self._softmax = layers.Softmax(
             axis=-1,
             dtype="float32",
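
The `rope_theta` and `partial_rotary_factor` values wired through here feed the inverse-frequency table. The `rope_init` helper imported at the top is not shown in this diff, so the sketch below is an assumption of the standard formulation (hypothetical function name and return value), included to show what the two parameters control.

```python
import numpy as np

def sketch_rope_init(rope_theta, partial_rotary_factor, head_dim):
    # Only a fraction of each head's dimensions is rotated when
    # partial_rotary_factor < 1.0.
    rotary_dim = int(head_dim * partial_rotary_factor)
    # Standard RoPE inverse frequencies: 1 / theta^(2i / rotary_dim).
    inv_freq = 1.0 / (
        rope_theta ** (np.arange(0, rotary_dim, 2, dtype="float32") / rotary_dim)
    )
    return inv_freq

inv_freq = sketch_rope_init(rope_theta=10000.0, partial_rotary_factor=1.0, head_dim=64)
# inv_freq has shape (rotary_dim // 2,) == (32,)
```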
@@ -172,7 +182,15 @@ def _compute_kv_values(x_input):
                 value = value_cache
             else:
                 key_update, value_update = _compute_kv_values(hidden_states)
-                start = [0, self_attention_cache_update_index, 0, 0]
+
+                # Apply RoPE to key_update BEFORE caching
+                if self.use_rope:
+                    cos, sin = self.rotary_embedding(query, start_index=start_index)
+                    query_rope, key_update = apply_rotary_pos_emb(query, key_update, cos, sin, expansion_axis=2)
+                    query = query_rope
+
+                start = (0, self_attention_cache_update_index, 0, 0)
+
                 key = ops.slice_update(key_cache, start, key_update)
                 value = ops.slice_update(
                     value_cache, start, value_update
@@ -189,14 +207,13 @@ def _compute_kv_values(x_input):
                 )
             key, value = _compute_kv_values(hidden_states)
 
-        if self.use_rope:
-            query = self.rotary_embedding(query, start_index=start_index)
-            key = self.rotary_embedding(key, start_index=start_index)
+            # Apply RoPE when not using cache
+            if self.use_rope:
+                cos, sin = self.rotary_embedding(query, start_index=start_index)
+                query, key = apply_rotary_pos_emb(query, key, cos, sin, expansion_axis=2)
 
-        print('pre', key.shape, value.shape)
         key = ops.repeat(key, repeats=self.num_key_value_groups, axis=2)
         value = ops.repeat(value, repeats=self.num_key_value_groups, axis=2)
-        print('post', key.shape, value.shape)
 
         attn_output = self._compute_attention(
             query,
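
The `apply_rotary_pos_emb` helper used in both branches comes from `smollm3_utils` and is not part of this diff. A minimal sketch of the usual rotate-half RoPE application it presumably implements, where `expansion_axis=2` inserts the broadcast axis for the heads dimension (query/key here are laid out as `(batch, seq, heads, head_dim)`, per the `"bquh,bkuh->buqk"` einsum):

```python
from keras import ops

def rotate_half(x):
    # Split the last dim in half and swap the halves with a sign flip.
    x1, x2 = ops.split(x, 2, axis=-1)
    return ops.concatenate((-x2, x1), axis=-1)

def apply_rotary_pos_emb_sketch(q, k, cos, sin, expansion_axis=2):
    # cos/sin: (batch, seq, head_dim) -> (batch, seq, 1, head_dim) so they
    # broadcast across the heads axis of q/k: (batch, seq, heads, head_dim).
    cos = ops.expand_dims(cos, axis=expansion_axis)
    sin = ops.expand_dims(sin, axis=expansion_axis)
    q_rot = q * cos + rotate_half(q) * sin
    k_rot = k * cos + rotate_half(k) * sin
    return q_rot, k_rot
```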
@@ -400,6 +417,9 @@ def __init__(
         intermediate_size: int,
         mlp_bias: bool,
         layer_norm_epsilon: float,
+        max_position_embeddings: int = 2048,
+        rope_theta: float = 10000.0,
+        partial_rotary_factor: float = 1.0,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -415,6 +435,9 @@ def __init__(
             rope_layer_enabled_list=rope_layer_enabled_list,
             layer_types=layer_types,
             layer_idx=layer_idx,
+            max_position_embeddings=max_position_embeddings,
+            rope_theta=rope_theta,
+            partial_rotary_factor=partial_rotary_factor,
             name="self_attn",
         )
 
@@ -641,26 +664,34 @@ def call(
                 Shape can vary, but the last dimension is head_dim.
             position_ids: Tensor of position IDs of shape (batch_size, seq_len).
         """
-        inv_freq_expanded = ops.expand_dims(
-            ops.expand_dims(self.inv_freq, axis=0), axis=-1
-        )
-
         batch_size = ops.shape(x)[0]
         seq_len = ops.shape(x)[1]
         positions = ops.arange(seq_len, dtype="float32")
         positions = positions + ops.cast(start_index, dtype="float32")
 
+        # inv_freq: (inv_freq_dim,) -> (1, inv_freq_dim, 1) -> (batch, inv_freq_dim, 1)
+        inv_freq_expanded = ops.expand_dims(
+            ops.expand_dims(self.inv_freq, axis=0), axis=-1
+        )
         inv_freq_expanded = ops.broadcast_to(
             inv_freq_expanded, (batch_size, ops.shape(self.inv_freq)[0], 1)
         )
 
-        position_ids_expanded = ops.expand_dims(positions, axis=1).T
+        # positions: (seq_len,) -> (1, 1, seq_len) -> (batch, 1, seq_len)
+        position_ids_expanded = ops.expand_dims(
+            ops.expand_dims(positions, axis=0), axis=0
+        )
+        position_ids_expanded = ops.broadcast_to(
+            position_ids_expanded, (batch_size, 1, seq_len)
+        )
 
+        # matmul: (batch, inv_freq_dim, 1) @ (batch, 1, seq_len) -> (batch, inv_freq_dim, seq_len)
         freqs = ops.matmul(
             ops.cast(inv_freq_expanded, "float32"),
             ops.cast(position_ids_expanded, "float32"),
         )
 
+        # transpose: (batch, inv_freq_dim, seq_len) -> (batch, seq_len, inv_freq_dim)
         freqs = ops.transpose(freqs, axes=(0, 2, 1))
 
         emb = ops.concatenate((freqs, freqs), axis=-1)
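
The hunk ends before the function returns; judging from how the attention layer consumes it (`cos, sin = self.rotary_embedding(query, start_index=start_index)`), the concatenated frequencies are presumably turned into a cosine/sine pair. A shape-tracing sketch under that assumption, with illustrative sizes only:

```python
from keras import ops

batch_size, seq_len, inv_freq_dim = 2, 8, 32   # illustrative sizes, not from the PR

freqs = ops.zeros((batch_size, seq_len, inv_freq_dim))  # after the transpose above
emb = ops.concatenate((freqs, freqs), axis=-1)          # (2, 8, 64) == head_dim when
                                                        # partial_rotary_factor == 1.0
cos, sin = ops.cos(emb), ops.sin(emb)                   # the pair the attention layer unpacks
```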