
Commit 084412a

mamtsing authored and ochougul committed

Enable CB for GptOssModel

Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: Onkar Chougule <[email protected]>

1 parent 56a616e

File tree (3 files changed: +39 -20 lines)

QEfficient/transformers/cache_utils.py
QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
QEfficient/utils/generate_inputs.py

QEfficient/transformers/cache_utils.py

Lines changed: 23 additions & 6 deletions

@@ -609,16 +609,28 @@ def update(
         position_ids = cache_kwargs.get("position_ids")
         is_sliding_layer = cache_kwargs.get("is_sliding")
         sliding_window = cache_kwargs.get("sliding_window")
+        batch_index = cache_kwargs.get("batch_index", None)  # Check and fetch batch index value from the kwargs

         if is_sliding_layer:
             kv_position_ids = torch.where(position_ids == -1, position_ids, position_ids % sliding_window)
         else:
             kv_position_ids = position_ids

-        self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], kv_position_ids, key_states)
-        self.value_cache[layer_idx] = CtxScatterFunc.apply(
-            self.value_cache[layer_idx], kv_position_ids, value_states
-        )
+        if batch_index is not None:
+            invalid_scatter_index = torch.iinfo(torch.int32).max
+            scatter_position_ids = torch.where(kv_position_ids < 0, invalid_scatter_index, kv_position_ids)
+            self.key_cache[layer_idx] = CtxScatterFuncCB.apply(
+                self.key_cache[layer_idx], batch_index, scatter_position_ids, key_states
+            )
+            self.value_cache[layer_idx] = CtxScatterFuncCB.apply(
+                self.value_cache[layer_idx], batch_index, scatter_position_ids, value_states
+            )
+        else:
+            self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], kv_position_ids, key_states)
+            self.value_cache[layer_idx] = CtxScatterFunc.apply(
+                self.value_cache[layer_idx], kv_position_ids, value_states
+            )
+
         k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]

         # Original Gather
@@ -632,7 +644,12 @@
             invalid_idx_value = 0
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)

-        k_out = CtxGatherFunc.apply(k_out, ctx_indices)
-        v_out = CtxGatherFunc.apply(v_out, ctx_indices)
+        if batch_index is not None:
+            k_out = CtxGatherFuncCB.apply(k_out, batch_index, ctx_indices)
+            v_out = CtxGatherFuncCB.apply(v_out, batch_index, ctx_indices)
+        else:
+            k_out = CtxGatherFunc.apply(k_out, ctx_indices)
+            v_out = CtxGatherFunc.apply(v_out, ctx_indices)
+
         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)
         return k_out, v_out
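For readers unfamiliar with the CB (continuous batching) path: the KV cache is allocated for full_batch_size requests, a forward pass may carry only a subset of them, and batch_index tells the scatter/gather ops which cache rows those requests own. Below is a minimal PyTorch sketch of the semantics the CtxScatterFuncCB/CtxGatherFuncCB calls above provide; the helper names, shapes, and the eager masking of invalid positions are illustrative assumptions, not the repo's actual autograd functions (which instead rely on the int32-max sentinel so the compiled path drops out-of-range writes).

import torch

def scatter_cb(cache, batch_index, position_ids, new_states):
    """Illustrative stand-in for CtxScatterFuncCB.apply (assumed shapes).

    cache:        [full_batch_size, n_heads, ctx_len, head_dim]
    batch_index:  [bs, 1]  -- cache row owned by each active request
    position_ids: [bs, seq_len], with negative values marking padding
    new_states:   [bs, n_heads, seq_len, head_dim]
    """
    for b in range(new_states.shape[0]):
        row = int(batch_index[b, 0])
        valid = position_ids[b] >= 0          # eager analogue of the int32-max sentinel trick
        cache[row][:, position_ids[b, valid]] = new_states[b][:, valid]
    return cache

def gather_cb(cache, batch_index, ctx_indices):
    """Illustrative stand-in for CtxGatherFuncCB.apply (assumed shapes).

    ctx_indices: [bs, n_idx] positions to read back for each active request.
    """
    rows = cache[batch_index[:, 0]]           # [bs, n_heads, ctx_len, head_dim]
    idx = ctx_indices[:, None, :, None].expand(-1, rows.shape[1], -1, rows.shape[-1])
    return torch.gather(rows, 2, idx)         # [bs, n_heads, n_idx, head_dim]

# Request #2 of a full batch of 4 writes 3 valid tokens, then reads them back.
cache = torch.zeros(4, 2, 8, 16)
batch_index = torch.tensor([[2]])
pos = torch.tensor([[0, 1, 2, -1]])           # last slot is padding
cache = scatter_cb(cache, batch_index, pos, torch.randn(1, 2, 4, 16))
k = gather_cb(cache, batch_index, torch.tensor([[0, 1, 2]]))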

QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py

Lines changed: 1 addition & 3 deletions

@@ -428,9 +428,6 @@ def forward(
         key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

-        # kv_seq_len = key_states.shape[-2]
-
-        # kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=32 * 1024)
         query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

@@ -508,6 +505,7 @@ def forward(
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states, _ = self.mlp(hidden_states)  # diff with llama: router scores
         # alth, _ = self.mlp.alt_forward(hidden_states)
+        hidden_states = hidden_states.reshape(residual.shape)
         hidden_states = residual + hidden_states
         outputs = (hidden_states,)
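The one-line addition in the second hunk guards the residual add: if the routed-experts MLP returns token-flattened output while residual is still (batch, seq, hidden), the add would fail to broadcast. A minimal sketch of that failure mode, with made-up shapes (that the MLP flattens tokens here is an assumption inferred from the fix):

import torch

residual = torch.randn(2, 5, 64)   # (batch, seq, hidden)
mlp_out = torch.randn(2 * 5, 64)   # assumed token-flattened MoE output: (batch * seq, hidden)

# residual + mlp_out would raise: (2, 5, 64) does not broadcast with (10, 64)
hidden_states = mlp_out.reshape(residual.shape)   # the added line restores (batch, seq, hidden)
out = residual + hidden_states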

QEfficient/utils/generate_inputs.py

Lines changed: 15 additions & 11 deletions

@@ -87,8 +87,8 @@ def prepare_pytorch_inputs(self):

         if self.full_batch_size:
             inputs["input_ids"] = input_ids
-            inputs["position_ids"] = torch.arange(input_len).view(1, input_len)
-            inputs["batch_index"] = torch.arange(1).view(-1, 1)
+            inputs["position_ids"] = position_ids
+            inputs["batch_index"] = torch.arange(self.full_batch_size).view(-1, 1)

         past_key_values = []
         sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]]
@@ -117,18 +117,15 @@ def update_pytorch_inputs(self, inputs, pt_outputs):
         """
         updated_inputs = {}
         if self.full_batch_size:
-            batch_index = torch.arange(1).view(-1, 1)
-
             input_ids = pt_outputs.logits.detach().argmax(2)
             updated_inputs["input_ids"] = torch.full((self.full_batch_size, 1), self.tokenizer.pad_token_id)
-            updated_inputs["input_ids"][batch_index.view(-1)] = input_ids
+            updated_inputs["input_ids"][inputs["batch_index"].view(-1)] = input_ids

             position_ids = inputs["position_ids"].max(1, keepdim=True).values + 1
             updated_inputs["position_ids"] = torch.full((self.full_batch_size, 1), 0)
-            updated_inputs["position_ids"][batch_index.view(-1)] = position_ids
-
-            updated_inputs["batch_index"] = torch.arange(self.full_batch_size).view(-1, 1)
+            updated_inputs["position_ids"][inputs["batch_index"].view(-1)] = position_ids

+            updated_inputs["batch_index"] = inputs["batch_index"]
         else:
             updated_inputs["input_ids"] = pt_outputs["logits"].argmax(-1).reshape(-1, 1)
             updated_inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1
@@ -172,9 +169,15 @@ def prepare_ort_inputs(self):
                 inputs["past_key." + str(i)] = np.zeros((cache_shape), dtype=np.float32)
                 inputs["past_value." + str(i)] = np.zeros((cache_shape), dtype=np.float32)
         else:
+            sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]]
             for i in range(self.n_layer):
-                inputs["past_key." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)
-                inputs["past_value." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)
+                pad_shape = (
+                    sliding_padding_shape if self.config.layer_types[i] == "sliding_attention" else self.padding_shape
+                )
+                inputs["past_key." + str(i)] = np.zeros((pad_shape), dtype=np.float32)
+                inputs["past_value." + str(i)] = np.zeros((pad_shape), dtype=np.float32)
+        if self.full_batch_size:
+            inputs["batch_index"] = np.arange(self.full_batch_size).reshape(-1, 1)
         return inputs

     def update_ort_inputs(self, inputs, ort_outputs):
@@ -195,7 +198,8 @@ def update_ort_inputs(self, inputs, ort_outputs):
         for i in range(self.n_layer):
             updated_inputs["past_key." + str(i)] = ort_outputs["past_key_values"][i * 2]
             updated_inputs["past_value." + str(i)] = ort_outputs["past_key_values"][i * 2 + 1]
-
+        if self.full_batch_size:
+            updated_inputs["batch_index"] = inputs["batch_index"]
         return updated_inputs

     def update_ort_outputs(self, ort_outputs):
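Taken together, the generate_inputs.py changes make batch_index a first-class input that is created once at prefill and then carried through each decode step unchanged, instead of being re-derived as torch.arange(1). A condensed sketch of one decode-step update under these changes, assuming CB prefill returns logits only for each request's last position (vocab size and shapes are stand-ins):

import torch

full_batch_size, pad_token_id = 4, 0

# State carried from the previous step (as prepare_pytorch_inputs now builds it).
inputs = {
    "position_ids": torch.tensor([[0, 1, 2, -1]] * full_batch_size),   # -1 marks padding
    "batch_index": torch.arange(full_batch_size).view(-1, 1),
}
logits = torch.randn(full_batch_size, 1, 32)     # stand-in model output: [bs, 1, vocab]

updated = {}
input_ids = logits.detach().argmax(2)            # greedy next token per request: [bs, 1]
updated["input_ids"] = torch.full((full_batch_size, 1), pad_token_id)
updated["input_ids"][inputs["batch_index"].view(-1)] = input_ids

# max() skips the -1 padding, so +1 is each request's next free cache slot.
position_ids = inputs["position_ids"].max(1, keepdim=True).values + 1
updated["position_ids"] = torch.full((full_batch_size, 1), 0)
updated["position_ids"][inputs["batch_index"].view(-1)] = position_ids

updated["batch_index"] = inputs["batch_index"]   # carried through, not re-created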
