
Commit 28b62ea: optimize mllama
1 parent: bd677f5

5 files changed: 38 additions, 21 deletions


lmdeploy/pytorch/backends/attention.py

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,7 @@ def __init__(
         alibi: bool = None,
         sliding_window: int = None,
         logit_softcapping: float = None,
+        causal: bool = True,
         **kwargs,
     ) -> None:
         if scale is None:
@@ -53,6 +54,7 @@ def __init__(
         self.alibi = alibi
         self.sliding_window = sliding_window
         self.logit_softcapping = logit_softcapping
+        self.causal = causal
 
     @abstractmethod
     def forward(
@@ -82,6 +84,7 @@ def build(
         alibi: bool = False,
         sliding_window: int = None,
         logical_softcapping: float = None,
+        causal: bool = True,
         **kwargs,
     ) -> AttentionImpl[T]:
         """build."""

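The hunks above only thread a new `causal` flag through the abstract attention interface; the flag selects between decoder-style masked attention and full attention over all keys. A minimal plain-PyTorch illustration of the distinction (using `torch.nn.functional.scaled_dot_product_attention`, not lmdeploy's paged kernels):

```python
import torch
import torch.nn.functional as F

# Toy shapes: (batch, heads, seq_len, head_dim)
q = torch.randn(1, 8, 6, 64)
k = torch.randn(1, 8, 6, 64)
v = torch.randn(1, 8, 6, 64)

# causal=True: each query attends only to keys at earlier or equal positions.
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# causal=False: every query attends to every key, as cross-attention needs.
out_full = F.scaled_dot_product_attention(q, k, v, is_causal=False)

print(out_causal.shape, out_full.shape)  # both torch.Size([1, 8, 6, 64])
```
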
lmdeploy/pytorch/backends/cuda/attention.py

Lines changed: 6 additions & 0 deletions
@@ -41,6 +41,7 @@ def __init__(
         alibi: bool = False,
         sliding_window: int = None,
         logit_softcapping: float = None,
+        causal: bool = True,
         **kwargs,
     ):
         super().__init__(
@@ -52,8 +53,10 @@ def __init__(
             alibi=alibi,
             sliding_window=sliding_window,
             logit_softcapping=logit_softcapping,
+            causal=causal,
             **kwargs,
         )
+        assert not (alibi and not causal)
 
         from lmdeploy.pytorch.kernels.cuda import (alibi_paged_attention_fwd,
                                                    fill_kv_cache,
@@ -169,6 +172,7 @@ def forward(
                 window_size=self.sliding_window,
                 sm_scale=self.scale,
                 logit_softcapping=self.logit_softcapping,
+                causal=self.causal,
             )
         else:
             self.alibi_paged_attention_fwd(
@@ -204,6 +208,7 @@ def build(
         alibi: bool = False,
         sliding_window: int = None,
         logical_softcapping: float = None,
+        causal: bool = True,
         **kwargs,
     ) -> TritonAttentionImpl:
         """build."""
@@ -215,4 +220,5 @@ def build(
             alibi=alibi,
             sliding_window=sliding_window,
             logical_softcapping=logical_softcapping,
+            causal=causal,
             **kwargs)

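The CUDA backend forwards the flag into its Triton kernel call and adds a guard: ALiBi biases assume decoder-style attention, so `alibi=True` combined with `causal=False` is rejected up front. A standalone sketch of that guard (the function name is hypothetical; the check mirrors the assertion added above):

```python
def check_attention_flags(alibi: bool, causal: bool) -> None:
    """Reject flag combinations the backend cannot serve."""
    # Mirrors `assert not (alibi and not causal)`: ALiBi positional biases
    # only make sense for causal (decoder-style) attention.
    if alibi and not causal:
        raise ValueError('ALiBi attention requires causal=True')


check_attention_flags(alibi=False, causal=False)  # ok: plain cross-attention
check_attention_flags(alibi=True, causal=True)    # ok: ALiBi decoder attention
# check_attention_flags(alibi=True, causal=False)  # would raise ValueError
```
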
lmdeploy/pytorch/backends/dlinfer/attention.py

Lines changed: 5 additions & 0 deletions
@@ -30,8 +30,10 @@ def __init__(
         alibi: bool = None,
         sliding_window: int = None,
         logit_softcapping: float = None,
+        causal: bool = True,
         **kwargs,
     ):
+        assert causal
         super().__init__(
             num_heads,
             head_size,
@@ -41,6 +43,7 @@ def __init__(
             alibi,
             sliding_window,
             logit_softcapping,
+            causal=causal,
             **kwargs,
         )
 
@@ -121,6 +124,7 @@ def build(
         alibi_scale: float = None,
         sliding_window: int = None,
         logical_softcapping: float = None,
+        causal: bool = True,
         **kwargs,
     ) -> DlinferAttentionImpl:
         """build."""
@@ -132,4 +136,5 @@ def build(
             alibi_scale=alibi_scale,
             sliding_window=sliding_window,
             logical_softcapping=logical_softcapping,
+            causal=causal,
             **kwargs)

lmdeploy/pytorch/models/mllama.py

Lines changed: 22 additions & 21 deletions
@@ -195,6 +195,7 @@ def __init__(self,
             self.head_dim,
             num_kv_heads=self.num_key_value_heads,
             v_head_size=self.head_dim,
+            causal=False,
         )
 
         self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
@@ -1128,10 +1129,12 @@ def forward(
 
         # Collect intermediate layer outputs from encoder output
         all_intermediate_hidden_states = output[1]
+        all_intermediate_hidden_states = [
+            all_intermediate_hidden_states[i]
+            for i in self.intermediate_layers_indices
+        ]
         intermediate_hidden_states = torch.stack(
             all_intermediate_hidden_states, dim=-1)
-        intermediate_hidden_states = intermediate_hidden_states[
-            ..., self.intermediate_layers_indices]
 
         # Remove padding from intermediate hidden states
         intermediate_hidden_states = intermediate_hidden_states.reshape(
@@ -1196,8 +1199,8 @@ def __init__(self,
         # preprocessor
         self.input_processor = MLlamaInputProcessor(self.config, dtype)
 
-    def flat_encoder_result(self, cross_attention_states: torch.Tensor,
-                            attn_metadata: Any, input_ids: torch.LongTensor):
+    def flat_encoder_result(self, attn_metadata: Any,
+                            input_ids: torch.LongTensor):
         # since every state share the same shape
         full_text_row_masked_out_mask = torch.ones(
             (attn_metadata.q_seqlens.sum(), 1), dtype=torch.bool)
@@ -1208,9 +1211,9 @@ def flat_encoder_result(self, cross_attention_states: torch.Tensor,
             full_text_row_masked_out_mask[start_pos:img_id] = False
             start_pos += q_seq_len
         full_text_row_masked_out_mask = full_text_row_masked_out_mask.to(
-            cross_attention_states.device)
+            input_ids.device)
 
-        return cross_attention_states, full_text_row_masked_out_mask
+        return full_text_row_masked_out_mask
 
     def forward(
         self,
@@ -1227,6 +1230,19 @@ def forward(
     ):
         """model forward, return logits."""
 
+        if cross_attn_metadata is None:
+            full_text_row_masked_out_mask = None
+        # FIXME basically, we want to inference
+        # text requests and image requests separately
+        elif pixel_values is None and (cross_attn_metadata.kv_seqlens is None):
+            full_text_row_masked_out_mask = None
+        elif cross_attn_metadata.is_decoding:
+            full_text_row_masked_out_mask = input_ids.new_ones(
+                input_ids.size(-1), 1)
+        else:
+            full_text_row_masked_out_mask = self.flat_encoder_result(
+                cross_attn_metadata, input_ids)  # noqa
+
         cross_attention_states = None
         if pixel_values is not None:
             cross_attention_states = self.vision_model(
@@ -1240,21 +1256,6 @@ def forward(
             cross_attention_states = cross_attention_states.view(
                 bsz, -1, image_token_dim)
 
-        if cross_attn_metadata is None:
-            full_text_row_masked_out_mask = None
-        # FIXME basically, we want to inference
-        # text requests and image requests separately
-        elif cross_attention_states is None and (cross_attn_metadata.kv_seqlens
-                                                 is None):
-            full_text_row_masked_out_mask = None
-        elif cross_attn_metadata.is_decoding:
-            full_text_row_masked_out_mask = input_ids.new_ones(
-                input_ids.size(-1), 1)
-        else:
-            (cross_attention_states,
-             full_text_row_masked_out_mask) = self.flat_encoder_result(
-                 cross_attention_states, cross_attn_metadata,
-                 input_ids)  # noqa
         hidden_states = self.language_model(
             input_ids=input_ids,
             position_ids=position_ids,

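The mllama changes carry the actual optimization: the vision cross-attention layers are built with `causal=False`, the `full_text_row_masked_out_mask` is now derived from `cross_attn_metadata` and `input_ids` alone so it can be computed before (and independently of) the vision tower, and the intermediate encoder layers are selected before stacking rather than stacking every layer and indexing afterwards. A small sketch of that last point, with illustrative shapes and indices (the real values come from the vision encoder output and `self.intermediate_layers_indices`):

```python
import torch

hidden_states = [torch.randn(2, 16, 1280) for _ in range(32)]  # one tensor per encoder layer
indices = [3, 7, 15, 23, 30]

# Old path: stack all 32 layers, then index the stacked tensor.
before = torch.stack(hidden_states, dim=-1)[..., indices]

# New path: pick only the needed layers, then stack 5 tensors instead of 32.
after = torch.stack([hidden_states[i] for i in indices], dim=-1)

assert torch.equal(before, after)  # same result, less memory traffic
```
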
lmdeploy/pytorch/nn/attention.py

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@ def __init__(
         sliding_window: int = None,
         logit_softcapping: float = None,
         replicate_kv: bool = False,
+        causal: bool = True,
         **kwargs,
     ):
         super().__init__()
@@ -55,6 +56,7 @@ def __init__(
             alibi=alibi,
             sliding_window=sliding_window,
             logit_softcapping=logit_softcapping,
+            causal=causal,
             **kwargs,
         )
6062

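Taken together, the commit threads one keyword from the public `nn.Attention` wrapper down through the backend builders to the kernel-level implementations. A toy, self-contained sketch of that plumbing (the class names here are illustrative stand-ins, not lmdeploy's):

```python
class ToyAttentionImpl:
    """Stand-in for a backend attention implementation."""

    def __init__(self, num_heads: int, head_size: int, causal: bool = True, **kwargs):
        self.causal = causal  # kernels branch on this flag at forward time


class ToyAttentionBuilder:
    """Stand-in for a backend attention builder."""

    @staticmethod
    def build(num_heads: int, head_size: int, causal: bool = True, **kwargs):
        return ToyAttentionImpl(num_heads, head_size, causal=causal, **kwargs)


class ToyAttention:
    """Stand-in for the public nn.Attention module."""

    def __init__(self, num_heads: int, head_size: int, causal: bool = True, **kwargs):
        self.impl = ToyAttentionBuilder.build(num_heads, head_size, causal=causal, **kwargs)


# mllama's vision cross-attention would request a non-causal layer:
cross_attn = ToyAttention(num_heads=16, head_size=128, causal=False)
assert cross_attn.impl.causal is False
```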