Commit f9f9f74

Merge pull request #315 from kvcache-ai/Atream-add-adapted
Atream add adapted
2 parents: 1548c99 + 9239928

File tree

2 files changed (+9, -3 lines)

ktransformers/operators/attention.py

Lines changed: 2 additions & 2 deletions
@@ -262,7 +262,7 @@ def forward_linux(
         """

         # flash attn doesn't support head_dim bigger than 256
-        # use vLLM triton attention kernel for MQA
+        # use triton attention kernel adapted from vLLM and SGLang for MQA
         decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
                                      page_table,
                                      position_ids.squeeze(0).to(torch.int32), attn_logits,
@@ -551,4 +551,4 @@ def forward(
         if not output_attentions:
             attn_weights = None

-        return attn_output, attn_weights, past_key_value
+        return attn_output, attn_weights, past_key_value
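
For context, the comment being reworded above marks why this code path exists: FlashAttention caps head_dim at 256, while DeepSeek-style MLA decoding attends over the concatenation of the compressed KV and the rotary part, which is wider than that. The snippet below is only an illustration of that size check; the kv_lora_rank and qk_rope_head_dim values are assumed DeepSeek-V2/V3 defaults, not taken from this diff.

# Illustrative only: why the MLA decode path cannot use flash attention.
# kv_lora_rank and qk_rope_head_dim are assumed DeepSeek-V2/V3 values.
FLASH_ATTN_MAX_HEAD_DIM = 256   # flash attn doesn't support head_dim bigger than 256

kv_lora_rank = 512              # assumed width of the compressed KV (compressed_kv)
qk_rope_head_dim = 64           # assumed width of the rotary part (k_pe)
head_dim = kv_lora_rank + qk_rope_head_dim   # 576 for compressed_kv_with_k_pe

if head_dim > FLASH_ATTN_MAX_HEAD_DIM:
    # fall back to the Triton MQA kernel adapted from vLLM and SGLang
    use_triton_decode_kernel = True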

ktransformers/operators/triton_attention.py

Lines changed: 7 additions & 1 deletion
@@ -1,3 +1,9 @@
+# Adapted from
+# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
+# which was originally adapted from
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py
+
 import triton
 import triton.language as tl

@@ -376,4 +382,4 @@ def decode_attention_fwd_grouped(
     )

     _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
-                                num_kv_splits)
+                                num_kv_splits)
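
The decode_attention_fwd_grouped entry point credited above follows the two-stage "flash decoding" scheme of the lightllm/SGLang kernels it is adapted from: stage 1 computes partial attention over num_kv_splits chunks of the KV cache, and stage 2 (_decode_softmax_reducev_fwd) merges the partial results with a log-sum-exp reduction. The plain-PyTorch sketch below illustrates only that reduction scheme for a single decode token; the tensor names, shapes, and the absence of paging and grouped heads are simplifications, not the kernel's actual interface.

import torch

def decode_attention_reference(q, k, v, num_kv_splits=4):
    # q: [num_heads, head_dim]; k, v: [kv_len, num_heads, head_dim]
    num_heads, head_dim = q.shape
    scale = head_dim ** -0.5

    partial_out, partial_lse = [], []
    # Stage 1: attend within each KV split independently (what the grouped
    # Triton kernel parallelizes across num_kv_splits).
    for k_chunk, v_chunk in zip(k.chunk(num_kv_splits, dim=0),
                                v.chunk(num_kv_splits, dim=0)):
        scores = torch.einsum("hd,shd->hs", q, k_chunk) * scale   # [heads, chunk]
        partial_lse.append(torch.logsumexp(scores, dim=-1))       # [heads]
        probs = torch.softmax(scores, dim=-1)
        partial_out.append(torch.einsum("hs,shd->hd", probs, v_chunk))

    # Stage 2: combine the splits with a numerically stable log-sum-exp
    # reduction, analogous to what _decode_softmax_reducev_fwd does.
    lse = torch.stack(partial_lse)                 # [splits, heads]
    out = torch.stack(partial_out)                 # [splits, heads, head_dim]
    split_weight = torch.softmax(lse, dim=0)       # each split's share of the total softmax mass
    return (split_weight.unsqueeze(-1) * out).sum(dim=0)

# Sanity check against full-sequence softmax attention:
q = torch.randn(16, 64)
k = torch.randn(128, 16, 64)
v = torch.randn(128, 16, 64)
full = torch.einsum("hs,shd->hd",
                    torch.softmax(torch.einsum("hd,shd->hs", q, k) * 64 ** -0.5, dim=-1), v)
assert torch.allclose(decode_attention_reference(q, k, v), full, atol=1e-5)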
