
Commit 34dd841

fix sliding window
1 parent 74db002

2 files changed: +3 / -3 lines

2 files changed

+3
-3
lines changed

lmdeploy/pytorch/kernels/cuda/flashattention.py

Lines changed: 2 additions & 2 deletions

@@ -199,10 +199,10 @@ def _flash_prefill_fwd_kernel(
     loop_start = 0
     kv_min_loc = tl.zeros([BLOCK_M], dtype=tl.int32)
     if window_size > 0:
-        start_block_id = tl.maximum(history_len - window_size, 0) // BLOCK_N
+        start_block_id = tl.maximum(
+            history_len + start_m * BLOCK_M - window_size, 0) // BLOCK_N
         kv_min_loc = tl.maximum(history_len + offs_m - window_size, 0)
         loop_start = start_block_id * BLOCK_N
-        kv_start_loc += loop_start

     offs_dk = tl.arange(0, BLOCK_DK)
     mask_dk = offs_dk < head_dim_k
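
The old code derived the first KV block to visit from history_len alone, so every query block (start_m) began its KV loop at the same offset, even though the sliding window of a later query block starts further right. The fix computes the loop start per query block; dropping kv_start_loc += loop_start stops shifting the KV base pointer as well, which pairs with the new per-block loop start (the surrounding loop is not shown in this hunk). Below is a standalone sketch of the corrected bookkeeping; the block sizes and lengths are illustrative assumptions, not the kernel's tuned values.

# Illustrative sketch only: mirrors the loop-start math fixed by this
# commit; all constants here are hypothetical, not the kernel's.
BLOCK_M = 64        # query rows handled per program instance (assumed)
BLOCK_N = 64        # KV columns handled per inner-loop step (assumed)
window_size = 32    # sliding-window width
history_len = 90    # cached tokens preceding the new queries

def loop_start_old(start_m):
    # Before the fix: ignores which query block we are in, so every
    # block starts scanning KV at the same offset.
    start_block_id = max(history_len - window_size, 0) // BLOCK_N
    return start_block_id * BLOCK_N

def loop_start_new(start_m):
    # After the fix: the first query row of block `start_m` sits at
    # absolute position history_len + start_m * BLOCK_M, so its window
    # begins window_size tokens before that position.
    start_block_id = max(history_len + start_m * BLOCK_M - window_size,
                         0) // BLOCK_N
    return start_block_id * BLOCK_N

for start_m in range(3):
    print(start_m, loop_start_old(start_m), loop_start_new(start_m))
# -> 0 0 0 / 1 0 64 / 2 0 128: later query blocks now begin at the KV
#    block containing the left edge of their own window.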

tests/pytorch/kernel/test_flash_attention.py

Lines changed: 1 addition & 1 deletion

@@ -224,7 +224,7 @@ def window_gt(self, conti_q, conti_kv, q_seqlens, kv_seqlens, win_size):
     @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)],
                              indirect=True)
     @pytest.mark.parametrize(['q_seqlens', 'history_lens'], [
-        ([30, 50, 70, 90], [50, 40, 30, 20]),
+        ([30, 50, 70, 90], [50, 40, 30, 90]),
     ],
                              indirect=True)
     @pytest.mark.parametrize('win_size', (32, ), indirect=True)
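
Under the block sizes assumed in the sketch above, the previous parameters (history_lens ending in 20 with win_size=32) never produced a sequence whose query blocks start their KV loops in different KV blocks, so the kernel bug went unexercised. Raising the last history length to 90, paired with the 90-token query sequence, does. A quick back-of-the-envelope check, again with hypothetical block sizes:

# Hypothetical check (block sizes are assumptions, not the kernel's):
# with history 90, window 32 and 90 new queries, consecutive query
# blocks see their earliest visible key in different KV blocks.
BLOCK_M, BLOCK_N, win = 64, 64, 32
q_len, history = 90, 90

for start_m in range((q_len + BLOCK_M - 1) // BLOCK_M):
    first_q = history + start_m * BLOCK_M     # absolute pos of row 0
    first_visible = max(first_q - win, 0)     # left edge of its window
    print(start_m, first_visible // BLOCK_N)  # KV block the loop starts at
# -> block 0 for start_m=0 but block 1 for start_m=1, so the per-block
#    loop start introduced by the kernel fix is actually exercised.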
