
Commit d9a7806

integrate xattn_post_proc kernel; the FP16 kernel works. TODO: verify u8 kvcache.
1 parent d8e66d7 commit d9a7806

4 files changed: +150 −1 lines


src/plugins/intel_gpu/src/graph/impls/cm/paged_attention.cpp

Lines changed: 8 additions & 0 deletions
@@ -37,6 +37,7 @@ class PagedAttentionCmImpl : public PrimitiveImplCM {
     Stage::Ptr pa_multi_token = make_stage<PagedAttentionGeneratorMultiToken>();
     Stage::Ptr xattn_estimate_gemmqk = make_stage<XAttentionEstimateGEMMQK>();
     Stage::Ptr xattn_estimate_find_block = make_stage<XAttentionEstimateFindBlock>();
+    Stage::Ptr xattn_estimate_post_proc = make_stage<XAttentionEstimatePostProc>();
 
     PagedAttentionCmImpl(): PrimitiveImplCM(PagedAttentionImplementationManager::get_type_info_static()) {
         m_rt_params = std::make_unique<PagedAttentionRuntimeParams>();
@@ -53,6 +54,7 @@ class PagedAttentionCmImpl : public PrimitiveImplCM {
         if (xattn_block_size > 1) {
             add_stage(xattn_estimate_gemmqk, params);
             add_stage(xattn_estimate_find_block, params);
+            add_stage(xattn_estimate_post_proc, params);
         }
     }
 
@@ -124,6 +126,7 @@ class PagedAttentionCmImpl : public PrimitiveImplCM {
                 pa_id++;
             }
 #endif
+            res_event = {execute_stage(res_event, instance, xattn_estimate_post_proc)};
         }
         res_event = {execute_stage(res_event, instance, pa_multi_token)};
     } else if (rt_params->stage == PagedAttentionStage::GENERATE) {
@@ -202,6 +205,11 @@ class PagedAttentionCmImpl : public PrimitiveImplCM {
 
         auto count_elements_mask = static_cast<int64_t>(desc->heads_num * q_block_pad * k_block_pad);
         internal_buffers.emplace_back(count_elements_mask, ov::element::boolean);  // 4: sparse_block_mask
+
+        const uint32_t MERGED_Q_NUM = 2;  // TODO
+        const uint32_t q_block_pad_merged = ceil_div(q_block_pad, MERGED_Q_NUM);
+        auto count_elements_mask_merged = static_cast<int64_t>(desc->heads_num * q_block_pad_merged * k_block_pad);
+        internal_buffers.emplace_back(count_elements_mask_merged, ov::element::boolean);  // 5: sparse_block_mask_wg
     }
 }
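
Note: the new buffer 5 is buffer 4 with the q dimension folded by MERGED_Q_NUM. A minimal standalone sketch of the sizing arithmetic (plain C++; the heads_num / q_block_pad / k_block_pad values are illustrative, and ceil_div is restated locally since its definition is outside this diff):

#include <cstdint>
#include <iostream>

// Local stand-in for the ceil_div helper referenced in the diff.
static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

int main() {
    const uint32_t heads_num = 32, q_block_pad = 33, k_block_pad = 64;  // illustrative values
    const uint32_t MERGED_Q_NUM = 2;

    const int64_t count_elements_mask = int64_t(heads_num) * q_block_pad * k_block_pad;                // buffer 4
    const uint32_t q_block_pad_merged = ceil_div(q_block_pad, MERGED_Q_NUM);                           // 17
    const int64_t count_elements_mask_merged = int64_t(heads_num) * q_block_pad_merged * k_block_pad;  // buffer 5

    std::cout << count_elements_mask << " vs " << count_elements_mask_merged << "\n";  // 67584 vs 34816
    return 0;
}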

src/plugins/intel_gpu/src/graph/impls/cm/paged_attention_gen.cpp

Lines changed: 73 additions & 1 deletion
@@ -429,8 +429,10 @@ Arguments PagedAttentionGeneratorMultiToken::get_arguments_desc(const kernel_imp
     args.push_back({ArgumentDescriptor::Types::INPUT, PagedAttentionInputIdx::SUBSEQUENCE_BEGINS});  // subsequence_begins
 
     const size_t block_size = get_xattn_block_size(params);
-    if (block_size > 1)
+    if (block_size > 1) {
         args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4});  // sparse_block_mask
+        args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 5});  // sparse_block_mask_wg
+    }
 
     args.push_back({ArgumentDescriptor::Types::OUTPUT, 0});

@@ -944,4 +946,74 @@ DispatchDataFunc XAttentionEstimateFindBlock::get_dispatch_data_func() const {
     }};
 }
 
+//-----------------------------------------------------------------------------------------------------------------
+// XAttention Estimate post_proc generator
+//-----------------------------------------------------------------------------------------------------------------
+JitConstants XAttentionEstimatePostProc::get_jit_constants(const kernel_impl_params& params) const {
+    auto jit = XAttentionEstimateGeneratorBase::get_jit_constants(params);
+
+    jit.make("MERGED_Q_NUM", 2);  // TODO
+
+    return jit;
+}
+
+Arguments XAttentionEstimatePostProc::get_arguments_desc(const kernel_impl_params& params) const {
+    Arguments args;
+
+    // inputs
+    args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4});  // block_mask
+
+    // outputs
+    args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 5});  // block_mask_merged
+
+    // scalars
+    args.push_back({ArgumentDescriptor::Types::SCALAR, 0});  // q_stride_pad
+    args.push_back({ArgumentDescriptor::Types::SCALAR, 1});  // q_block_pad
+    args.push_back({ArgumentDescriptor::Types::SCALAR, 2});  // k_block_pad
+
+    return args;
+}
+
+DispatchDataFunc XAttentionEstimatePostProc::get_dispatch_data_func() const {
+    return DispatchDataFunc{[&](const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params) {
+        assert(!params.is_dynamic());
+        auto& wgs = kd.params.workGroups;
+
+        const auto desc = params.typed_desc<paged_attention>();
+
+        assert(rt_params != nullptr);
+
+        const size_t block_size = get_xattn_block_size(params);
+        const size_t heads_num = desc->heads_num;
+
+        auto out_shape = params.output_layouts[0].get_shape();
+        const size_t kv_len = get_max_context_len(params) / STRIDE * STRIDE;
+        const size_t q_len = out_shape[0];
+        const uint32_t M = static_cast<uint32_t>(q_len / STRIDE);  // silently drops the tail that is shorter than `stride`
+        const uint32_t N = static_cast<uint32_t>(kv_len / STRIDE);
+        const size_t q_stride_pad = round_up_to(M, BLOCK_WG_M);
+        const size_t N_kq_groups = ceil_div(N, BLOCK_WG_N);
+
+        const uint32_t sum_per_token_in_block = static_cast<uint32_t>(block_size / STRIDE);
+        const uint32_t k_block_in_group = static_cast<uint32_t>(BLOCK_WG_N / sum_per_token_in_block);
+        const uint32_t k_block_pad = k_block_in_group * N_kq_groups;
+        const uint32_t q_block_pad = ceil_div(q_len, block_size);
+
+        const uint32_t MERGED_Q_NUM = 2;  // TODO
+        const uint32_t q_block_pad_merged = ceil_div(q_block_pad, MERGED_Q_NUM);
+
+        wgs.global = {q_block_pad_merged, heads_num, 1};
+        wgs.local = {1, 1, 1};
+
+        auto& scalars = kd.params.scalars;
+        std::vector<size_t> scaler_value = {q_stride_pad, q_block_pad, k_block_pad};
+        scalars.resize(scaler_value.size());
+
+        for (size_t i = 0; i < scaler_value.size(); ++i) {
+            scalars[i].t = ScalarDescriptor::Types::UINT32;
+            scalars[i].v.u32 = static_cast<uint32_t>(scaler_value[i]);
+        }
+    }};
+}
+
 } // namespace ov::intel_gpu::cm
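
To make the padding arithmetic in get_dispatch_data_func concrete, here is a small host-side sketch that reproduces it for the same query length used in the new kernel's NOTE (256*16+1 tokens). STRIDE, BLOCK_WG_M, and BLOCK_WG_N are assumed illustrative values; the real constants come from estimate.hpp and are not part of this diff.

#include <cstdint>
#include <iostream>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }
static uint32_t round_up_to(uint32_t a, uint32_t b) { return ceil_div(a, b) * b; }

int main() {
    // Assumed illustrative constants; the real STRIDE / BLOCK_WG_M / BLOCK_WG_N live in estimate.hpp.
    const uint32_t STRIDE = 16, BLOCK_WG_M = 64, BLOCK_WG_N = 128;
    const uint32_t block_size = 128;          // xattn block size
    const uint32_t q_len = 256 * 16 + 1;      // same example as the kernel NOTE
    const uint32_t max_context_len = 8192;

    const uint32_t kv_len = max_context_len / STRIDE * STRIDE;
    const uint32_t M = q_len / STRIDE;        // tail shorter than STRIDE is dropped
    const uint32_t N = kv_len / STRIDE;
    const uint32_t q_stride_pad = round_up_to(M, BLOCK_WG_M);                // 256
    const uint32_t N_kq_groups = ceil_div(N, BLOCK_WG_N);                    // 4

    const uint32_t sum_per_token_in_block = block_size / STRIDE;             // 8
    const uint32_t k_block_in_group = BLOCK_WG_N / sum_per_token_in_block;   // 16
    const uint32_t k_block_pad = k_block_in_group * N_kq_groups;             // 64
    const uint32_t q_block_pad = ceil_div(q_len, block_size);                // 33

    const uint32_t MERGED_Q_NUM = 2;
    const uint32_t q_block_pad_merged = ceil_div(q_block_pad, MERGED_Q_NUM); // 17

    // Dispatch: one work-group per merged q-block row per head -> global = {q_block_pad_merged, heads_num, 1}.
    std::cout << q_stride_pad << " " << q_block_pad << " " << k_block_pad << " " << q_block_pad_merged << "\n";
    return 0;
}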

src/plugins/intel_gpu/src/graph/impls/cm/paged_attention_gen.hpp

Lines changed: 8 additions & 0 deletions
@@ -134,4 +134,12 @@ class XAttentionEstimateFindBlock : public XAttentionEstimateGeneratorBase {
     [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override;
 };
 
+class XAttentionEstimatePostProc : public XAttentionEstimateGeneratorBase {
+public:
+    XAttentionEstimatePostProc() : XAttentionEstimateGeneratorBase("xattn_post_proc") {}
+    [[nodiscard]] JitConstants get_jit_constants(const kernel_impl_params& params) const override;
+    [[nodiscard]] Arguments get_arguments_desc(const kernel_impl_params& params) const override;
+    [[nodiscard]] DispatchDataFunc get_dispatch_data_func() const override;
+};
+
 } // namespace ov::intel_gpu::cm
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+/*******************************************************************************
+ * Copyright (c) 2022-2025 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+
+namespace KERNEL_NAME {
+#include "estimate.hpp"
+
+// NOTE: q_stride_pad / TOKEN_IN_BLOCK <= q_block_pad; example of the case q_stride_pad / TOKEN_IN_BLOCK < q_block_pad:
+//   query = 256*16+1, then
+//   q_stride_pad = 256
+//   q_stride_pad / TOKEN_IN_BLOCK = 32
+//   q_block_pad = div_up(256*16+1, 128) = 33
+// _GENX_MAIN_ void post_proc_mask(
+extern "C" _GENX_MAIN_ void KERNEL_NAME(svmptr_t block_mask ATTR, svmptr_t merged_block_mask ATTR, uint q_stride_pad, uint q_block_pad, uint k_block_pad) {
+    // block_mask:        [b, hq, q_block_pad, k_block_pad]
+    // merged_block_mask: [b, hq, q_block_pad/MERGED_Q_NUM, k_block_pad]
+    // global:            [q_block_pad/MERGED_Q_NUM, hq, b]
+    const int TOKEN_IN_BLOCK = BLOCK_SIZE / STRIDE;
+    const int TOKEN_SHARE_MAX = BLOCK_SHARE_MAX / TOKEN_IN_BLOCK;
+    uint m_merged = cm_group_id(0);
+    uint hq = cm_group_id(1);
+    uint b = cm_group_id(2);
+    block_mask += (b * HQ + hq) * q_block_pad * k_block_pad;
+    merged_block_mask += (b * HQ + hq) * cm_group_count(0) * k_block_pad;
+    merged_block_mask += m_merged * k_block_pad;
+    block_mask += m_merged * MERGED_Q_NUM * k_block_pad;
+    vector<uchar, 32> one = 1;
+    // q blocks not covered by the mask, i.e. a tail of q = 1~15 tokens, shorter than param `stride`
+    //for (int i = 0; i < MERGED_Q_NUM; i++) {
+    //    auto q_stride_cur = m_merged * MERGED_Q_NUM + i;
+    //    if (q_stride_cur >= q_stride_pad / TOKEN_IN_BLOCK && q_stride_cur < q_block_pad) {
+    //        for (int j = 0; j < k_block_pad; j += 32) {
+    //            cm_ptr_store<int, 32 / 4>((int*)block_mask, j + i * k_block_pad, one.format<int>());
+    //        }
+    //    }
+    //}
+    for (int j = 0; j < k_block_pad; j += 32) {
+        vector<uchar, 32> new_mask = cm_ptr_load<int, 8>((int*)block_mask, j).format<uchar>();
+        for (int i = 1; i < MERGED_Q_NUM; i++) {
+            if (m_merged * MERGED_Q_NUM + i < q_stride_pad / TOKEN_IN_BLOCK) {
+                vector<uchar, 32> cur_mask = cm_ptr_load<int, 8>((int*)block_mask, j + i * k_block_pad).format<uchar>();
+                new_mask &= cur_mask;
+            }
+        }
+        cm_ptr_store<int, 32 / 4>((int*)merged_block_mask, j, new_mask.format<int>());
+    }
+}
+
+} // namespace KERNEL_NAME
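
Restating what the kernel computes as a plain C++ host-side reference for one (b, hq) slice (a sketch of the semantics as read from the code above, not the CM source): MERGED_Q_NUM consecutive q-block rows of block_mask are AND-ed into one row of merged_block_mask, and rows at or beyond q_stride_pad / TOKEN_IN_BLOCK are skipped, matching the kernel's guard.

#include <cstdint>
#include <vector>

// Host-side reference of the row merge for one (b, hq) slice of the boolean block mask.
// block_mask:        q_block_pad x k_block_pad bytes
// merged_block_mask: ceil(q_block_pad / MERGED_Q_NUM) x k_block_pad bytes
void merge_mask_reference(const std::vector<uint8_t>& block_mask,
                          std::vector<uint8_t>& merged_block_mask,
                          uint32_t q_block_pad, uint32_t k_block_pad,
                          uint32_t q_stride_blocks,   // q_stride_pad / TOKEN_IN_BLOCK
                          uint32_t MERGED_Q_NUM = 2) {
    const uint32_t merged_rows = (q_block_pad + MERGED_Q_NUM - 1) / MERGED_Q_NUM;
    merged_block_mask.assign(size_t(merged_rows) * k_block_pad, 0);
    for (uint32_t m = 0; m < merged_rows; ++m) {
        for (uint32_t j = 0; j < k_block_pad; ++j) {
            // Start from the first row of the group, then AND in the remaining rows
            // that still fall inside the valid q_stride range.
            uint8_t v = block_mask[size_t(m) * MERGED_Q_NUM * k_block_pad + j];
            for (uint32_t i = 1; i < MERGED_Q_NUM; ++i) {
                if (m * MERGED_Q_NUM + i < q_stride_blocks) {
                    v &= block_mask[size_t(m * MERGED_Q_NUM + i) * k_block_pad + j];
                }
            }
            merged_block_mask[size_t(m) * k_block_pad + j] = v;
        }
    }
}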
