Commits (101 total; changes shown from 93 commits)
e030c80
Init PA CM Impl (1st/2nd token and kvcache update)
riverlijunjie Aug 29, 2025
435a7ac
enable simple pa unit tests to pass
riverlijunjie Aug 31, 2025
8947906
Fix 2nd_token issue
riverlijunjie Aug 31, 2025
83dba29
Fixed pipeline output corruption issue
riverlijunjie Sep 2, 2025
2743aab
Fix 2nd non-16 alignment accuracy issue
riverlijunjie Sep 2, 2025
65b9cc7
Set best partition size for 2nd
riverlijunjie Sep 2, 2025
c4a1659
update KV_BLOCK_SIZE to 256
ceciliapeng2011 Sep 3, 2025
62a222f
initiate xattention integration
ceciliapeng2011 Sep 3, 2025
ac882ab
qwen2.5-1.5b 4k trunk works with xatten.
ceciliapeng2011 Sep 5, 2025
0621e4b
4k aligned works.
ceciliapeng2011 Sep 5, 2025
98a4ecd
fix block_mask not fully initialized issue.
ceciliapeng2011 Sep 5, 2025
5af3330
fix of find_block
ceciliapeng2011 Sep 8, 2025
4f9ed28
xatten: fix accuracy problem caused by debug
ceciliapeng2011 Sep 9, 2025
d35f4fb
use int32 to store float INV_S to align python version accuracy
luo-cheng2021 Sep 10, 2025
4e25a4a
OV_GPU_XATTN_BLOCK_SIZE and OV_GPU_XATTN_THRESH
ceciliapeng2011 Sep 10, 2025
c3c87b7
fix building error on windows.
usstq Sep 10, 2025
76685f0
process tail in find_block
ceciliapeng2011 Sep 12, 2025
c5bdcf9
Fix f16 accuracy issue and optimize 2nd token to improve by 5%
riverlijunjie Sep 9, 2025
95a2da1
fix warning_as_error on CI Windows.
ceciliapeng2011 Sep 15, 2025
36bee72
dump block mask with DUMP_XATTN_BLOCK_MASK for debug
ceciliapeng2011 Sep 15, 2025
4fa97be
Support kv cache u8 precision
riverlijunjie Sep 14, 2025
55ba7c3
refactor: split into pa_common and sdpa_common, which include attenti…
ceciliapeng2011 Sep 22, 2025
a06adef
integrate xattn_post_proc kernel and FP16 kernel works. TODO to verify…
ceciliapeng2011 Sep 22, 2025
4b391be
update partition size
riverlijunjie Sep 14, 2025
f2f2126
enable int8 kvcache for xatten, but accuracy fails.
ceciliapeng2011 Sep 23, 2025
89c8577
fix xattn kvcache u8 accuracy issue.
ceciliapeng2011 Sep 24, 2025
024b71a
Fix 2nd accuracy issue
riverlijunjie Sep 24, 2025
033304f
Fix 2nd accuracy issue
ceciliapeng2011 Sep 29, 2025
a6e72d0
fix xattn tailing issue: Q_blocks < K_blocks, as K_blocks is aligned …
ceciliapeng2011 Sep 30, 2025
f7ddc68
decide pa block size based on whether xattention is used
rnwang04 Sep 23, 2025
29cdabb
fix block size logic
rnwang04 Sep 25, 2025
5048081
fix partition size
rnwang04 Sep 26, 2025
0c8c029
fix condition of xattn stages
rnwang04 Oct 9, 2025
6fbf07b
Add xAttention reference operation and test
WeldonWangwang Oct 9, 2025
13b1122
Optimize single_token_finalization kernel with fixed unroll
riverlijunjie Oct 10, 2025
24d6b80
Fix win build
peterchen-intel Oct 10, 2025
326fc4d
Fix win build
peterchen-intel Oct 10, 2025
508fab3
Fix win build
peterchen-intel Oct 10, 2025
73669d3
Enable CM PA only when XAttention is enabled.
ceciliapeng2011 Oct 11, 2025
45bedf3
pass xattention threshold from genai
ceciliapeng2011 Oct 11, 2025
b7a9a8b
xattention_block_size unconfigurable
ceciliapeng2011 Oct 11, 2025
703dca6
Merge branch 'cecilia/pa_cm_xattention_bridge' into cecilia/pa_cm_xat…
ceciliapeng2011 Oct 11, 2025
f9f58be
invalidate sparse atten process if threshold is larger than 1.0.
ceciliapeng2011 Oct 11, 2025
f7fa94f
Merge branch 'master' into cecilia/pa_cm_xattention
ceciliapeng2011 Oct 11, 2025
3afbdb5
cpplint error fixes
ceciliapeng2011 Oct 11, 2025
2c37d0d
Define ENABLE_PA_CM_PATH for build
peterchen-intel Oct 12, 2025
cae516a
Fix warning-as-error issues on Windows with VS2022
zhaixuejun1993 Oct 12, 2025
010b6e7
Merge pull request #56 from zhaixuejun1993/xuejun/fix-warning-as-error
ceciliapeng2011 Oct 13, 2025
808a789
[WA] clean unused kvcache buffer
riverlijunjie Oct 10, 2025
22f0459
Fix format issues
zhaixuejun1993 Oct 13, 2025
780f55a
disable XAttention for legacy platforms (XAttention kernels are imple…
ceciliapeng2011 Oct 13, 2025
d21c4f6
reset left V cache block rather than 16 rows
riverlijunjie Oct 13, 2025
6b9b4c2
Remove debug code
riverlijunjie Oct 13, 2025
eb9765e
revert code change to ocl_v2
ceciliapeng2011 Oct 13, 2025
1418daa
cleanup debug code
ceciliapeng2011 Oct 13, 2025
21c3193
Limit head_num/kv_head_num to not exceed 8
riverlijunjie Oct 13, 2025
8a7a380
streamline block_size and head_size for both fp16 and u8/i8 kvcache
ceciliapeng2011 Oct 13, 2025
472f774
Remove CM PA tests
zhaixuejun1993 Oct 13, 2025
1fdcd3c
refactor: use paged_attention::block_size_xattn instead of hardcode n…
ceciliapeng2011 Oct 13, 2025
a62fd1b
worksgit status git status
WeldonWangwang Oct 13, 2025
52aad92
Merge pull request #57 from ceciliapeng2011/river/pa_nan_debug
ceciliapeng2011 Oct 14, 2025
3da8a34
Fix the KV cache padding with Nan issue for 1st token.
luweizhou2016 Oct 14, 2025
2dd7a81
Fix nan issue for 2nd token
riverlijunjie Oct 14, 2025
bbf17ed
Clean code
WeldonWangwang Oct 14, 2025
147063f
Clean code
WeldonWangwang Oct 14, 2025
c02fb34
Clean code
WeldonWangwang Oct 14, 2025
314bd71
Add CMXAttentionBlockSelector
WeldonWangwang Oct 14, 2025
fdbba78
Clean code
WeldonWangwang Oct 14, 2025
2ade1e1
Clean code
WeldonWangwang Oct 14, 2025
f402a14
refactor: check single sequence condition
ceciliapeng2011 Oct 15, 2025
342ae59
Avoid 2nd token perf drop due to cleanup unused K cache
riverlijunjie Oct 15, 2025
8e8b74c
fix: if kvcache config is dynamic, which may occur with a typo error…
ceciliapeng2011 Oct 15, 2025
4a82167
Clean code
WeldonWangwang Oct 15, 2025
6f7dd8d
Clean code
WeldonWangwang Oct 15, 2025
326ee44
Add more test cases
WeldonWangwang Oct 15, 2025
cfa1f3a
Clean code
WeldonWangwang Oct 15, 2025
f795152
Merge pull request #55 from WeldonWangwang/wangwang/add_xattention_tests
WeldonWangwang Oct 15, 2025
35267d3
Fix build errors and code style (#59)
WeldonWangwang Oct 16, 2025
2dfbb19
Fix test cases and skip testing on unsupported platforms (#60)
WeldonWangwang Oct 16, 2025
cca1528
bypass xattn when thresh>=1.0 and q_len<STRIDE.
ceciliapeng2011 Oct 16, 2025
618e575
throw exception if xattn is not supported by either GPU architecture or…
ceciliapeng2011 Oct 16, 2025
b2afd6e
Merge branch 'master' into cecilia/pa_cm_xattention
WeldonWangwang Oct 17, 2025
522a503
add OV_GPU_DUMP_SRC_TENSORS_AFTER_EXEC
ceciliapeng2011 Oct 17, 2025
3e527be
code cleanup, unused code
ceciliapeng2011 Oct 17, 2025
1e243fc
throw exception for unsupported cases.
ceciliapeng2011 Oct 17, 2025
b45062c
fix dump... intermediates tensor may be empty.
ceciliapeng2011 Oct 17, 2025
50628c5
fix
ceciliapeng2011 Oct 17, 2025
1073002
Ww/pa cm xattention 1019 (#61)
WeldonWangwang Oct 19, 2025
5eff824
Ww/pa cm xattention 1020 (#62)
WeldonWangwang Oct 19, 2025
d164bba
Merge branch 'master' into cecilia/pa_cm_xattention
WeldonWangwang Oct 19, 2025
853b562
PagedAttentionInternBuffIdx
ceciliapeng2011 Oct 17, 2025
0870cbb
refactor xattention kernel impls by reusing RT parameters, instead of…
ceciliapeng2011 Oct 17, 2025
c2bde5b
fix clang-format style issues
ceciliapeng2011 Oct 20, 2025
554ebf4
merge xattention tests into paged_attention tests (#63)
WeldonWangwang Oct 21, 2025
e794f5b
Fix build error (#64)
WeldonWangwang Oct 21, 2025
5ff7d32
Ww/cm xattention (#65)
WeldonWangwang Oct 21, 2025
26c4f2f
Remove debug messages (#66)
WeldonWangwang Oct 21, 2025
1ec3dfd
fix the place to check kvcache precision
ceciliapeng2011 Oct 22, 2025
a6e4bbb
useless code cleanup.
ceciliapeng2011 Oct 22, 2025
bdf2e89
fix lint error
ceciliapeng2011 Oct 22, 2025
8ba831a
fix throw check
ceciliapeng2011 Oct 23, 2025
@@ -109,7 +109,7 @@ ov::pass::ConvertPagedAttnInputs::ConvertPagedAttnInputs(const KVCacheConfig& co
value_cache->set_element_type(value_cache_precision);
bool status = false;
if (pa_op->get_rt_info().count("num_k_heads") && pa_op->get_rt_info().count("k_head_size") &&
pa_op->get_rt_info().count("num_v_heads") && pa_op->get_rt_info().count("num_v_heads")) {
pa_op->get_rt_info().count("num_v_heads") && pa_op->get_rt_info().count("v_head_size")) {
const auto key_cache_shape = init_cache_shape(pa_op->get_rt_info()["num_k_heads"].as<size_t>(),
pa_op->get_rt_info()["k_head_size"].as<size_t>(),
m_config.keyCacheBlockSize,
@@ -38,6 +38,7 @@ struct paged_attention : public primitive_base<paged_attention> {
};

static constexpr size_t block_size = 16;
+ static constexpr size_t block_size_xattn = 256;

paged_attention() : primitive_base("", {}) {}

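The commits above ("decide pa block size based on whether xattention is used", "update KV_BLOCK_SIZE to 256") indicate that the paged-attention KV block granularity depends on whether the XAttention sparse path is active. Below is a minimal sketch of that selection using only the two constants declared in this hunk; the helper name is hypothetical, not code from this PR.

```cpp
#include <cstddef>

// Constants mirrored from cldnn::paged_attention in the hunk above.
constexpr std::size_t block_size = 16;         // default dense-path KV block (16 tokens)
constexpr std::size_t block_size_xattn = 256;  // KV block size when XAttention is enabled

// Hypothetical helper: pick the KV-cache block granularity for the
// PagedAttention implementation depending on whether XAttention is used.
constexpr std::size_t select_pa_block_size(bool xattention_enabled) {
    return xattention_enabled ? block_size_xattn : block_size;
}

static_assert(select_pa_block_size(false) == 16, "dense path keeps 16-token blocks");
static_assert(select_pa_block_size(true) == 256, "XAttention path uses 256-token blocks");
```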
@@ -176,6 +176,7 @@ static constexpr Property<bool, ov::PropertyMutability::RW> could_use_flashattn_
static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization_group_size_max{"GPU_DYNAMIC_QUANTIZATION_GROUP_SIZE_MAX"};
static constexpr Property<bool, ov::PropertyMutability::RW> validate_output_buffer{"GPU_VALIDATE_OUTPUT_BUFFER"};
static constexpr Property<float, ov::PropertyMutability::RW> mem_pool_util_threshold{"GPU_MEM_POOL_UTIL_THRESHOLD"};
+ static constexpr Property<bool, ov::PropertyMutability::RW> dump_src_after_exec{"GPU_DUMP_SRC_TENSORS_AFTER_EXEC"};
} // namespace ov::intel_gpu

namespace cldnn {
@@ -81,6 +81,7 @@ OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_layer_names, std::vector<std::string>
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_memory_pool_path, "", "Save csv file with memory pool info to specified folder")
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_memory_pool, false, "Enable verbose output for memory pool")
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_iterations, std::set<int64_t>{}, "Space separated list of iterations where other dump options should be enabled")
+ OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_src_after_exec, false, "Enable source data dump after layer execution. Useful for capturing updated states in stateful models.")
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, host_time_profiling, 0, "Measure and print host time spent from the beginning of the infer until all host work is done and plugin is ready to block thread on the final clFinish() call")
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, disable_async_compilation, false, "Disable feature that allows to asynchronously prepare static-shaped implementations for the primitives with shape-agnostic kernels selected during compilation")
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, disable_runtime_buffer_fusing, false, "Disable runtime inplace optimizations for operations like concat and crop")
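Commit 522a503 above adds this option as OV_GPU_DUMP_SRC_TENSORS_AFTER_EXEC. A rough usage sketch follows, assuming intel_gpu debug options are picked up from OV_GPU_*-prefixed environment variables and that the dump folder is configured through a companion option (both assumptions, not verified against this PR).

```cpp
#include <cstdlib>
#include <openvino/openvino.hpp>

int main() {
    // Assumption: the GPU plugin reads debug options from the environment at load time,
    // so they must be set before the Core is created.
    setenv("OV_GPU_DUMP_SRC_TENSORS_AFTER_EXEC", "1", 1);      // re-dump inputs after each layer executes
    setenv("OV_GPU_DUMP_TENSORS_PATH", "/tmp/gpu_dumps/", 1);  // assumed companion option for the output folder

    ov::Core core;
    auto compiled = core.compile_model("model.xml", "GPU");
    auto request = compiled.create_infer_request();
    request.infer();  // updated source tensors (e.g. the PagedAttention KV cache) are written out after execution
    return 0;
}
```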
33 changes: 31 additions & 2 deletions src/plugins/intel_gpu/src/graph/debug_helper.cpp
@@ -250,7 +250,7 @@ void log_memory_to_file(memory::ptr mem, layout data_layout, stream& stream, std
dump<int8_t>(actual_mem, stream, file_stream, dump_raw);
else if (mem_dt == cldnn::data_types::u8)
dump<uint8_t>(actual_mem, stream, file_stream, dump_raw);
- else if (mem_dt == cldnn::data_types::u8)
+ else if (mem_dt == cldnn::data_types::boolean)
dump<uint8_t>(actual_mem, stream, file_stream, dump_raw);
else if (mem_dt == cldnn::data_types::i4 || mem_dt == cldnn::data_types::u4)
dump_i4u4(mem_dt, actual_mem, stream, file_stream, dump_raw);
@@ -536,7 +536,7 @@ NodeDebugHelper::~NodeDebugHelper() {
for (size_t i = 0; i < m_inst.get_intermediates_memories().size(); i++) {
std::string name = get_file_prefix() + "_intermediates_" + std::to_string(i);
auto output_mem = m_inst.get_intermediates_memories()[i];
- if (output_mem == nullptr) {
+ if (output_mem == nullptr || output_mem->size() == 0) {
GPU_DEBUG_COUT << " intermediates_mem is nullptr. Nothing to dump." << std::endl;
continue;
}
@@ -558,6 +558,35 @@
log_memory_to_file(output_mem, output_layout, m_stream, filename, dump_raw);
}
}

if (config.get_dump_src_after_exec()) {
for (size_t i = 0; i < m_inst.inputs_memory_count(); i++) {
std::string name = get_file_prefix() + "_updated_src_" + std::to_string(i);
auto output_mem = m_inst.input_memory_ptr(i);
if (output_mem == nullptr) {
GPU_DEBUG_COUT << " updated_input_mem is nullptr. Nothing to dump." << std::endl;
continue;
}

auto& output_layout = m_inst.get_input_layout(i);
if (config.get_dump_tensors_format() == ov::intel_gpu::DumpFormat::binary) {
// Binary dump : raw
auto filename = get_file_path_for_binary_dump(output_layout, name, config.get_dump_tensors_path());

mem_lock<char, mem_lock_type::read> lock(output_mem, m_stream);
ov::util::save_binary(filename, lock.data(), output_mem->size());
GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl;
debug_str_for_bin_load += (filename + ",");
} else {
const bool dump_raw = config.get_dump_tensors_format() == ov::intel_gpu::DumpFormat::text_raw;
GPU_DEBUG_COUT << " Dump " << (dump_raw ? "raw " : "") << name << std::endl;
auto filename = config.get_dump_tensors_path() + get_name_for_dump(name) + ".txt";
// Text dump
log_memory_to_file(output_mem, output_layout, m_stream, filename, dump_raw);
}
}
}

if (config.get_dump_tensors_format() == ov::intel_gpu::DumpFormat::binary && m_inst.is_input()) {
debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"';
GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;;