From f0f8ea582e142d6a7a5cc02836134c72d486d44a Mon Sep 17 00:00:00 2001 From: wenzengc Date: Sun, 3 Aug 2025 12:05:57 +0800 Subject: [PATCH 01/13] support generating more than 1 token per inference Signed-off-by: wenzengc --- .../src/al/include/intel_npu/config/npuw.hpp | 1 + .../intel_npu/npuw_private_properties.hpp | 8 ++++ .../intel_npu/src/al/src/config/npuw.cpp | 1 + .../src/plugin/npuw/llm_compiled_model.cpp | 9 +++- .../src/plugin/npuw/llm_compiled_model.hpp | 1 + .../src/plugin/npuw/llm_infer_request.cpp | 45 ++++++++++++++----- .../src/plugin/npuw/llm_infer_request.hpp | 1 + 7 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 61455f23d46c0a..e76678e89793c8 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -107,6 +107,7 @@ DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, RunTime); DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, RunTime); DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, RunTime); DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, RunTime); +DEFINE_OPT(NPUW_LLM_MAX_GENERATION_TOKEN_LEN, uint32_t, 1, npuw::llm::max_generation_token_len, RunTime); DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, RunTime); DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, true, npuw::llm::optimize_v_tensors, RunTime); DEFINE_OPT(NPUW_LLM_PREFILL_CHUNK_SIZE, uint64_t, 256, npuw::llm::prefill_chunk_size, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index d68259b6949ec6..1f9a526aba00be 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -423,6 +423,14 @@ static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"}; */ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"}; +/** + * @brief + * Type: uint32_t. + * Desirable max number of input tokens the generate model accepts per inference. + * Default value: 1. + */ +static constexpr ov::Property<uint32_t> max_generation_token_len{"NPUW_LLM_MAX_GENERATION_TOKEN_LEN"}; + /** * @brief * Type: uint32_t. 
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 11e266fca25d8d..9cb3376242157f 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -65,6 +65,7 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add<NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index dd9f791f46bd48..2f64c678e14ad9 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -955,6 +955,9 @@ ov::AnyMap get_default_generate_config(const std::optional& npudesc, if (hint == ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE) { config.emplace("NPUW_UNFOLD_IREQS", "YES"); } + // We don't need slice out for the kv cache model, especially for speculative decoding, which needs + // to generate more than 1 token per inference + config.erase("NPUW_SLICE_OUT"); return config; } @@ -1124,6 +1127,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m KVAxesPosition axes{batch_dim, seq_len_dim}; const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); + const uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); // NB: PREFILL_HINT is now compatible with the PREFILL_CONFIG section, unlike for // the generate model they're not mutually exclusive @@ -1154,7 +1158,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m } } - m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim}; + m_kvcache_desc = + KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, max_generation_token_len}; LOG_DEBUG("Make prefill model with static shapes"); m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>(); if (m_use_chunk_prefill) { @@ -1171,7 +1176,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m m_max_lora_rank); } LOG_DEBUG("Make kvcache model with static shapes"); - reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes, m_max_lora_rank); + reshape_to_static(kvcache_model, m_kvcache_desc.max_generation_token_len, m_kvcache_desc.total_size, axes, m_max_lora_rank); if (lm_head_model) { LOG_DEBUG("Shared LM head: slice the prefill output"); // KVCache model is already reshaped to [1, 1, embed size], so only apply slice to diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp index fbeedcbe809969..36445f2858c246 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp @@ -24,6 +24,7 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel { uint32_t total_size = 0u; uint32_t num_stored_tokens = 0u; uint32_t dim = 0u; + uint32_t max_generation_token_len = 0u; bool v_tensors_transposed = false; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index b629a175290617..b7131581dc4db2 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ 
b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -571,6 +571,14 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( LOG_DEBUG("Done."); } +void ov::npuw::LLMInferRequest::trim_kvcache_for_speculative_decoding(ov::SoPtr position_ids) { + auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + auto position_id = position_ids->data()[0]; + auto dirty_num = kvcache_desc.num_stored_tokens - static_cast(position_id); + LOG_DEBUG("Update kv cache length from " << kvcache_desc.num_stored_tokens << " to " << position_id); + kvcache_desc.num_stored_tokens -= dirty_num; +} + void ov::npuw::LLMInferRequest::clear_chunk_prefill_kv_cache() { const auto& prefill_compiled = m_prefill_request->get_compiled_model(); @@ -739,6 +747,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, ov::SoPtr position_ids) { LOG_DEBUG("Calling inference for generate model..."); LOG_BLOCK(); + auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + auto in_token_len = input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM]; + if (in_token_len != kvcache_desc.max_generation_token_len) { + OPENVINO_THROW("Input lenth for KV cache model mismatch with \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", + kvcache_desc.max_generation_token_len, + ".\nPlease adjust it "); + } if (!m_generate_initialized) { LOG_DEBUG("Copy kv-cache from prefill to generate model."); @@ -749,14 +764,15 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, fill_tensor(kv_attn_mask, 0); // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. // We can safely set this "1" once and then copy on one "1" less in the infer_generate(). - kv_attn_mask->data()[m_npuw_llm_compiled_model->m_kvcache_desc.total_size - 1] = 1; + for (std::size_t i = 0; i < kvcache_desc.max_generation_token_len; i++) { + kv_attn_mask->data()[m_npuw_llm_compiled_model->m_kvcache_desc.total_size - i - 1] = 1; + } m_generate_initialized = true; } - auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; // NB: KV-cache is full, further generation is impossible - if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) { + if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { OPENVINO_THROW("KV-Cache is full."); } @@ -770,27 +786,35 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. // As it is already set above, here we copy on one "1" unit less. 
auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); - std::copy_n(attention_mask->data(), attention_mask->get_size() - 1, kv_attn_mask->data()); + std::copy_n(attention_mask->data(), + attention_mask->get_size() - kvcache_desc.max_generation_token_len, + kv_attn_mask->data()); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); std::copy_n(position_ids->data(), position_ids->get_size(), kv_pos_ids->data()); m_kvcache_request->infer(); - kvcache_desc.num_stored_tokens += 1; + kvcache_desc.num_stored_tokens += kvcache_desc.max_generation_token_len; if (m_lm_head_request) { LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); - if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { - update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, 1); + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + update_kvcache_for(m_kvcache_request, + m_kvcache_in_ports, + m_kvcache_out_ports, + kvcache_desc.max_generation_token_len); } m_lm_head_request->wait(); LOG_DEBUG("Calling inference for LM head model -- done."); m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { - if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { - update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, 1); + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + update_kvcache_for(m_kvcache_request, + m_kvcache_in_ports, + m_kvcache_out_ports, + kvcache_desc.max_generation_token_len); } m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits)); @@ -815,9 +839,10 @@ void ov::npuw::LLMInferRequest::infer() { // NB: Check the sequence length provided for input_ids // in order to distinguish prefill / generate stages - if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] != 1) { + if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == 0) { infer_prefill(input_ids, attention_mask, position_ids); } else { + trim_kvcache_for_speculative_decoding(position_ids); infer_generate(input_ids, attention_mask, position_ids); } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp index 7197ef4f19fe54..86ccfef6e41700 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp @@ -52,6 +52,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest { std::unordered_map> in_ports, std::unordered_map> out_ports, uint32_t tokens); + void trim_kvcache_for_speculative_decoding(ov::SoPtr position_ids); void infer_chunked_prefill(ov::SoPtr input_ids, ov::SoPtr attention_mask, From 25fb0562a281c8fcf3ee60e10fc9acbe29ff9e20 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 7 Aug 2025 18:25:28 +0100 Subject: [PATCH 02/13] Fixes for 3-model pipeline --- .../src/plugin/npuw/llm_compiled_model.cpp | 26 ++++++++++--------- .../src/plugin/npuw/llm_infer_request.cpp | 17 +++++++++++- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 2f64c678e14ad9..39418a3566affc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ 
b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -781,10 +781,11 @@ void reshape_to_static(std::shared_ptr model, model->reshape(new_shapes); } -void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim) { - // We have only one input with dynamic shapes: output of Slice operation, and this output - // should have "1" for dimension representing number of embeddings to send to the matmul. - // Batch size should be also equal "1" for NPU. +void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim, + std::size_t max_generation_token_len) { + // We have only one input with dynamic shapes: output embeds. + // Output embeds should have "max_generation_token_len" for dimension representing number of embeddings + // to send to the matmul. Batch size should be equal "1" for NPU. const auto& input = lm_head_model->input(0); const auto& partial_shape = input.get_partial_shape(); NPUW_ASSERT(partial_shape.size() == 3); @@ -794,7 +795,7 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, con // Left dynamic axis will be for number of embeddings for (auto i = 0; i < new_shape.rank().get_length(); i++) { if (new_shape[i].is_dynamic()) { - new_shape[i] = 1; + new_shape[i] = max_generation_token_len; // Sanity check that only one left dimension is dynamic, as // another one should contain embedding space rank break; @@ -804,7 +805,8 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, con lm_head_model->reshape(new_shape); } -void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_dim) { +void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_dim, + std::size_t max_generation_token_len) { std::shared_ptr embed_result; for (auto&& output : model->outputs()) { if (output.get_any_name() == ov::npuw::LLMCompiledModel::output_embeds) { @@ -821,8 +823,8 @@ void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_di if (shape.size() == 3) { uint32_t num_embeds_dim = 1 - batch_dim; if (shape[num_embeds_dim] > 1) { - std::vector start_pos{static_cast(batch_dim * (shape[num_embeds_dim] - 1)), - static_cast(num_embeds_dim * (shape[num_embeds_dim] - 1)), + std::vector start_pos{static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), + static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), 0}; std::vector stop_pos{static_cast(batch_dim * (shape[num_embeds_dim] - 1)) + 1, static_cast(num_embeds_dim * (shape[num_embeds_dim] - 1)) + 1, @@ -1179,11 +1181,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m reshape_to_static(kvcache_model, m_kvcache_desc.max_generation_token_len, m_kvcache_desc.total_size, axes, m_max_lora_rank); if (lm_head_model) { LOG_DEBUG("Shared LM head: slice the prefill output"); - // KVCache model is already reshaped to [1, 1, embed size], so only apply slice to - // the Prefill model: - slice_out_embeds(prefill_model, axes.batch); + // KVCache model is already reshaped to [1, max_generation_token_len, embed size], + // so only apply slice to the Prefill model: + slice_out_embeds(prefill_model, axes.batch, m_kvcache_desc.max_generation_token_len); LOG_DEBUG("Make LM head model with static shapes"); - reshape_sliced_head_to_static(lm_head_model, axes.batch); + reshape_sliced_head_to_static(lm_head_model, axes.batch, m_kvcache_desc.max_generation_token_len); } LOG_DEBUG("5.1, decompose GroupQueryAttention OP"); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp 
b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index b7131581dc4db2..2745a53b71d782 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -838,7 +838,22 @@ void ov::npuw::LLMInferRequest::infer() { OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type()); // NB: Check the sequence length provided for input_ids - // in order to distinguish prefill / generate stages + // and start position idx in order to distinguish prefill + // and generate stages. + // Notes for Speculative Decoding: + // 1. If model is a draft one in speculative decoding setting, + // we expect it to be launched for more than 1 token only once, + // while all other candidates to be generated consequentively + // on previous token output. + // 2. If model is a main one in speculative decoding setting, + // then it will be launched on multiple tokens at every iteration. + // However, only the first iteration will take the input prompt + // of variable length, while others will be launched on fixed + // number of candidates, that can be easily done in generate phase + // if generate model is reshaped to output kvcache for such fixed + // number of candidates. To differentiate prefill and generate + // calls for main model, we just check that start position id + // is 0, meaning this is the first input prompt. if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == 0) { infer_prefill(input_ids, attention_mask, position_ids); } else { From f74a821cd814bd053022e7d88791e40c40fe5982 Mon Sep 17 00:00:00 2001 From: wenzengc Date: Fri, 8 Aug 2025 09:58:43 +0800 Subject: [PATCH 03/13] fix clang format Signed-off-by: wenzengc --- .../src/plugin/npuw/llm_compiled_model.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 39418a3566affc..5762ae52b3aa11 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -781,7 +781,8 @@ void reshape_to_static(std::shared_ptr model, model->reshape(new_shapes); } -void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim, +void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, + const uint32_t& batch_dim, std::size_t max_generation_token_len) { // We have only one input with dynamic shapes: output embeds. 
// Output embeds should have "max_generation_token_len" for dimension representing number of embeddings @@ -805,7 +806,8 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, con lm_head_model->reshape(new_shape); } -void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_dim, +void slice_out_embeds(std::shared_ptr model, + const uint32_t& batch_dim, std::size_t max_generation_token_len) { std::shared_ptr embed_result; for (auto&& output : model->outputs()) { @@ -823,9 +825,10 @@ void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_di if (shape.size() == 3) { uint32_t num_embeds_dim = 1 - batch_dim; if (shape[num_embeds_dim] > 1) { - std::vector start_pos{static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), - static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), - 0}; + std::vector start_pos{ + static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), + static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), + 0}; std::vector stop_pos{static_cast(batch_dim * (shape[num_embeds_dim] - 1)) + 1, static_cast(num_embeds_dim * (shape[num_embeds_dim] - 1)) + 1, static_cast(shape[2])}; @@ -1178,7 +1181,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m m_max_lora_rank); } LOG_DEBUG("Make kvcache model with static shapes"); - reshape_to_static(kvcache_model, m_kvcache_desc.max_generation_token_len, m_kvcache_desc.total_size, axes, m_max_lora_rank); + reshape_to_static(kvcache_model, + m_kvcache_desc.max_generation_token_len, + m_kvcache_desc.total_size, + axes, + m_max_lora_rank); if (lm_head_model) { LOG_DEBUG("Shared LM head: slice the prefill output"); // KVCache model is already reshaped to [1, max_generation_token_len, embed size], From 517bfcaf33da9326347c959e1be451636c20baea Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Wed, 20 Aug 2025 02:15:28 +0100 Subject: [PATCH 04/13] Supported dynamic number of output tokens for generate model only --- .../src/plugin/npuw/llm_infer_request.cpp | 68 ++++++++++++------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 2745a53b71d782..6ea81b61b8be59 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -566,7 +566,20 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( kvcache_desc.num_stored_tokens - num_tokens, kvcache_desc.num_stored_tokens); auto src_tensor = request->get_tensor(out_ports.at(output_name)); - copy_tensor_by_dim(src_tensor, dst_slice, kv_dim); + + // NOTE: Sometimes present kv layer can contain greater seq_len + // than was sent to be processed + uint32_t src_seq_len = static_cast(src_tensor->get_shape()[kv_dim]); + OPENVINO_ASSERT(num_tokens <= src_seq_len); + if (src_seq_len > num_tokens) { + auto src_slice = make_tensor_slice(src_tensor, + kv_dim, + src_seq_len - num_tokens, + src_seq_len); + copy_tensor_by_dim(src_slice, dst_slice, kv_dim); + } else { + copy_tensor_by_dim(src_tensor, dst_slice, kv_dim); + } } LOG_DEBUG("Done."); } @@ -748,9 +761,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, LOG_DEBUG("Calling inference for generate model..."); LOG_BLOCK(); auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; - auto in_token_len = input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM]; - if (in_token_len != 
kvcache_desc.max_generation_token_len) { - OPENVINO_THROW("Input lenth for KV cache model mismatch with \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", + uint32_t input_tokens_len = static_cast(input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM]); + if (input_tokens_len > kvcache_desc.max_generation_token_len) { + OPENVINO_THROW("Input prompt length is greater than output \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", kvcache_desc.max_generation_token_len, ".\nPlease adjust it "); } @@ -759,62 +772,69 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, LOG_DEBUG("Copy kv-cache from prefill to generate model."); copy_kvcache(); - LOG_DEBUG("Prepare attention mask pattern."); - auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); - fill_tensor(kv_attn_mask, 0); - // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. - // We can safely set this "1" once and then copy on one "1" less in the infer_generate(). - for (std::size_t i = 0; i < kvcache_desc.max_generation_token_len; i++) { - kv_attn_mask->data()[m_npuw_llm_compiled_model->m_kvcache_desc.total_size - i - 1] = 1; - } - + LOG_DEBUG("Prepare inputs."); + fill_tensor_bytes(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)), 0u); + fill_tensor(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)), 0); + fill_tensor(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)), 0); m_generate_initialized = true; } // NB: KV-cache is full, further generation is impossible - if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - input_tokens_len) { OPENVINO_THROW("KV-Cache is full."); } // FIXME: these tensors should be shared between the parent & child models auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); // NB: input_ids can be either fp32(VLM) or i64(LLM) + // NOTE: Copying to the end to handle case when input_tokens_len < kvcache_desc.max_generation_token_len std::copy_n(reinterpret_cast(input_ids->data()), input_ids->get_byte_size(), - reinterpret_cast(kv_input_ids->data())); + reinterpret_cast(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size()); - // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. - // As it is already set above, here we copy on one "1" unit less. + // NOTE: Attention mask pattern for generate model requires the set of "1" + // units of length of the current prompt on the right (for present + // kv layers) and the set of "1" units of number of previously calculated + // tokens on the left (for past kv layers). auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), - attention_mask->get_size() - kvcache_desc.max_generation_token_len, + // All tokens that we should process in current generate(), + // will go to the right of the mask (for present layers), so + // copy only mask from previous generate() calls to the left. 
+ attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); + if (input_tokens_len < kvcache_desc.max_generation_token_len) { + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, 0); + } + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, + input_tokens_len, 1); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); - std::copy_n(position_ids->data(), position_ids->get_size(), kv_pos_ids->data()); + pad_position_ids(kv_pos_ids, position_ids); m_kvcache_request->infer(); - kvcache_desc.num_stored_tokens += kvcache_desc.max_generation_token_len; + kvcache_desc.num_stored_tokens += input_tokens_len; if (m_lm_head_request) { LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, - kvcache_desc.max_generation_token_len); + input_tokens_len); } m_lm_head_request->wait(); LOG_DEBUG("Calling inference for LM head model -- done."); m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, - kvcache_desc.max_generation_token_len); + input_tokens_len); } m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits)); From 0f14e3fe4794c5b896d2dfe6f9523bf28112918c Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 25 Aug 2025 19:13:04 +0200 Subject: [PATCH 05/13] Fixed clang-format --- .../src/plugin/npuw/llm_infer_request.cpp | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index f89c9d6bec37fb..f3a1cfdc7baf0d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -590,10 +590,7 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( uint32_t src_seq_len = static_cast(src_tensor->get_shape()[kv_dim]); OPENVINO_ASSERT(num_tokens <= src_seq_len); if (src_seq_len > num_tokens) { - auto src_slice = make_tensor_slice(src_tensor, - kv_dim, - src_seq_len - num_tokens, - src_seq_len); + auto src_slice = make_tensor_slice(src_tensor, kv_dim, src_seq_len - num_tokens, src_seq_len); copy_tensor_by_dim(src_slice, dst_slice, kv_dim); } else { copy_tensor_by_dim(src_tensor, dst_slice, kv_dim); @@ -806,9 +803,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); // NB: input_ids can be either fp32(VLM) or i64(LLM) // NOTE: Copying to the end to handle case when input_tokens_len < kvcache_desc.max_generation_token_len - std::copy_n(reinterpret_cast(input_ids->data()), - input_ids->get_byte_size(), - reinterpret_cast(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size()); + std::copy_n( + 
reinterpret_cast(input_ids->data()), + input_ids->get_byte_size(), + reinterpret_cast(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size()); // NOTE: Attention mask pattern for generate model requires the set of "1" // units of length of the current prompt on the right (for present @@ -823,10 +821,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, kv_attn_mask->data()); if (input_tokens_len < kvcache_desc.max_generation_token_len) { std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, 0); + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); } - std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, - input_tokens_len, 1); + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); pad_position_ids(kv_pos_ids, position_ids); @@ -838,10 +836,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { - update_kvcache_for(m_kvcache_request, - m_kvcache_in_ports, - m_kvcache_out_ports, - input_tokens_len); + update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } m_lm_head_request->wait(); LOG_DEBUG("Calling inference for LM head model -- done."); @@ -849,10 +844,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { - update_kvcache_for(m_kvcache_request, - m_kvcache_in_ports, - m_kvcache_out_ports, - input_tokens_len); + update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits)); From c3fc6ba8c149400dd6b14dee9afcd8de46b404d3 Mon Sep 17 00:00:00 2001 From: wenzengc Date: Sun, 3 Aug 2025 12:05:57 +0800 Subject: [PATCH 06/13] Align number of outputs of generate model to NPU-friendly power of two --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index a785468e2d5522..5afa224ebded13 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -857,7 +857,10 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m KVAxesPosition axes{batch_dim, seq_len_dim}; uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); - const uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); + uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); + if (max_generation_token_len != 1) { + max_generation_token_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(), 8u); + } // If chunk size covers the entire 
prompt, just follow the static behavior. // Otherwise, use chunking and align the prompt size to the chunk size. From 137db087453dd96e7215bce2f27f883951445da2 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Wed, 3 Sep 2025 22:41:17 +0100 Subject: [PATCH 07/13] Polishing the PR --- .../src/plugin/include/properties.hpp | 1 + .../src/plugin/npuw/llm_compiled_model.cpp | 11 ++- .../src/plugin/npuw/llm_infer_request.cpp | 78 +++++++++++-------- .../src/plugin/npuw/serialization.hpp | 2 +- .../intel_npu/src/plugin/src/plugin.cpp | 1 + .../intel_npu/src/plugin/src/properties.cpp | 1 + 6 files changed, 58 insertions(+), 36 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/include/properties.hpp b/src/plugins/intel_npu/src/plugin/include/properties.hpp index d6f0b5f04fa9c3..4f27e8552e3df9 100644 --- a/src/plugins/intel_npu/src/plugin/include/properties.hpp +++ b/src/plugins/intel_npu/src/plugin/include/properties.hpp @@ -111,6 +111,7 @@ class Properties final { ov::intel_npu::npuw::llm::batch_dim.name(), ov::intel_npu::npuw::llm::seq_len_dim.name(), ov::intel_npu::npuw::llm::max_prompt_len.name(), + ov::intel_npu::npuw::llm::max_generation_token_len.name(), ov::intel_npu::npuw::llm::min_response_len.name(), ov::intel_npu::npuw::llm::optimize_v_tensors.name(), ov::intel_npu::npuw::llm::prefill_hint.name(), diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 5afa224ebded13..0b14900fccc94a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -503,8 +503,9 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim, std::size_t max_generation_token_len) { // We have only one input with dynamic shapes: output embeds. - // Output embeds should have "max_generation_token_len" for dimension representing number of embeddings - // to send to the matmul. Batch size should be equal "1" for NPU. + // Output embeds should have "max_generation_token_len" for dimension representing + // number of embeddings to send to the matmul. Batch size should be equal to "1" + // for NPU. const auto& input = lm_head_model->input(0); const auto& partial_shape = input.get_partial_shape(); NPUW_ASSERT(partial_shape.size() == 3); @@ -537,12 +538,12 @@ void slice_out_embeds(std::shared_ptr model, if (embed_result) { auto shape = embed_result->input(0).get_shape(); // If shape.size() is 3, then last axis should be the Vocab size. - // But 1st and 2nd axis can mean different things. + // But 1st and 2nd axes can mean different things. 
// 1st axis can represent the batch size, while 2nd - the number of embeddings, // or vice-versa (in chatglm) if (shape.size() == 3) { uint32_t num_embeds_dim = 1 - batch_dim; - if (shape[num_embeds_dim] > 1) { + if (shape[num_embeds_dim] > max_generation_token_len) { std::vector start_pos{ static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), @@ -1106,6 +1107,7 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw: write(model_stream, m_kvcache_desc.total_size); write(model_stream, m_kvcache_desc.num_stored_tokens); write(model_stream, m_kvcache_desc.dim); + write(model_stream, m_kvcache_desc.max_generation_token_len); write(model_stream, m_kvcache_desc.v_tensors_transposed); write(model_stream, m_prefill_chunk_size); write(model_stream, m_use_chunk_prefill); @@ -1314,6 +1316,7 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial read(model_stream, compiled->m_kvcache_desc.total_size); read(model_stream, compiled->m_kvcache_desc.num_stored_tokens); read(model_stream, compiled->m_kvcache_desc.dim); + read(model_stream, compiled->m_kvcache_desc.max_generation_token_len); read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed); read(model_stream, compiled->m_prefill_chunk_size); read(model_stream, compiled->m_use_chunk_prefill); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index f4f460fd8b0289..8a9e27b587bd8e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -178,14 +178,7 @@ void pad_position_ids(const ov::SoPtr& padded_position_ids, const o OPENVINO_ASSERT(position_shape.size() <= 3); - size_t diff_dim = 0; - for (size_t i = 0; i < padded_shape.size(); ++i) { - if (padded_shape[i] != position_shape[i]) { - diff_dim = i; - break; - } - } - + size_t diff_dim = position_shape.size() - 1; size_t keep_elements = padded_shape[diff_dim] - position_shape[diff_dim]; size_t batch_size = 1; @@ -601,9 +594,14 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( void ov::npuw::LLMInferRequest::trim_kvcache_for_speculative_decoding(ov::SoPtr position_ids) { auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + // FIXME: It can not work with OmniThinker for now. 
+ OPENVINO_ASSERT((position_ids->get_shape().size() >= 2) && (position_ids->get_shape().back() >= 1)); auto position_id = position_ids->data()[0]; auto dirty_num = kvcache_desc.num_stored_tokens - static_cast(position_id); - LOG_DEBUG("Update kv cache length from " << kvcache_desc.num_stored_tokens << " to " << position_id); + if (dirty_num > 0) { + LOG_DEBUG("Trim kv cache from " << kvcache_desc.num_stored_tokens << " length" + << " to " << position_id << " length"); + } kvcache_desc.num_stored_tokens -= dirty_num; } @@ -790,7 +788,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (input_tokens_len > kvcache_desc.max_generation_token_len) { OPENVINO_THROW("Input prompt length is greater than output \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", kvcache_desc.max_generation_token_len, - ".\nPlease adjust it "); + ".\nPlease adjust it."); } if (!m_generate_initialized) { @@ -805,14 +803,35 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, } // NB: KV-cache is full, further generation is impossible - if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - input_tokens_len) { + if (kvcache_desc.num_stored_tokens + input_tokens_len > kvcache_desc.total_size) { OPENVINO_THROW("KV-Cache is full."); } // FIXME: these tensors should be shared between the parent & child models - auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); // NB: input_ids can be either fp32(VLM) or i64(LLM) - // NOTE: Copying to the end to handle case when input_tokens_len < kvcache_desc.max_generation_token_len + auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); + auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); + auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); + + // NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which + // input layers of generation model are resized to, then we need to put + // `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors. + // We need to fill the the left unusable space with zeroes for attention mask, but + // better to do this for all tensors. + if (input_tokens_len < kvcache_desc.max_generation_token_len) { + std::fill_n(kv_input_ids->data() + kv_input_ids->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + std::fill_n(kv_pos_ids->data() + kv_pos_ids->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + } + + // NOTE: Copying to the end to handle the case when `input_tokens_len` < + // `kvcache_desc.max_generation_token_len` std::copy_n( reinterpret_cast(input_ids->data()), input_ids->get_byte_size(), @@ -824,16 +843,8 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // tokens on the left (for past kv layers). auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), - // All tokens that we should process in current generate(), - // will go to the right of the mask (for present layers), so - // copy only mask from previous generate() calls to the left. 
attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); - if (input_tokens_len < kvcache_desc.max_generation_token_len) { - std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - } std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); @@ -845,7 +856,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (m_lm_head_request) { LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { + if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } m_lm_head_request->wait(); @@ -853,7 +864,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { + if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } @@ -886,14 +897,19 @@ void ov::npuw::LLMInferRequest::infer() { // while all other candidates to be generated consequentively // on previous token output. // 2. If model is a main one in speculative decoding setting, - // then it will be launched on multiple tokens at every iteration. - // However, only the first iteration will take the input prompt - // of variable length, while others will be launched on fixed - // number of candidates, that can be easily done in generate phase - // if generate model is reshaped to output kvcache for such fixed - // number of candidates. To differentiate prefill and generate - // calls for main model, we just check that start position id - // is 0, meaning this is the first input prompt. + // then it can be launched on multiple tokens at every iteration. + // The first iteration will take the input prompt of variable + // length in range [0, NPUW_LLM_MAX_PROMPT_LEN], while others + // will be launched on variable number of candidates in range + // [0, NPUW_LLM_MAX_GENERATION_TOKEN_LEN]. + // NPUW_LLM_MAX_GENERATION_TOKEN_LEN is much lesser than + // NPUW_LLM_MAX_PROMPT_LEN. So, for second and next iterations + // generate model will be utilized, that is reshaped to take + // NPUW_LLM_MAX_GENERATION_TOKEN_LEN tokens and output the same + // number of logits. + // The outcome of two items is that prefill and generate stages + // can be safely differentiated by start position id for + // both main and draft models. 
if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == 0) { infer_prefill(input_ids, attention_mask, position_ids); } else { diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp index 613ad079d807bc..7a69bb2ce513cb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp @@ -34,7 +34,7 @@ const constexpr ov::npuw::s11n::IndicatorType NPUW_COMPILED_MODEL_INDICATOR = const constexpr ov::npuw::s11n::IndicatorType NPUW_LLM_COMPILED_MODEL_INDICATOR = {char{0x4c}, char{0x4c}, char{0x4d}, char{0x43}, char{0x4d}, char{0x4f}}; -const constexpr char* NPUW_SERIALIZATION_VERSION = "0.8"; +const constexpr char* NPUW_SERIALIZATION_VERSION = "0.10"; // Forward declaration namespace intel_npu { diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 4a6206c3731b4e..5f4d82cda9436d 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -329,6 +329,7 @@ void Plugin::init_options() { REGISTER_OPTION(NPUW_LLM_BATCH_DIM); REGISTER_OPTION(NPUW_LLM_SEQ_LEN_DIM); REGISTER_OPTION(NPUW_LLM_MAX_PROMPT_LEN); + REGISTER_OPTION(NPUW_LLM_MAX_GENERATION_TOKEN_LEN); REGISTER_OPTION(NPUW_LLM_MIN_RESPONSE_LEN); REGISTER_OPTION(NPUW_LLM_OPTIMIZE_V_TENSORS); REGISTER_OPTION(NPUW_LLM_CACHE_ROPE); diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp index 507048ac090dcf..9dfc7200465931 100644 --- a/src/plugins/intel_npu/src/plugin/src/properties.cpp +++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp @@ -446,6 +446,7 @@ void Properties::registerPluginProperties() { TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_generation_len, NPUW_LLM_MAX_GENERATION_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT); From 5ad3d5a9dced25588f3af452afbc7e5f5c0fdcc2 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 4 Sep 2025 12:29:41 +0100 Subject: [PATCH 08/13] Fixed review comment and build issues --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 6 ++++-- src/plugins/intel_npu/src/plugin/src/properties.cpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 8a9e27b587bd8e..03e9a737de37b6 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -179,6 +179,10 @@ void pad_position_ids(const ov::SoPtr& padded_position_ids, const o OPENVINO_ASSERT(position_shape.size() <= 3); size_t diff_dim = position_shape.size() - 1; + for (size_t i = 0; i < diff_dim; ++i) { + OPENVINO_ASSERT(padded_shape[i] == position_shape[i]); + } + size_t keep_elements = padded_shape[diff_dim] - position_shape[diff_dim]; 
size_t batch_size = 1; @@ -841,13 +845,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // units of length of the current prompt on the right (for present // kv layers) and the set of "1" units of number of previously calculated // tokens on the left (for past kv layers). - auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); - auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); pad_position_ids(kv_pos_ids, position_ids); m_kvcache_request->infer(); diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp index 9dfc7200465931..73be75a56caec2 100644 --- a/src/plugins/intel_npu/src/plugin/src/properties.cpp +++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp @@ -446,7 +446,7 @@ void Properties::registerPluginProperties() { TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN); - TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_generation_len, NPUW_LLM_MAX_GENERATION_LEN); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_generation_token_len, NPUW_LLM_MAX_GENERATION_TOKEN_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT); From 56e4aaa880d433238ebface222f05ae7d4b48929 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Fri, 5 Sep 2025 13:03:31 +0100 Subject: [PATCH 09/13] Removed extra changes --- .../src/plugin/npuw/llm_infer_request.cpp | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 03e9a737de37b6..1c30dc95f09c3d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -814,28 +814,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // FIXME: these tensors should be shared between the parent & child models // NB: input_ids can be either fp32(VLM) or i64(LLM) auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); - auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); - auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); - // NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which // input layers of generation model are resized to, then we need to put // `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors. - // We need to fill the the left unusable space with zeroes for attention mask, but - // better to do this for all tensors. 
- if (input_tokens_len < kvcache_desc.max_generation_token_len) { - std::fill_n(kv_input_ids->data() + kv_input_ids->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - std::fill_n(kv_pos_ids->data() + kv_pos_ids->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - } - - // NOTE: Copying to the end to handle the case when `input_tokens_len` < - // `kvcache_desc.max_generation_token_len` + // Attention mask should rule out all left unusable space. std::copy_n( reinterpret_cast(input_ids->data()), input_ids->get_byte_size(), @@ -845,11 +827,18 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // units of length of the current prompt on the right (for present // kv layers) and the set of "1" units of number of previously calculated // tokens on the left (for past kv layers). + auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); + if (input_tokens_len < kvcache_desc.max_generation_token_len) { + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + } std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); + auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); pad_position_ids(kv_pos_ids, position_ids); m_kvcache_request->infer(); From 1fed6a2e247623fdb08454f139c558589a2f62d8 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 5 Sep 2025 15:47:46 +0200 Subject: [PATCH 10/13] Applied review comments Co-authored-by: Dmitry Matveev --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 0b14900fccc94a..211bd639f99775 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -860,7 +860,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); if (max_generation_token_len != 1) { - max_generation_token_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(), 8u); + max_generation_token_len = align_to(max_generation_token_len, 8u); } // If chunk size covers the entire prompt, just follow the static behavior. 
From 8493155775f4f8eb2424d9992ada561cf26a4246 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 5 Sep 2025 15:49:05 +0200 Subject: [PATCH 11/13] Update llm_compiled_model.cpp --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 211bd639f99775..cdf886efbbced0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -885,7 +885,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m LOG_VERB("Prefill chunk size: " << m_prefill_chunk_size); LOG_VERB("Maximum prompt length: " << max_prompt_len); - m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, max_generation_token_len}; + m_kvcache_desc = + KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, max_generation_token_len}; LOG_DEBUG("Make prefill model with static shapes"); m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>(); From d93bb29f7c9ebb873894c2163f6626912b69173f Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 5 Sep 2025 15:50:15 +0200 Subject: [PATCH 12/13] Update llm_infer_request.cpp --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 1c30dc95f09c3d..8861c9d419a918 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -604,7 +604,7 @@ void ov::npuw::LLMInferRequest::trim_kvcache_for_speculative_decoding(ov::SoPtr< auto dirty_num = kvcache_desc.num_stored_tokens - static_cast(position_id); if (dirty_num > 0) { LOG_DEBUG("Trim kv cache from " << kvcache_desc.num_stored_tokens << " length" - << " to " << position_id << " length"); + << " to " << position_id << " length"); } kvcache_desc.num_stored_tokens -= dirty_num; } From 71c85250611b09db6934e674f0a241641c119bc7 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 8 Sep 2025 16:33:01 +0200 Subject: [PATCH 13/13] Blob version fixed to 0.9 --- src/plugins/intel_npu/src/plugin/npuw/serialization.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp index 7a69bb2ce513cb..77e90c13f06c25 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp @@ -34,7 +34,7 @@ const constexpr ov::npuw::s11n::IndicatorType NPUW_COMPILED_MODEL_INDICATOR = const constexpr ov::npuw::s11n::IndicatorType NPUW_LLM_COMPILED_MODEL_INDICATOR = {char{0x4c}, char{0x4c}, char{0x4d}, char{0x43}, char{0x4d}, char{0x4f}}; -const constexpr char* NPUW_SERIALIZATION_VERSION = "0.10"; +const constexpr char* NPUW_SERIALIZATION_VERSION = "0.9"; // Forward declaration namespace intel_npu {