From f0f8ea582e142d6a7a5cc02836134c72d486d44a Mon Sep 17 00:00:00 2001 From: wenzengc Date: Sun, 3 Aug 2025 12:05:57 +0800 Subject: [PATCH 01/13] support generating more than 1 token per inference Signed-off-by: wenzengc --- .../src/al/include/intel_npu/config/npuw.hpp | 1 + .../intel_npu/npuw_private_properties.hpp | 8 ++++ .../intel_npu/src/al/src/config/npuw.cpp | 1 + .../src/plugin/npuw/llm_compiled_model.cpp | 9 +++- .../src/plugin/npuw/llm_compiled_model.hpp | 1 + .../src/plugin/npuw/llm_infer_request.cpp | 45 ++++++++++++++----- .../src/plugin/npuw/llm_infer_request.hpp | 1 + 7 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 61455f23d46c0a..e76678e89793c8 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -107,6 +107,7 @@ DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, RunTime); DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, RunTime); DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, RunTime); DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, RunTime); +DEFINE_OPT(NPUW_LLM_MAX_GENERATION_TOKEN_LEN, uint32_t, 1, npuw::llm::max_generation_token_len, RunTime); DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, RunTime); DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, true, npuw::llm::optimize_v_tensors, RunTime); DEFINE_OPT(NPUW_LLM_PREFILL_CHUNK_SIZE, uint64_t, 256, npuw::llm::prefill_chunk_size, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index d68259b6949ec6..1f9a526aba00be 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -423,6 +423,14 @@ static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"}; */ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"}; +/** + * @brief + * Type: uint32_t. + * Desirable max number of input tokens the generate model accepts per inference. + * Default value: 1. + */ +static constexpr ov::Property<uint32_t> max_generation_token_len{"NPUW_LLM_MAX_GENERATION_TOKEN_LEN"}; + /** * @brief * Type: uint32_t. 
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 11e266fca25d8d..9cb3376242157f 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -65,6 +65,7 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add<NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index dd9f791f46bd48..2f64c678e14ad9 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -955,6 +955,9 @@ ov::AnyMap get_default_generate_config(const std::optional& npudesc, if (hint == ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE) { config.emplace("NPUW_UNFOLD_IREQS", "YES"); } + // We don't need slice out for the kv cache model, especially for speculative decoding, which needs + // to generate more than 1 token per inference + config.erase("NPUW_SLICE_OUT"); return config; } @@ -1124,6 +1127,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m KVAxesPosition axes{batch_dim, seq_len_dim}; const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); + const uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); // NB: PREFILL_HINT is now compatible with the PREFILL_CONFIG section, unlike for // the generate model they're not mutually exclusive @@ -1154,7 +1158,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m } } - m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim}; + m_kvcache_desc = + KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, max_generation_token_len}; LOG_DEBUG("Make prefill model with static shapes"); m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>(); if (m_use_chunk_prefill) { @@ -1171,7 +1176,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m m_max_lora_rank); } LOG_DEBUG("Make kvcache model with static shapes"); - reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes, m_max_lora_rank); + reshape_to_static(kvcache_model, m_kvcache_desc.max_generation_token_len, m_kvcache_desc.total_size, axes, m_max_lora_rank); if (lm_head_model) { LOG_DEBUG("Shared LM head: slice the prefill output"); // KVCache model is already reshaped to [1, 1, embed size], so only apply slice to diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp index fbeedcbe809969..36445f2858c246 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp @@ -24,6 +24,7 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel { uint32_t total_size = 0u; uint32_t num_stored_tokens = 0u; uint32_t dim = 0u; + uint32_t max_generation_token_len = 0u; bool v_tensors_transposed = false; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index b629a175290617..b7131581dc4db2 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ 
b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -571,6 +571,14 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( LOG_DEBUG("Done."); } +void ov::npuw::LLMInferRequest::trim_kvcache_for_speculative_decoding(ov::SoPtr position_ids) { + auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + auto position_id = position_ids->data()[0]; + auto dirty_num = kvcache_desc.num_stored_tokens - static_cast(position_id); + LOG_DEBUG("Update kv cache length from " << kvcache_desc.num_stored_tokens << " to " << position_id); + kvcache_desc.num_stored_tokens -= dirty_num; +} + void ov::npuw::LLMInferRequest::clear_chunk_prefill_kv_cache() { const auto& prefill_compiled = m_prefill_request->get_compiled_model(); @@ -739,6 +747,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, ov::SoPtr position_ids) { LOG_DEBUG("Calling inference for generate model..."); LOG_BLOCK(); + auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + auto in_token_len = input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM]; + if (in_token_len != kvcache_desc.max_generation_token_len) { + OPENVINO_THROW("Input lenth for KV cache model mismatch with \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", + kvcache_desc.max_generation_token_len, + ".\nPlease adjust it "); + } if (!m_generate_initialized) { LOG_DEBUG("Copy kv-cache from prefill to generate model."); @@ -749,14 +764,15 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, fill_tensor(kv_attn_mask, 0); // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. // We can safely set this "1" once and then copy on one "1" less in the infer_generate(). - kv_attn_mask->data()[m_npuw_llm_compiled_model->m_kvcache_desc.total_size - 1] = 1; + for (std::size_t i = 0; i < kvcache_desc.max_generation_token_len; i++) { + kv_attn_mask->data()[m_npuw_llm_compiled_model->m_kvcache_desc.total_size - i - 1] = 1; + } m_generate_initialized = true; } - auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; // NB: KV-cache is full, further generation is impossible - if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) { + if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { OPENVINO_THROW("KV-Cache is full."); } @@ -770,27 +786,35 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. // As it is already set above, here we copy on one "1" unit less. 
auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); - std::copy_n(attention_mask->data(), attention_mask->get_size() - 1, kv_attn_mask->data()); + std::copy_n(attention_mask->data(), + attention_mask->get_size() - kvcache_desc.max_generation_token_len, + kv_attn_mask->data()); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); std::copy_n(position_ids->data(), position_ids->get_size(), kv_pos_ids->data()); m_kvcache_request->infer(); - kvcache_desc.num_stored_tokens += 1; + kvcache_desc.num_stored_tokens += kvcache_desc.max_generation_token_len; if (m_lm_head_request) { LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); - if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { - update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, 1); + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + update_kvcache_for(m_kvcache_request, + m_kvcache_in_ports, + m_kvcache_out_ports, + kvcache_desc.max_generation_token_len); } m_lm_head_request->wait(); LOG_DEBUG("Calling inference for LM head model -- done."); m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { - if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { - update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, 1); + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + update_kvcache_for(m_kvcache_request, + m_kvcache_in_ports, + m_kvcache_out_ports, + kvcache_desc.max_generation_token_len); } m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits)); @@ -815,9 +839,10 @@ void ov::npuw::LLMInferRequest::infer() { // NB: Check the sequence length provided for input_ids // in order to distinguish prefill / generate stages - if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] != 1) { + if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == 0) { infer_prefill(input_ids, attention_mask, position_ids); } else { + trim_kvcache_for_speculative_decoding(position_ids); infer_generate(input_ids, attention_mask, position_ids); } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp index 7197ef4f19fe54..86ccfef6e41700 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp @@ -52,6 +52,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest { std::unordered_map> in_ports, std::unordered_map> out_ports, uint32_t tokens); + void trim_kvcache_for_speculative_decoding(ov::SoPtr position_ids); void infer_chunked_prefill(ov::SoPtr input_ids, ov::SoPtr attention_mask, From 25fb0562a281c8fcf3ee60e10fc9acbe29ff9e20 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 7 Aug 2025 18:25:28 +0100 Subject: [PATCH 02/13] Fixes for 3-model pipeline --- .../src/plugin/npuw/llm_compiled_model.cpp | 26 ++++++++++--------- .../src/plugin/npuw/llm_infer_request.cpp | 17 +++++++++++- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 2f64c678e14ad9..39418a3566affc 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ 
b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -781,10 +781,11 @@ void reshape_to_static(std::shared_ptr model, model->reshape(new_shapes); } -void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim) { - // We have only one input with dynamic shapes: output of Slice operation, and this output - // should have "1" for dimension representing number of embeddings to send to the matmul. - // Batch size should be also equal "1" for NPU. +void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim, + std::size_t max_generation_token_len) { + // We have only one input with dynamic shapes: output embeds. + // Output embeds should have "max_generation_token_len" for dimension representing number of embeddings + // to send to the matmul. Batch size should be equal "1" for NPU. const auto& input = lm_head_model->input(0); const auto& partial_shape = input.get_partial_shape(); NPUW_ASSERT(partial_shape.size() == 3); @@ -794,7 +795,7 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, con // Left dynamic axis will be for number of embeddings for (auto i = 0; i < new_shape.rank().get_length(); i++) { if (new_shape[i].is_dynamic()) { - new_shape[i] = 1; + new_shape[i] = max_generation_token_len; // Sanity check that only one left dimension is dynamic, as // another one should contain embedding space rank break; @@ -804,7 +805,8 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, con lm_head_model->reshape(new_shape); } -void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_dim) { +void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_dim, + std::size_t max_generation_token_len) { std::shared_ptr embed_result; for (auto&& output : model->outputs()) { if (output.get_any_name() == ov::npuw::LLMCompiledModel::output_embeds) { @@ -821,8 +823,8 @@ void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_di if (shape.size() == 3) { uint32_t num_embeds_dim = 1 - batch_dim; if (shape[num_embeds_dim] > 1) { - std::vector start_pos{static_cast(batch_dim * (shape[num_embeds_dim] - 1)), - static_cast(num_embeds_dim * (shape[num_embeds_dim] - 1)), + std::vector start_pos{static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), + static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), 0}; std::vector stop_pos{static_cast(batch_dim * (shape[num_embeds_dim] - 1)) + 1, static_cast(num_embeds_dim * (shape[num_embeds_dim] - 1)) + 1, @@ -1179,11 +1181,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m reshape_to_static(kvcache_model, m_kvcache_desc.max_generation_token_len, m_kvcache_desc.total_size, axes, m_max_lora_rank); if (lm_head_model) { LOG_DEBUG("Shared LM head: slice the prefill output"); - // KVCache model is already reshaped to [1, 1, embed size], so only apply slice to - // the Prefill model: - slice_out_embeds(prefill_model, axes.batch); + // KVCache model is already reshaped to [1, max_generation_token_len, embed size], + // so only apply slice to the Prefill model: + slice_out_embeds(prefill_model, axes.batch, m_kvcache_desc.max_generation_token_len); LOG_DEBUG("Make LM head model with static shapes"); - reshape_sliced_head_to_static(lm_head_model, axes.batch); + reshape_sliced_head_to_static(lm_head_model, axes.batch, m_kvcache_desc.max_generation_token_len); } LOG_DEBUG("5.1, decompose GroupQueryAttention OP"); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp 
b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index b7131581dc4db2..2745a53b71d782 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -838,7 +838,22 @@ void ov::npuw::LLMInferRequest::infer() { OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type()); // NB: Check the sequence length provided for input_ids - // in order to distinguish prefill / generate stages + // and start position idx in order to distinguish prefill + // and generate stages. + // Notes for Speculative Decoding: + // 1. If model is a draft one in speculative decoding setting, + // we expect it to be launched for more than 1 token only once, + // while all other candidates to be generated consequentively + // on previous token output. + // 2. If model is a main one in speculative decoding setting, + // then it will be launched on multiple tokens at every iteration. + // However, only the first iteration will take the input prompt + // of variable length, while others will be launched on fixed + // number of candidates, that can be easily done in generate phase + // if generate model is reshaped to output kvcache for such fixed + // number of candidates. To differentiate prefill and generate + // calls for main model, we just check that start position id + // is 0, meaning this is the first input prompt. if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == 0) { infer_prefill(input_ids, attention_mask, position_ids); } else { From f74a821cd814bd053022e7d88791e40c40fe5982 Mon Sep 17 00:00:00 2001 From: wenzengc Date: Fri, 8 Aug 2025 09:58:43 +0800 Subject: [PATCH 03/13] fix clang format Signed-off-by: wenzengc --- .../src/plugin/npuw/llm_compiled_model.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 39418a3566affc..5762ae52b3aa11 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -781,7 +781,8 @@ void reshape_to_static(std::shared_ptr model, model->reshape(new_shapes); } -void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim, +void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, + const uint32_t& batch_dim, std::size_t max_generation_token_len) { // We have only one input with dynamic shapes: output embeds. 
// Output embeds should have "max_generation_token_len" for dimension representing number of embeddings @@ -805,7 +806,8 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, con lm_head_model->reshape(new_shape); } -void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_dim, +void slice_out_embeds(std::shared_ptr model, + const uint32_t& batch_dim, std::size_t max_generation_token_len) { std::shared_ptr embed_result; for (auto&& output : model->outputs()) { @@ -823,9 +825,10 @@ void slice_out_embeds(std::shared_ptr model, const uint32_t& batch_di if (shape.size() == 3) { uint32_t num_embeds_dim = 1 - batch_dim; if (shape[num_embeds_dim] > 1) { - std::vector start_pos{static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), - static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), - 0}; + std::vector start_pos{ + static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), + static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), + 0}; std::vector stop_pos{static_cast(batch_dim * (shape[num_embeds_dim] - 1)) + 1, static_cast(num_embeds_dim * (shape[num_embeds_dim] - 1)) + 1, static_cast(shape[2])}; @@ -1178,7 +1181,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m m_max_lora_rank); } LOG_DEBUG("Make kvcache model with static shapes"); - reshape_to_static(kvcache_model, m_kvcache_desc.max_generation_token_len, m_kvcache_desc.total_size, axes, m_max_lora_rank); + reshape_to_static(kvcache_model, + m_kvcache_desc.max_generation_token_len, + m_kvcache_desc.total_size, + axes, + m_max_lora_rank); if (lm_head_model) { LOG_DEBUG("Shared LM head: slice the prefill output"); // KVCache model is already reshaped to [1, max_generation_token_len, embed size], From 517bfcaf33da9326347c959e1be451636c20baea Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Wed, 20 Aug 2025 02:15:28 +0100 Subject: [PATCH 04/13] Supported dynamic number of output tokens for generate model only --- .../src/plugin/npuw/llm_infer_request.cpp | 68 ++++++++++++------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 2745a53b71d782..6ea81b61b8be59 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -566,7 +566,20 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( kvcache_desc.num_stored_tokens - num_tokens, kvcache_desc.num_stored_tokens); auto src_tensor = request->get_tensor(out_ports.at(output_name)); - copy_tensor_by_dim(src_tensor, dst_slice, kv_dim); + + // NOTE: Sometimes present kv layer can contain greater seq_len + // than was sent to be processed + uint32_t src_seq_len = static_cast(src_tensor->get_shape()[kv_dim]); + OPENVINO_ASSERT(num_tokens <= src_seq_len); + if (src_seq_len > num_tokens) { + auto src_slice = make_tensor_slice(src_tensor, + kv_dim, + src_seq_len - num_tokens, + src_seq_len); + copy_tensor_by_dim(src_slice, dst_slice, kv_dim); + } else { + copy_tensor_by_dim(src_tensor, dst_slice, kv_dim); + } } LOG_DEBUG("Done."); } @@ -748,9 +761,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, LOG_DEBUG("Calling inference for generate model..."); LOG_BLOCK(); auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; - auto in_token_len = input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM]; - if (in_token_len != 
kvcache_desc.max_generation_token_len) { - OPENVINO_THROW("Input lenth for KV cache model mismatch with \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", + uint32_t input_tokens_len = static_cast(input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM]); + if (input_tokens_len > kvcache_desc.max_generation_token_len) { + OPENVINO_THROW("Input prompt length is greater than output \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", kvcache_desc.max_generation_token_len, ".\nPlease adjust it "); } @@ -759,62 +772,69 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, LOG_DEBUG("Copy kv-cache from prefill to generate model."); copy_kvcache(); - LOG_DEBUG("Prepare attention mask pattern."); - auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); - fill_tensor(kv_attn_mask, 0); - // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. - // We can safely set this "1" once and then copy on one "1" less in the infer_generate(). - for (std::size_t i = 0; i < kvcache_desc.max_generation_token_len; i++) { - kv_attn_mask->data()[m_npuw_llm_compiled_model->m_kvcache_desc.total_size - i - 1] = 1; - } - + LOG_DEBUG("Prepare inputs."); + fill_tensor_bytes(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)), 0u); + fill_tensor(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)), 0); + fill_tensor(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)), 0); m_generate_initialized = true; } // NB: KV-cache is full, further generation is impossible - if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - input_tokens_len) { OPENVINO_THROW("KV-Cache is full."); } // FIXME: these tensors should be shared between the parent & child models auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); // NB: input_ids can be either fp32(VLM) or i64(LLM) + // NOTE: Copying to the end to handle case when input_tokens_len < kvcache_desc.max_generation_token_len std::copy_n(reinterpret_cast(input_ids->data()), input_ids->get_byte_size(), - reinterpret_cast(kv_input_ids->data())); + reinterpret_cast(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size()); - // NOTE: Attention mask pattern for generate model requires last "1" to be in the end of the mask. - // As it is already set above, here we copy on one "1" unit less. + // NOTE: Attention mask pattern for generate model requires the set of "1" + // units of length of the current prompt on the right (for present + // kv layers) and the set of "1" units of number of previously calculated + // tokens on the left (for past kv layers). auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), - attention_mask->get_size() - kvcache_desc.max_generation_token_len, + // All tokens that we should process in current generate(), + // will go to the right of the mask (for present layers), so + // copy only mask from previous generate() calls to the left. 
+ attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); + if (input_tokens_len < kvcache_desc.max_generation_token_len) { + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, 0); + } + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, + input_tokens_len, 1); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); - std::copy_n(position_ids->data(), position_ids->get_size(), kv_pos_ids->data()); + pad_position_ids(kv_pos_ids, position_ids); m_kvcache_request->infer(); - kvcache_desc.num_stored_tokens += kvcache_desc.max_generation_token_len; + kvcache_desc.num_stored_tokens += input_tokens_len; if (m_lm_head_request) { LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, - kvcache_desc.max_generation_token_len); + input_tokens_len); } m_lm_head_request->wait(); LOG_DEBUG("Calling inference for LM head model -- done."); m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - kvcache_desc.max_generation_token_len) { + if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, - kvcache_desc.max_generation_token_len); + input_tokens_len); } m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits)); From 0f14e3fe4794c5b896d2dfe6f9523bf28112918c Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 25 Aug 2025 19:13:04 +0200 Subject: [PATCH 05/13] Fixed clang-format --- .../src/plugin/npuw/llm_infer_request.cpp | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index f89c9d6bec37fb..f3a1cfdc7baf0d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -590,10 +590,7 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( uint32_t src_seq_len = static_cast(src_tensor->get_shape()[kv_dim]); OPENVINO_ASSERT(num_tokens <= src_seq_len); if (src_seq_len > num_tokens) { - auto src_slice = make_tensor_slice(src_tensor, - kv_dim, - src_seq_len - num_tokens, - src_seq_len); + auto src_slice = make_tensor_slice(src_tensor, kv_dim, src_seq_len - num_tokens, src_seq_len); copy_tensor_by_dim(src_slice, dst_slice, kv_dim); } else { copy_tensor_by_dim(src_tensor, dst_slice, kv_dim); @@ -806,9 +803,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); // NB: input_ids can be either fp32(VLM) or i64(LLM) // NOTE: Copying to the end to handle case when input_tokens_len < kvcache_desc.max_generation_token_len - std::copy_n(reinterpret_cast(input_ids->data()), - input_ids->get_byte_size(), - reinterpret_cast(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size()); + std::copy_n( + 
reinterpret_cast(input_ids->data()), + input_ids->get_byte_size(), + reinterpret_cast(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size()); // NOTE: Attention mask pattern for generate model requires the set of "1" // units of length of the current prompt on the right (for present @@ -823,10 +821,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, kv_attn_mask->data()); if (input_tokens_len < kvcache_desc.max_generation_token_len) { std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, 0); + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); } - std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, - input_tokens_len, 1); + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); pad_position_ids(kv_pos_ids, position_ids); @@ -838,10 +836,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { - update_kvcache_for(m_kvcache_request, - m_kvcache_in_ports, - m_kvcache_out_ports, - input_tokens_len); + update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } m_lm_head_request->wait(); LOG_DEBUG("Calling inference for LM head model -- done."); @@ -849,10 +844,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { - update_kvcache_for(m_kvcache_request, - m_kvcache_in_ports, - m_kvcache_out_ports, - input_tokens_len); + update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits)); From c3fc6ba8c149400dd6b14dee9afcd8de46b404d3 Mon Sep 17 00:00:00 2001 From: wenzengc Date: Sun, 3 Aug 2025 12:05:57 +0800 Subject: [PATCH 06/13] Align number of outputs of generate model to NPU-friendly power of two --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index a785468e2d5522..5afa224ebded13 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -857,7 +857,10 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m KVAxesPosition axes{batch_dim, seq_len_dim}; uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); - const uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); + uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); + if (max_generation_token_len != 1) { + max_generation_token_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(), 8u); + } // If chunk size covers the entire 
prompt, just follow the static behavior. // Otherwise, use chunking and align the prompt size to the chunk size. From 137db087453dd96e7215bce2f27f883951445da2 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Wed, 3 Sep 2025 22:41:17 +0100 Subject: [PATCH 07/13] Polishing the PR --- .../src/plugin/include/properties.hpp | 1 + .../src/plugin/npuw/llm_compiled_model.cpp | 11 ++- .../src/plugin/npuw/llm_infer_request.cpp | 78 +++++++++++-------- .../src/plugin/npuw/serialization.hpp | 2 +- .../intel_npu/src/plugin/src/plugin.cpp | 1 + .../intel_npu/src/plugin/src/properties.cpp | 1 + 6 files changed, 58 insertions(+), 36 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/include/properties.hpp b/src/plugins/intel_npu/src/plugin/include/properties.hpp index d6f0b5f04fa9c3..4f27e8552e3df9 100644 --- a/src/plugins/intel_npu/src/plugin/include/properties.hpp +++ b/src/plugins/intel_npu/src/plugin/include/properties.hpp @@ -111,6 +111,7 @@ class Properties final { ov::intel_npu::npuw::llm::batch_dim.name(), ov::intel_npu::npuw::llm::seq_len_dim.name(), ov::intel_npu::npuw::llm::max_prompt_len.name(), + ov::intel_npu::npuw::llm::max_generation_token_len.name(), ov::intel_npu::npuw::llm::min_response_len.name(), ov::intel_npu::npuw::llm::optimize_v_tensors.name(), ov::intel_npu::npuw::llm::prefill_hint.name(), diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 5afa224ebded13..0b14900fccc94a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -503,8 +503,9 @@ void reshape_sliced_head_to_static(std::shared_ptr lm_head_model, const uint32_t& batch_dim, std::size_t max_generation_token_len) { // We have only one input with dynamic shapes: output embeds. - // Output embeds should have "max_generation_token_len" for dimension representing number of embeddings - // to send to the matmul. Batch size should be equal "1" for NPU. + // Output embeds should have "max_generation_token_len" for dimension representing + // number of embeddings to send to the matmul. Batch size should be equal to "1" + // for NPU. const auto& input = lm_head_model->input(0); const auto& partial_shape = input.get_partial_shape(); NPUW_ASSERT(partial_shape.size() == 3); @@ -537,12 +538,12 @@ void slice_out_embeds(std::shared_ptr model, if (embed_result) { auto shape = embed_result->input(0).get_shape(); // If shape.size() is 3, then last axis should be the Vocab size. - // But 1st and 2nd axis can mean different things. + // But 1st and 2nd axes can mean different things. 
// 1st axis can represent the batch size, while 2nd - the number of embeddings, // or vice-versa (in chatglm) if (shape.size() == 3) { uint32_t num_embeds_dim = 1 - batch_dim; - if (shape[num_embeds_dim] > 1) { + if (shape[num_embeds_dim] > max_generation_token_len) { std::vector start_pos{ static_cast(batch_dim * (shape[num_embeds_dim] - max_generation_token_len)), static_cast(num_embeds_dim * (shape[num_embeds_dim] - max_generation_token_len)), @@ -1106,6 +1107,7 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw: write(model_stream, m_kvcache_desc.total_size); write(model_stream, m_kvcache_desc.num_stored_tokens); write(model_stream, m_kvcache_desc.dim); + write(model_stream, m_kvcache_desc.max_generation_token_len); write(model_stream, m_kvcache_desc.v_tensors_transposed); write(model_stream, m_prefill_chunk_size); write(model_stream, m_use_chunk_prefill); @@ -1314,6 +1316,7 @@ std::shared_ptr ov::npuw::LLMCompiledModel::deserial read(model_stream, compiled->m_kvcache_desc.total_size); read(model_stream, compiled->m_kvcache_desc.num_stored_tokens); read(model_stream, compiled->m_kvcache_desc.dim); + read(model_stream, compiled->m_kvcache_desc.max_generation_token_len); read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed); read(model_stream, compiled->m_prefill_chunk_size); read(model_stream, compiled->m_use_chunk_prefill); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index f4f460fd8b0289..8a9e27b587bd8e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -178,14 +178,7 @@ void pad_position_ids(const ov::SoPtr& padded_position_ids, const o OPENVINO_ASSERT(position_shape.size() <= 3); - size_t diff_dim = 0; - for (size_t i = 0; i < padded_shape.size(); ++i) { - if (padded_shape[i] != position_shape[i]) { - diff_dim = i; - break; - } - } - + size_t diff_dim = position_shape.size() - 1; size_t keep_elements = padded_shape[diff_dim] - position_shape[diff_dim]; size_t batch_size = 1; @@ -601,9 +594,14 @@ void ov::npuw::LLMInferRequest::update_kvcache_for( void ov::npuw::LLMInferRequest::trim_kvcache_for_speculative_decoding(ov::SoPtr position_ids) { auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc; + // FIXME: It can not work with OmniThinker for now. 
+ OPENVINO_ASSERT((position_ids->get_shape().size() >= 2) && (position_ids->get_shape().back() >= 1)); auto position_id = position_ids->data()[0]; auto dirty_num = kvcache_desc.num_stored_tokens - static_cast(position_id); - LOG_DEBUG("Update kv cache length from " << kvcache_desc.num_stored_tokens << " to " << position_id); + if (dirty_num > 0) { + LOG_DEBUG("Trim kv cache from " << kvcache_desc.num_stored_tokens << " length" + << " to " << position_id << " length"); + } kvcache_desc.num_stored_tokens -= dirty_num; } @@ -790,7 +788,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (input_tokens_len > kvcache_desc.max_generation_token_len) { OPENVINO_THROW("Input prompt length is greater than output \"NPUW_LLM_MAX_GENERATION_TOKEN_LEN\": ", kvcache_desc.max_generation_token_len, - ".\nPlease adjust it "); + ".\nPlease adjust it."); } if (!m_generate_initialized) { @@ -805,14 +803,35 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, } // NB: KV-cache is full, further generation is impossible - if (kvcache_desc.num_stored_tokens > kvcache_desc.total_size - input_tokens_len) { + if (kvcache_desc.num_stored_tokens + input_tokens_len > kvcache_desc.total_size) { OPENVINO_THROW("KV-Cache is full."); } // FIXME: these tensors should be shared between the parent & child models - auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); // NB: input_ids can be either fp32(VLM) or i64(LLM) - // NOTE: Copying to the end to handle case when input_tokens_len < kvcache_desc.max_generation_token_len + auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); + auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); + auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); + + // NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which + // input layers of generation model are resized to, then we need to put + // `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors. + // We need to fill the the left unusable space with zeroes for attention mask, but + // better to do this for all tensors. + if (input_tokens_len < kvcache_desc.max_generation_token_len) { + std::fill_n(kv_input_ids->data() + kv_input_ids->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + std::fill_n(kv_pos_ids->data() + kv_pos_ids->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + } + + // NOTE: Copying to the end to handle the case when `input_tokens_len` < + // `kvcache_desc.max_generation_token_len` std::copy_n( reinterpret_cast(input_ids->data()), input_ids->get_byte_size(), @@ -824,16 +843,8 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // tokens on the left (for past kv layers). auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), - // All tokens that we should process in current generate(), - // will go to the right of the mask (for present layers), so - // copy only mask from previous generate() calls to the left. 
attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); - if (input_tokens_len < kvcache_desc.max_generation_token_len) { - std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - } std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); @@ -845,7 +856,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, if (m_lm_head_request) { LOG_DEBUG("Calling inference for LM head model asynchronously"); m_lm_head_request->start_async(); - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { + if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } m_lm_head_request->wait(); @@ -853,7 +864,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port); } else { - if (kvcache_desc.num_stored_tokens <= kvcache_desc.total_size - input_tokens_len) { + if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) { update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, input_tokens_len); } @@ -886,14 +897,19 @@ void ov::npuw::LLMInferRequest::infer() { // while all other candidates to be generated consequentively // on previous token output. // 2. If model is a main one in speculative decoding setting, - // then it will be launched on multiple tokens at every iteration. - // However, only the first iteration will take the input prompt - // of variable length, while others will be launched on fixed - // number of candidates, that can be easily done in generate phase - // if generate model is reshaped to output kvcache for such fixed - // number of candidates. To differentiate prefill and generate - // calls for main model, we just check that start position id - // is 0, meaning this is the first input prompt. + // then it can be launched on multiple tokens at every iteration. + // The first iteration will take the input prompt of variable + // length in range [0, NPUW_LLM_MAX_PROMPT_LEN], while others + // will be launched on variable number of candidates in range + // [0, NPUW_LLM_MAX_GENERATION_TOKEN_LEN]. + // NPUW_LLM_MAX_GENERATION_TOKEN_LEN is much lesser than + // NPUW_LLM_MAX_PROMPT_LEN. So, for second and next iterations + // generate model will be utilized, that is reshaped to take + // NPUW_LLM_MAX_GENERATION_TOKEN_LEN tokens and output the same + // number of logits. + // The outcome of two items is that prefill and generate stages + // can be safely differentiated by start position id for + // both main and draft models. 
if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data()[0] == 0) { infer_prefill(input_ids, attention_mask, position_ids); } else { diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp index 613ad079d807bc..7a69bb2ce513cb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp @@ -34,7 +34,7 @@ const constexpr ov::npuw::s11n::IndicatorType NPUW_COMPILED_MODEL_INDICATOR = const constexpr ov::npuw::s11n::IndicatorType NPUW_LLM_COMPILED_MODEL_INDICATOR = {char{0x4c}, char{0x4c}, char{0x4d}, char{0x43}, char{0x4d}, char{0x4f}}; -const constexpr char* NPUW_SERIALIZATION_VERSION = "0.8"; +const constexpr char* NPUW_SERIALIZATION_VERSION = "0.10"; // Forward declaration namespace intel_npu { diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 4a6206c3731b4e..5f4d82cda9436d 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -329,6 +329,7 @@ void Plugin::init_options() { REGISTER_OPTION(NPUW_LLM_BATCH_DIM); REGISTER_OPTION(NPUW_LLM_SEQ_LEN_DIM); REGISTER_OPTION(NPUW_LLM_MAX_PROMPT_LEN); + REGISTER_OPTION(NPUW_LLM_MAX_GENERATION_TOKEN_LEN); REGISTER_OPTION(NPUW_LLM_MIN_RESPONSE_LEN); REGISTER_OPTION(NPUW_LLM_OPTIMIZE_V_TENSORS); REGISTER_OPTION(NPUW_LLM_CACHE_ROPE); diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp index 507048ac090dcf..9dfc7200465931 100644 --- a/src/plugins/intel_npu/src/plugin/src/properties.cpp +++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp @@ -446,6 +446,7 @@ void Properties::registerPluginProperties() { TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_generation_len, NPUW_LLM_MAX_GENERATION_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT); From 5ad3d5a9dced25588f3af452afbc7e5f5c0fdcc2 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 4 Sep 2025 12:29:41 +0100 Subject: [PATCH 08/13] Fixed review comment and build issues --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 6 ++++-- src/plugins/intel_npu/src/plugin/src/properties.cpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 8a9e27b587bd8e..03e9a737de37b6 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -179,6 +179,10 @@ void pad_position_ids(const ov::SoPtr& padded_position_ids, const o OPENVINO_ASSERT(position_shape.size() <= 3); size_t diff_dim = position_shape.size() - 1; + for (size_t i = 0; i < diff_dim; ++i) { + OPENVINO_ASSERT(padded_shape[i] == position_shape[i]); + } + size_t keep_elements = padded_shape[diff_dim] - position_shape[diff_dim]; 
size_t batch_size = 1; @@ -841,13 +845,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // units of length of the current prompt on the right (for present // kv layers) and the set of "1" units of number of previously calculated // tokens on the left (for past kv layers). - auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); - auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); pad_position_ids(kv_pos_ids, position_ids); m_kvcache_request->infer(); diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp index 9dfc7200465931..73be75a56caec2 100644 --- a/src/plugins/intel_npu/src/plugin/src/properties.cpp +++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp @@ -446,7 +446,7 @@ void Properties::registerPluginProperties() { TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN); - TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_generation_len, NPUW_LLM_MAX_GENERATION_LEN); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_generation_token_len, NPUW_LLM_MAX_GENERATION_TOKEN_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT); From 56e4aaa880d433238ebface222f05ae7d4b48929 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Fri, 5 Sep 2025 13:03:31 +0100 Subject: [PATCH 09/13] Removed extra changes --- .../src/plugin/npuw/llm_infer_request.cpp | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 03e9a737de37b6..1c30dc95f09c3d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -814,28 +814,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // FIXME: these tensors should be shared between the parent & child models // NB: input_ids can be either fp32(VLM) or i64(LLM) auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)); - auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); - auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); - // NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which // input layers of generation model are resized to, then we need to put // `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors. - // We need to fill the the left unusable space with zeroes for attention mask, but - // better to do this for all tensors. 
- if (input_tokens_len < kvcache_desc.max_generation_token_len) { - std::fill_n(kv_input_ids->data() + kv_input_ids->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - std::fill_n(kv_pos_ids->data() + kv_pos_ids->get_size() - kvcache_desc.max_generation_token_len, - kvcache_desc.max_generation_token_len - input_tokens_len, - 0); - } - - // NOTE: Copying to the end to handle the case when `input_tokens_len` < - // `kvcache_desc.max_generation_token_len` + // Attention mask should rule out all left unusable space. std::copy_n( reinterpret_cast(input_ids->data()), input_ids->get_byte_size(), @@ -845,11 +827,18 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, // units of length of the current prompt on the right (for present // kv layers) and the set of "1" units of number of previously calculated // tokens on the left (for past kv layers). + auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)); std::copy_n(attention_mask->data(), attention_mask->get_size() - input_tokens_len, kv_attn_mask->data()); + if (input_tokens_len < kvcache_desc.max_generation_token_len) { + std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len, + kvcache_desc.max_generation_token_len - input_tokens_len, + 0); + } std::fill_n(kv_attn_mask->data() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1); + auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)); pad_position_ids(kv_pos_ids, position_ids); m_kvcache_request->infer(); From 1fed6a2e247623fdb08454f139c558589a2f62d8 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 5 Sep 2025 15:47:46 +0200 Subject: [PATCH 10/13] Applied review comments Co-authored-by: Dmitry Matveev --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 0b14900fccc94a..211bd639f99775 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -860,7 +860,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); uint32_t max_generation_token_len = m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(); if (max_generation_token_len != 1) { - max_generation_token_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_GENERATION_TOKEN_LEN>(), 8u); + max_generation_token_len = align_to(max_generation_token_len, 8u); } // If chunk size covers the entire prompt, just follow the static behavior. 
From 8493155775f4f8eb2424d9992ada561cf26a4246 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 5 Sep 2025 15:49:05 +0200 Subject: [PATCH 11/13] Update llm_compiled_model.cpp --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 211bd639f99775..cdf886efbbced0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -885,7 +885,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m LOG_VERB("Prefill chunk size: " << m_prefill_chunk_size); LOG_VERB("Maximum prompt length: " << max_prompt_len); - m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, max_generation_token_len}; + m_kvcache_desc = + KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, max_generation_token_len}; LOG_DEBUG("Make prefill model with static shapes"); m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>(); From d93bb29f7c9ebb873894c2163f6626912b69173f Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 5 Sep 2025 15:50:15 +0200 Subject: [PATCH 12/13] Update llm_infer_request.cpp --- src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp index 1c30dc95f09c3d..8861c9d419a918 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -604,7 +604,7 @@ void ov::npuw::LLMInferRequest::trim_kvcache_for_speculative_decoding(ov::SoPtr< auto dirty_num = kvcache_desc.num_stored_tokens - static_cast(position_id); if (dirty_num > 0) { LOG_DEBUG("Trim kv cache from " << kvcache_desc.num_stored_tokens << " length" - << " to " << position_id << " length"); + << " to " << position_id << " length"); } kvcache_desc.num_stored_tokens -= dirty_num; } From 71c85250611b09db6934e674f0a241641c119bc7 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 8 Sep 2025 16:33:01 +0200 Subject: [PATCH 13/13] Blob version fixed to 0.9 --- src/plugins/intel_npu/src/plugin/npuw/serialization.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp index 7a69bb2ce513cb..77e90c13f06c25 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp @@ -34,7 +34,7 @@ const constexpr ov::npuw::s11n::IndicatorType NPUW_COMPILED_MODEL_INDICATOR = const constexpr ov::npuw::s11n::IndicatorType NPUW_LLM_COMPILED_MODEL_INDICATOR = {char{0x4c}, char{0x4c}, char{0x4d}, char{0x43}, char{0x4d}, char{0x4f}}; -const constexpr char* NPUW_SERIALIZATION_VERSION = "0.10"; +const constexpr char* NPUW_SERIALIZATION_VERSION = "0.9"; // Forward declaration namespace intel_npu {