Commit e72008c
Restrict StatefulSpeculativeLLMPipeline to launch only if NPU is specified for one or both of the models
1 parent ec10cb7 commit e72008c

File tree

6 files changed: +64 -41 lines changed


samples/cpp/text_generation/speculative_decoding_lm.cpp

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ int main(int argc, char* argv[]) try {
     // User can run main and draft model on different devices.
     // Please, set device for main model in `LLMPipeline` constructor and in `ov::genai::draft_model` for draft.
     // CPU, GPU and NPU can be used. Please be aware that GPU is performant only with Continuous Batching pipeline, so it is not recommended
-    // to use it in conjunction with NPU or in configuration when main model doesn't work in Paged Attention mode.
+    // to use it in conjunction with NPU.
     std::string main_device = "CPU", draft_device = "CPU";

     ov::genai::LLMPipeline pipe(

samples/python/text_generation/speculative_decoding_lm.py

Lines changed: 12 additions & 2 deletions

@@ -21,13 +21,13 @@ def main():
     # User can run main and draft model on different devices.
     # Please, set device for main model in `openvino_genai.LLMPipeline` constructor and in `openvino_genai.draft_model` for draft.
     # CPU, GPU and NPU can be used. Please be aware that GPU is performant only with Continuous Batching pipeline, so it is not
-    # recommended to use it in conjunction with NPU or in configuration when main model doesn't work in Paged Attention mode.
+    # recommended to use it in conjunction with NPU.
     main_device = 'CPU'
     draft_device = 'CPU'

     draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)

-    pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)
+    pipe = openvino_genai.LLMPipeline(args.model_dir, "CPU", draft_model=draft_model)

     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100
@@ -69,5 +69,15 @@ def main():
     print(f"  Total iteration number: {len(draft_model_metrics.raw_metrics.m_durations)}")
     print()

+    print(f"DRAFT MODEL")
+    print(f"  Generate time: {draft_model_metrics.get_generate_duration().mean:.2f} ms")
+    print(f"  TTFT: {draft_model_metrics.get_ttft().mean:.2f} ms")
+    print(f"  TTST: {draft_model_metrics.get_ttst().mean:.2f} ms/token")
+    print(f"  TPOT: {draft_model_metrics.get_tpot().mean:.2f} ± {draft_model_metrics.get_tpot().std:.2f} ms/token")
+    print(f"  AVG Latency: {draft_model_metrics.get_latency().mean:.2f} ± {draft_model_metrics.get_latency().std:.2f} ms/iteration")
+    print(f"  Num generated token: {draft_model_metrics.get_num_generated_tokens()} tokens")
+    print(f"  Total iteration number: {len(draft_model_metrics.raw_metrics.m_durations)}")
+    print()
+
 if '__main__' == __name__:
     main()
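
Editor's note: the added block mirrors the existing metrics report for the draft model. As a reference for the getters it relies on, here is a minimal, hedged sketch of reading the same PerfMetrics fields from a generation result; the model directory and prompt are placeholders, and the `perf_metrics` accessor is the one the tests below already use.

import openvino_genai

# Placeholder model directory and prompt; only the PerfMetrics getters come from the sample above.
pipe = openvino_genai.LLMPipeline("model_dir", "CPU")

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

result = pipe.generate(["The Sun is yellow because"], config)
metrics = result.perf_metrics
print(f"Generate time: {metrics.get_generate_duration().mean:.2f} ms")
print(f"TTFT: {metrics.get_ttft().mean:.2f} ms")
print(f"TPOT: {metrics.get_tpot().mean:.2f} ± {metrics.get_tpot().std:.2f} ms/token")
print(f"Num generated tokens: {metrics.get_num_generated_tokens()}")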

src/cpp/src/llm/pipeline.cpp

Lines changed: 13 additions & 7 deletions

@@ -92,9 +92,10 @@ static std::unique_ptr<LLMPipelineImplBase> create(
     auto properties_without_draft_model = properties;
     auto draft_model_descr = ov::genai::utils::extract_draft_model_from_config(properties_without_draft_model);
     if (draft_model_descr.model != nullptr) {
-        OPENVINO_ASSERT(device != "GPU" && draft_model_descr.device != "GPU",
-                        "Speculative Decoding with \"ATTENTION_BACKEND\" : \"SDPA\" or any of the models on NPU "
-                        "doesn't support GPU device either for main or draft models currently!");
+        // FIXME: Add support for StatefulSpeculativeLLMPipeline for non-NPU devices for both models.
+        OPENVINO_ASSERT(device == "NPU" || draft_model_descr.device == "NPU",
+                        "Stateful Speculative Decoding is expected to be launched when NPU is requested as "
+                        "execution device for one or both models.");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, {}, generation_config);
         return std::make_unique<StatefulSpeculativeLLMPipeline>(main_model_descr, draft_model_descr);
     }
@@ -144,7 +145,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     }

     if (m_pimpl == nullptr) {
-        m_pimpl = StatefulPipeline::create(models_path, tokenizer, device, properties);
+        // FIXME: Switch to StatefulPipeline::create after resolving issues
+        // with GPU and CPU for StatefulSpeculativeLLMPipeline
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }

     m_pimpl->save_load_time(start_time);
@@ -158,7 +161,6 @@ ov::genai::LLMPipeline::LLMPipeline(
     auto start_time = std::chrono::steady_clock::now();

     auto [properties, attention_backend] = utils::extract_attention_backend(user_properties);
-
     if (ov::genai::utils::is_npu_requested(device, properties)) {
         m_pimpl = StatefulPipeline::create(models_path, device, properties);
     } else if (utils::explicitly_requires_paged_attention(user_properties)) {
@@ -179,7 +181,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     }

     if (m_pimpl == nullptr) {
-        m_pimpl = StatefulPipeline::create(models_path, device, properties);
+        // FIXME: Switch to StatefulPipeline::create after resolving issues
+        // with GPU and CPU for StatefulSpeculativeLLMPipeline
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
     }

     m_pimpl->save_load_time(start_time);
@@ -224,7 +228,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     }

     if (m_pimpl == nullptr) {
-        m_pimpl = StatefulPipeline::create(
+        // FIXME: Switch to StatefulPipeline::create after resolving issues
+        // with GPU and CPU for StatefulSpeculativeLLMPipeline
+        m_pimpl = std::make_unique<StatefulLLMPipeline>(
             utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,
             device,
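
Editor's note: in user-facing terms, attaching a draft model and requesting NPU for at least one of the two models is now the configuration that reaches StatefulSpeculativeLLMPipeline, while a draft model with no NPU goes through the Paged-Attention-based path checked in utils.cpp below. A hedged Python sketch with placeholder model directories (not taken from the repository's samples):

import openvino_genai as ov_genai

# Placeholder directories for exported main/draft models.
main_dir, draft_dir = "main_model_dir", "draft_model_dir"

# NPU requested for the draft (or the main) model: handled by StatefulSpeculativeLLMPipeline,
# per the assert added above; the tests below exercise exactly these device combinations.
draft_on_npu = ov_genai.draft_model(draft_dir, "NPU")
pipe_npu = ov_genai.LLMPipeline(main_dir, "CPU", draft_model=draft_on_npu)

# No NPU anywhere: speculative decoding goes to the Paged-Attention-based pipeline,
# or throws on platforms where PagedAttention is unavailable.
draft_on_cpu = ov_genai.draft_model(draft_dir, "CPU")
pipe_cpu = ov_genai.LLMPipeline(main_dir, "CPU", draft_model=draft_on_cpu)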

src/cpp/src/utils.cpp

Lines changed: 8 additions & 0 deletions

@@ -643,6 +643,14 @@ bool explicitly_requires_paged_attention(const ov::AnyMap& properties) {
         }
     }

+    if (properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end()) {
+        if (is_paged_attention_available()) {
+            return true;
+        } else {
+            OPENVINO_THROW("Speculative decoding on non-NPU devices requires PagedAttention operation support, which is available on x86_64 or ARM64 platforms only");
+        }
+    }
+
     auto prompt_lookup_prop = properties.find("prompt_lookup");
     if (prompt_lookup_prop != properties.end() && prompt_lookup_prop->second.as<bool>() == true) {
         if (is_paged_attention_available()) {
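
Editor's note: taken together with the assert in pipeline.cpp, the selection rules can be summarized in a simplified sketch (Python pseudocode for illustration only; the real dispatch lives in the C++ shown in these two files and consults more properties than listed here):

def pick_pipeline(main_device, draft_device, paged_attention_available):
    # draft_device is None when no draft model was supplied.
    if draft_device is not None:
        if main_device == "NPU" or draft_device == "NPU":
            # New restriction introduced by this commit.
            return "StatefulSpeculativeLLMPipeline"
        if paged_attention_available:
            return "Paged-Attention (Continuous Batching) speculative decoding"
        raise RuntimeError(
            "Speculative decoding on non-NPU devices requires PagedAttention operation "
            "support, which is available on x86_64 or ARM64 platforms only")
    return "non-speculative pipeline (stateful or continuous batching)"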

tests/python_tests/test_stateful_speculative_decoding.py

Lines changed: 27 additions & 19 deletions

@@ -14,13 +14,19 @@
 from utils.comparation import compare_generation_results
 from utils.ov_genai_pipelines import create_ov_pipeline, generate_and_compare, get_main_pipeline_types, PipelineType, convert_decoded_results_to_generation_result

+def get_npu_llm_properties_for_test():
+    config = get_default_llm_properties()
+    config["NPUW_DEVICES"] = "CPU"
+    config["GENERATE_HINT"] = "BEST_PERF"
+    return config
+
 models_and_input = [
     ("HuggingFaceTB/SmolLM2-360M", "HuggingFaceTB/SmolLM2-135M", "Alan Turing was a")]
 devices = [
-    ('CPU', 'CPU'),
-    ('CPU', 'NPUW:CPU'),
-    ('NPUW:CPU', 'CPU'),
-    ('NPUW:CPU', 'NPUW:CPU')
+    # FIXME: add 'CPU' and 'GPU' cases in future
+    ('CPU', 'NPU'),
+    ('NPU', 'CPU'),
+    ('NPU', 'NPU')
 ]
 @pytest.mark.parametrize("main_model,draft_model,prompt", models_and_input)
 @pytest.mark.parametrize("main_device,draft_device", devices)
@@ -31,19 +37,14 @@ def test_string_inputs(main_model, main_device, draft_model, draft_device, promp
     __, __, draft_model_path = download_and_convert_model(draft_model)

     # Create OpenVINO GenAI pipeline:
-    draft_config = get_default_llm_properties()
-    if draft_device == "NPUW:CPU":
-        draft_device = "NPU"
-        draft_config["NPUW_DEVICES"] = "CPU"
-        draft_config["GENERATE_HINT"] = "BEST_PERF"
+    draft_config = get_npu_llm_properties_for_test() \
+        if (draft_device == "NPU") else \
+        get_default_llm_properties()
     ov_draft_model = ov_genai.draft_model(draft_model_path, draft_device, **draft_config)

-    main_config = get_default_llm_properties()
-    if main_device == "NPUW:CPU":
-        main_device = "NPU"
-        main_config["NPUW_DEVICES"] = "CPU"
-        main_config["GENERATE_HINT"] = "BEST_PERF"
-        main_config["ATTENTION_BACKEND"] = "SDPA"
+    main_config = get_npu_llm_properties_for_test() \
+        if (main_device == "NPU") else \
+        get_default_llm_properties()
     ov_pipe = ov_genai.LLMPipeline(main_model_path, main_device, main_config, draft_model=ov_draft_model)

     # Run reference HF model:
@@ -65,10 +66,14 @@ def test_perf_metrics():
     import time
     start_time = time.perf_counter()
     model_id = 'katuni4ka/tiny-random-gemma2'
-    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
     _, _, model_path = download_and_convert_model(model_id)
-    ov_pipe = create_ov_pipeline(model_path, pipeline_type=PipelineType.STATEFUL_SPECULATIVE_DECODING)
+
+    # Create OpenVINO GenAI pipeline:
+    ov_draft_model = ov_genai.draft_model(model_path, "NPU", **get_npu_llm_properties_for_test())
+    ov_pipe = ov_genai.LLMPipeline(model_path, "NPU", get_npu_llm_properties_for_test(), draft_model=ov_draft_model)
+
     prompt = 'table is made of'
+    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
     perf_metrics = ov_pipe.generate([prompt], generation_config).perf_metrics
     total_time = (time.perf_counter() - start_time) * 1000
@@ -141,9 +146,12 @@ def test_extended_perf_metrics():
     import time
     start_time = time.perf_counter()
     model_id : str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
     _, _, model_path = download_and_convert_model(model_id)
-    ov_pipe = create_ov_pipeline(model_path, pipeline_type=PipelineType.STATEFUL_SPECULATIVE_DECODING)
+
+    ov_draft_model = ov_genai.draft_model(model_path, "NPU", **get_npu_llm_properties_for_test())
+    ov_pipe = ov_genai.LLMPipeline(model_path, "NPU", get_npu_llm_properties_for_test(), draft_model=ov_draft_model)
+
+    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
     extended_perf_metrics = ov_pipe.generate(["Why is the Sun yellow?"], generation_config).extended_perf_metrics
     total_time = (time.perf_counter() - start_time) * 1000
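
Editor's note: the get_npu_llm_properties_for_test helper lets these tests target the NPU plugin on machines without NPU hardware by redirecting NPUW to CPU. Below is a minimal sketch of the same configuration outside the test harness, with a placeholder model directory and only the NPU-specific properties (the tests additionally merge in get_default_llm_properties()):

import openvino_genai as ov_genai

# CPU-backed NPUW, as the tests use for CI machines without an NPU.
npu_props = {"NPUW_DEVICES": "CPU", "GENERATE_HINT": "BEST_PERF"}

model_path = "tiny-random-gemma2"  # placeholder converted-model directory
ov_draft_model = ov_genai.draft_model(model_path, "NPU", **npu_props)
ov_pipe = ov_genai.LLMPipeline(model_path, "NPU", npu_props, draft_model=ov_draft_model)

generation_config = ov_genai.GenerationConfig(max_new_tokens=20, num_assistant_tokens=5)
print(ov_pipe.generate(["table is made of"], generation_config))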

tests/python_tests/utils/ov_genai_pipelines.py

Lines changed: 3 additions & 12 deletions

@@ -41,15 +41,13 @@ class PipelineType(Enum):
     PAGED_ATTENTION = 2
     CONTINUOUS_BATCHING = 3
     SPECULATIVE_DECODING = 4
-    STATEFUL_SPECULATIVE_DECODING = 5
-    PROMPT_LOOKUP_DECODING = 6
-    AUTO = 7
+    PROMPT_LOOKUP_DECODING = 5
+    AUTO = 6


 def get_all_pipeline_types():
-    return [PipelineType.STATEFUL, PipelineType.PAGED_ATTENTION, PipelineType.CONTINUOUS_BATCHING, PipelineType.SPECULATIVE_DECODING, PipelineType.STATEFUL_SPECULATIVE_DECODING, PipelineType.PROMPT_LOOKUP_DECODING, PipelineType.AUTO]
+    return [PipelineType.STATEFUL, PipelineType.PAGED_ATTENTION, PipelineType.CONTINUOUS_BATCHING, PipelineType.SPECULATIVE_DECODING, PipelineType.PROMPT_LOOKUP_DECODING, PipelineType.AUTO]

-# TODO: Add PipelineType.STATEFUL_SPECULATIVE_DECODING, make its tests green.
 def get_main_pipeline_types():
     return [PipelineType.STATEFUL, PipelineType.PAGED_ATTENTION, PipelineType.SPECULATIVE_DECODING, PipelineType.PROMPT_LOOKUP_DECODING]

@@ -99,10 +97,6 @@ def create_ov_pipeline(models_path: Path,
     elif pipeline_type == PipelineType.SPECULATIVE_DECODING:
         ov_draft_model = draft_model(models_path) if draft_model_path is None else draft_model(draft_model_path)
         return LLMPipeline(models_path, device, ov_config, scheduler_config=scheduler_config, draft_model=ov_draft_model)
-    elif pipeline_type == PipelineType.STATEFUL_SPECULATIVE_DECODING:
-        ov_draft_model = draft_model(models_path) if draft_model_path is None else draft_model(draft_model_path)
-        ov_config["ATTENTION_BACKEND"] = "SDPA"
-        return LLMPipeline(models_path, device, ov_config, draft_model=ov_draft_model)
     elif pipeline_type == PipelineType.PROMPT_LOOKUP_DECODING:
         return LLMPipeline(models_path, device, ov_config, scheduler_config=scheduler_config, prompt_lookup=True)
     else:
@@ -133,9 +127,6 @@ def prepare_generation_config_by_pipe_type(generation_config : GenerationConfig,
     if pipeline_type == PipelineType.SPECULATIVE_DECODING:
         assert not generation_config.is_beam_search()
         generation_config.assistant_confidence_threshold = 0.9
-    elif pipeline_type == PipelineType.STATEFUL_SPECULATIVE_DECODING:
-        assert not generation_config.is_beam_search()
-        generation_config.num_assistant_tokens = 5
    elif pipeline_type == PipelineType.PROMPT_LOOKUP_DECODING:
         assert not generation_config.is_beam_search()
         generation_config.num_assistant_tokens = 5
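
Editor's note: with the STATEFUL_SPECULATIVE_DECODING branch removed, tests that need this pipeline build it directly, as test_perf_metrics and test_extended_perf_metrics above now do. A hypothetical helper capturing that pattern (not part of ov_genai_pipelines.py) might look like:

from pathlib import Path
from openvino_genai import LLMPipeline, draft_model

def create_stateful_speculative_pipeline(models_path: Path, draft_model_path=None, ov_config=None):
    # Hypothetical replacement for the removed create_ov_pipeline branch:
    # NPU must be requested for at least one model, so both are placed on NPU here.
    ov_config = dict(ov_config or {})
    draft_path = models_path if draft_model_path is None else draft_model_path
    ov_draft_model = draft_model(draft_path, "NPU", **ov_config)
    return LLMPipeline(models_path, "NPU", ov_config, draft_model=ov_draft_model)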
