from utils.comparation import compare_generation_results
from utils.ov_genai_pipelines import create_ov_pipeline, generate_and_compare, get_main_pipeline_types, PipelineType, convert_decoded_results_to_generation_result

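+# Note: device "NPU" below runs through NPUW with NPUW_DEVICES="CPU", i.e. execution is
+# offloaded to CPU, so no NPU hardware is required; GENERATE_HINT="BEST_PERF" favors
+# generation performance over compile time.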
+def get_npu_llm_properties_for_test():
+    config = get_default_llm_properties()
+    config["NPUW_DEVICES"] = "CPU"
+    config["GENERATE_HINT"] = "BEST_PERF"
+    return config
+
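+# Each models_and_input entry is (main_model, draft_model, prompt).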
models_and_input = [
    ("HuggingFaceTB/SmolLM2-360M", "HuggingFaceTB/SmolLM2-135M", "Alan Turing was a")]
devices = [
-    ('CPU', 'CPU'),
-    ('CPU', 'NPUW:CPU'),
-    ('NPUW:CPU', 'CPU'),
-    ('NPUW:CPU', 'NPUW:CPU')
+    # FIXME: add 'CPU' and 'GPU' cases in the future
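+    # Each tuple is (main_device, draft_device), matching the parametrize order below.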
+    ('CPU', 'NPU'),
+    ('NPU', 'CPU'),
+    ('NPU', 'NPU')
]
@pytest.mark.parametrize("main_model,draft_model,prompt", models_and_input)
@pytest.mark.parametrize("main_device,draft_device", devices)
@@ -31,19 +37,14 @@ def test_string_inputs(main_model, main_device, draft_model, draft_device, promp
    __, __, draft_model_path = download_and_convert_model(draft_model)

    # Create OpenVINO GenAI pipeline:
-    draft_config = get_default_llm_properties()
-    if draft_device == "NPUW:CPU":
-        draft_device = "NPU"
-        draft_config["NPUW_DEVICES"] = "CPU"
-        draft_config["GENERATE_HINT"] = "BEST_PERF"
+    draft_config = get_npu_llm_properties_for_test() \
+        if (draft_device == "NPU") else \
+        get_default_llm_properties()
    ov_draft_model = ov_genai.draft_model(draft_model_path, draft_device, **draft_config)

-    main_config = get_default_llm_properties()
-    if main_device == "NPUW:CPU":
-        main_device = "NPU"
-        main_config["NPUW_DEVICES"] = "CPU"
-        main_config["GENERATE_HINT"] = "BEST_PERF"
-        main_config["ATTENTION_BACKEND"] = "SDPA"
+    main_config = get_npu_llm_properties_for_test() \
+        if (main_device == "NPU") else \
+        get_default_llm_properties()
    ov_pipe = ov_genai.LLMPipeline(main_model_path, main_device, main_config, draft_model=ov_draft_model)

    # Run reference HF model:
@@ -65,10 +66,14 @@ def test_perf_metrics():
    import time
    start_time = time.perf_counter()
    model_id = 'katuni4ka/tiny-random-gemma2'
-    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
    _, _, model_path = download_and_convert_model(model_id)
-    ov_pipe = create_ov_pipeline(model_path, pipeline_type=PipelineType.STATEFUL_SPECULATIVE_DECODING)
+
+    # Create OpenVINO GenAI pipeline:
+    ov_draft_model = ov_genai.draft_model(model_path, "NPU", **get_npu_llm_properties_for_test())
+    ov_pipe = ov_genai.LLMPipeline(model_path, "NPU", get_npu_llm_properties_for_test(), draft_model=ov_draft_model)
+
    prompt = 'table is made of'
+    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
    perf_metrics = ov_pipe.generate([prompt], generation_config).perf_metrics
    total_time = (time.perf_counter() - start_time) * 1000
@@ -141,9 +146,12 @@ def test_extended_perf_metrics():
    import time
    start_time = time.perf_counter()
    model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
    _, _, model_path = download_and_convert_model(model_id)
-    ov_pipe = create_ov_pipeline(model_path, pipeline_type=PipelineType.STATEFUL_SPECULATIVE_DECODING)
+
+    ov_draft_model = ov_genai.draft_model(model_path, "NPU", **get_npu_llm_properties_for_test())
+    ov_pipe = ov_genai.LLMPipeline(model_path, "NPU", get_npu_llm_properties_for_test(), draft_model=ov_draft_model)
+
+    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
    extended_perf_metrics = ov_pipe.generate(["Why is the Sun yellow?"], generation_config).extended_perf_metrics
    total_time = (time.perf_counter() - start_time) * 1000