# Copyright (C) 2023-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


import pytest
import numpy as np
import logging

import openvino as ov
import openvino_genai as ov_genai

from utils.constants import get_default_llm_properties
from utils.hugging_face import generation_config_to_hf, download_and_convert_model, run_hugging_face
from utils.comparation import compare_generation_results
from utils.ov_genai_pipelines import create_ov_pipeline, generate_and_compare, get_main_pipeline_types, PipelineType, convert_decoded_results_to_generation_result

test_cases = [
    ('CPU', 'CPU'),
    ('CPU', 'NPUW:CPU'),
    ('NPUW:CPU', 'CPU'),
    ('NPUW:CPU', 'NPUW:CPU')
]
@pytest.mark.parametrize("main_device,draft_device", test_cases)
@pytest.mark.precommit
def test_string_inputs(main_device, draft_device):
    # FIXME: For now SmolLM2-135M is used as both the main and the draft model in this test.
    # It would be preferable to use SmolLM2-360M as the main model to simulate a realistic
    # speculative decoding setup, but the temporary directory created when downloading
    # SmolLM2-360M is not removed after the test run, so that model is not used for now.
    MODEL_UNDER_TEST = {
        "name": "HuggingFaceTB/SmolLM2-135M",
        "convert_args": ['--trust-remote-code']
    }
    prompt = "Alan Turing was a"

    # Download and convert model:
    main_opt_model, main_hf_tokenizer, main_model_path = download_and_convert_model(MODEL_UNDER_TEST["name"])
    draft_model_path = main_model_path

    # Create OpenVINO GenAI pipeline:
    draft_config = get_default_llm_properties()
    if draft_device == "NPUW:CPU":
        draft_device = "NPU"
        draft_config["NPUW_DEVICES"] = "CPU"
        draft_config["GENERATE_HINT"] = "BEST_PERF"
        # FIXME: Currently, using the same model as draft and main fails in NPUW_WEIGHTS_BANK: shared mode.
        # To work around this, the banks are named differently for the draft and the main model.
        draft_config["NPUW_WEIGHTS_BANK"] = "draft"
    ov_draft_model = ov_genai.draft_model(draft_model_path, draft_device, **draft_config)

    main_config = get_default_llm_properties()
    if main_device == "NPUW:CPU":
        main_device = "NPU"
        main_config["NPUW_DEVICES"] = "CPU"
        # FIXME: SmolLM2-135M with GENERATE_HINT: FAST_COMPILE outputs garbage on NPUW:CPU when
        # NPUW_LLM_MAX_GENERATION_TOKEN_LEN > 1.
        # GENERATE_HINT: BEST_PERF is set to work around the issue for now.
        main_config["GENERATE_HINT"] = "BEST_PERF"
        # FIXME: Currently, using the same model as draft and main fails in NPUW_WEIGHTS_BANK: shared mode.
        # To work around this, the banks are named differently for the draft and the main model.
        main_config["NPUW_WEIGHTS_BANK"] = "main"
    main_config["ATTENTION_BACKEND"] = "SDPA"
    ov_pipe = ov_genai.LLMPipeline(main_model_path, main_device, main_config, draft_model=ov_draft_model)

    # Run reference HF model:
    ov_generation_config = ov_genai.GenerationConfig(max_new_tokens=20)
    main_hf_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    ref_gen_results = run_hugging_face(main_opt_model, main_hf_tokenizer, [prompt], ov_generation_config)

    # Run OpenVINO GenAI pipeline:
    ov_decoded_results = ov_pipe.generate([prompt], ov_generation_config)
    ov_gen_results = convert_decoded_results_to_generation_result(ov_decoded_results, 1, 1, False)

    del ov_pipe

    # Compare results:
    compare_generation_results([prompt], ref_gen_results, ov_gen_results, ov_generation_config)

@pytest.mark.precommit
def test_perf_metrics():
    import time
    start_time = time.perf_counter()
    model_id = 'katuni4ka/tiny-random-gemma2'
    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
    _, _, model_path = download_and_convert_model(model_id)
    ov_pipe = create_ov_pipeline(model_path, pipeline_type=PipelineType.STATEFUL_SPECULATIVE_DECODING)
    prompt = 'table is made of'
    perf_metrics = ov_pipe.generate([prompt], generation_config).perf_metrics
    total_time = (time.perf_counter() - start_time) * 1000

    # Check that load time is adequate.
    load_time = perf_metrics.get_load_time()
    assert load_time > 0 and load_time < total_time

    # Check that the numbers of input and generated tokens are adequate.
    num_generated_tokens = perf_metrics.get_num_generated_tokens()
    assert num_generated_tokens > 0 and num_generated_tokens <= generation_config.max_new_tokens

    num_input_tokens = perf_metrics.get_num_input_tokens()
    assert num_input_tokens > 0 and num_input_tokens <= len(prompt)

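    # get_ttft() and the similar accessors return a mean/std pair; unpacking one is expected
    # to yield the same values as its .mean and .std fields.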
    mean_ttft, std_ttft = perf_metrics.get_ttft()
    assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
    assert mean_ttft > 0 and mean_ttft < 1000.0

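    # Raw durations are reported in microseconds, while the aggregated statistics (TTFT, TPOT, etc.)
    # are in milliseconds, hence the division by 1000 below.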
    raw_metrics = perf_metrics.raw_metrics
    durations = np.array(raw_metrics.m_durations) / 1000
    # Check that prefill is not included in the durations used for the TPOT calculation.
    # For a long prompt, prefill is slow, so TTFT is much larger than any single token generation duration.
    assert np.all(mean_ttft > durations)

    mean_tpot, std_tpot = perf_metrics.get_tpot()
    assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
    assert mean_tpot > 0 and mean_tpot < 1000.0

    mean_throughput, std_throughput = perf_metrics.get_throughput()
    assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std)
    assert mean_throughput > 0 and mean_throughput < 20000.0

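    # generate(), tokenization and detokenization each ran exactly once here, so their duration
    # statistics are computed from a single sample and the std is expected to be 0.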
    mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration()
    assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std)
    assert mean_gen_duration > 0 and load_time + mean_gen_duration < total_time
    assert std_gen_duration == 0

    mean_tok_duration, std_tok_duration = perf_metrics.get_tokenization_duration()
    assert (mean_tok_duration, std_tok_duration) == (perf_metrics.get_tokenization_duration().mean, perf_metrics.get_tokenization_duration().std)
    assert mean_tok_duration > 0 and mean_tok_duration < mean_gen_duration
    assert std_tok_duration == 0

    mean_detok_duration, std_detok_duration = perf_metrics.get_detokenization_duration()
    assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std)
    assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration
    assert std_detok_duration == 0

    # Check that statistics computed manually from the raw counters match the values reported by PerfMetrics.
    assert np.allclose(mean_tpot, np.mean(durations))
    assert np.allclose(std_tpot, np.std(durations), atol=0.00002)

    raw_dur = np.array(raw_metrics.generate_durations) / 1000
    assert np.allclose(mean_gen_duration, np.mean(raw_dur))
    assert np.allclose(std_gen_duration, np.std(raw_dur))

    raw_dur = np.array(raw_metrics.tokenization_durations) / 1000
    assert np.allclose(mean_tok_duration, np.mean(raw_dur))
    assert np.allclose(std_tok_duration, np.std(raw_dur))

    raw_dur = np.array(raw_metrics.detokenization_durations) / 1000
    assert np.allclose(mean_detok_duration, np.mean(raw_dur))
    assert np.allclose(std_detok_duration, np.std(raw_dur))

    assert len(raw_metrics.m_times_to_first_token) > 0
    assert len(raw_metrics.m_batch_sizes) > 0
    assert len(raw_metrics.m_durations) > 0

@pytest.mark.precommit
def test_extended_perf_metrics():
    import time
    start_time = time.perf_counter()
    model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    generation_config = ov_genai.GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
    _, _, model_path = download_and_convert_model(model_id)
    ov_pipe = create_ov_pipeline(model_path, pipeline_type=PipelineType.STATEFUL_SPECULATIVE_DECODING)
    extended_perf_metrics = ov_pipe.generate(["Why is the Sun yellow?"], generation_config).extended_perf_metrics
    total_time = (time.perf_counter() - start_time) * 1000

    assert extended_perf_metrics is not None
    assert extended_perf_metrics.main_model_metrics is not None
    assert extended_perf_metrics.draft_model_metrics is not None

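    # num_accepted_tokens counts the draft-proposed tokens that the main model accepted;
    # at least one acceptance is expected for this prompt.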
    assert extended_perf_metrics.get_num_accepted_tokens() > 0

    num_generated_tokens_main = extended_perf_metrics.main_model_metrics.get_num_generated_tokens()
    assert num_generated_tokens_main > 0 and num_generated_tokens_main <= generation_config.max_new_tokens

    num_generated_tokens_draft = extended_perf_metrics.draft_model_metrics.get_num_generated_tokens()
    # As the stateful speculative decoding pipeline dynamically adjusts the number of candidates at
    # each step, check that the number of draft-generated tokens is below the upper candidate limit
    # multiplied by the maximum number of generated tokens.
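    # With max_new_tokens=20 and num_assistant_tokens=5, this bound is (20 - 1) * 5 * 2 + 1 = 191.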
    assert num_generated_tokens_draft > 0 and \
        num_generated_tokens_draft < ((generation_config.max_new_tokens - 1) * \
                                      generation_config.num_assistant_tokens * 2 + 1)

    total_iteration_number_main = len(extended_perf_metrics.main_model_metrics.raw_metrics.m_durations)
    assert total_iteration_number_main > 0 and total_iteration_number_main <= generation_config.max_new_tokens

    total_iteration_number_draft = len(extended_perf_metrics.draft_model_metrics.raw_metrics.m_durations)
    assert total_iteration_number_draft > 0 and \
        total_iteration_number_draft < ((generation_config.max_new_tokens - 1) * \
                                        generation_config.num_assistant_tokens * 2 + 1)

    for model_metrics in [extended_perf_metrics.main_model_metrics, extended_perf_metrics.draft_model_metrics]:
        mean_ttst, std_ttst = model_metrics.get_ttst()
        assert (mean_ttst, std_ttst) == (model_metrics.get_ttst().mean, model_metrics.get_ttst().std)
        assert mean_ttst > 0 and mean_ttst < model_metrics.get_ttft().mean
        assert std_ttst == 0

        mean_latency, std_latency = model_metrics.get_latency()
        assert (mean_latency, std_latency) == (model_metrics.get_latency().mean, model_metrics.get_latency().std)
        assert mean_latency > 0 and mean_latency < 1000.0

        mean_gen_duration, std_gen_duration = model_metrics.get_generate_duration()
        assert (mean_gen_duration, std_gen_duration) == (model_metrics.get_generate_duration().mean, model_metrics.get_generate_duration().std)
        assert mean_gen_duration > 0 and mean_gen_duration < total_time
        assert std_gen_duration == 0