
Commit 25a6e69

More fixes for accuracy
1 parent 56a0a28 commit 25a6e69

File tree (6 files changed, +58 −45 lines):

src/cpp/src/continuous_batching/pipeline.cpp
src/cpp/src/llm/pipeline.cpp
src/cpp/src/llm/pipeline_stateful_npu.cpp
src/cpp/src/speculative_decoding/speculative_decoding_npu.cpp
src/cpp/src/utils.cpp
src/cpp/src/utils.hpp

src/cpp/src/continuous_batching/pipeline.cpp

Lines changed: 0 additions & 10 deletions

@@ -19,16 +19,6 @@
 using namespace ov::genai;
 
 namespace {
-ov::genai::ModelDesc
-extract_draft_model_from_config(ov::AnyMap& config) {
-    ov::genai::ModelDesc draft_model;
-    if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
-        draft_model = config.at(utils::DRAFT_MODEL_ARG_NAME).as<ov::genai::ModelDesc>();
-        config.erase(utils::DRAFT_MODEL_ARG_NAME);
-    }
-    return draft_model;
-}
-
 bool
 extract_prompt_lookup_from_config(ov::AnyMap& config) {
     bool res = false;

src/cpp/src/llm/pipeline.cpp

Lines changed: 12 additions & 15 deletions

@@ -80,12 +80,12 @@ ov::genai::LLMPipeline::LLMPipeline(
     auto start_time = std::chrono::steady_clock::now();
     auto [properties, attention_backend] = utils::extract_attention_backend(user_properties);
 
-    // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
-    if (utils::explicitly_requires_paged_attention(user_properties)) {
+    if (is_npu_requested(device, properties)) {
+        m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, tokenizer, properties);
+    } else if (utils::explicitly_requires_paged_attention(user_properties)) {
+        // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
         auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, device_properties);
-    } else if (device == "NPU") {
-        m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, tokenizer, properties);
     } else if (attention_backend == PA_BACKEND) {
         // try to call CB adapter one more time, but with safe guard to silent exception
         try {
@@ -115,13 +115,10 @@ ov::genai::LLMPipeline::LLMPipeline(
 
     auto [properties, attention_backend] = utils::extract_attention_backend(user_properties);
 
-    // First -> check draft model. for NPU leave it as is for the main model.
-    // if NPU
-    //    if draft model is on NPU
-    // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
-    if (device == "NPU") {
+    if (is_npu_requested(device, properties)) {
         m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, properties);
     } else if (utils::explicitly_requires_paged_attention(user_properties)) {
+        // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
         auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
 
@@ -157,17 +154,17 @@ ov::genai::LLMPipeline::LLMPipeline(
 
     auto [properties, attention_backend] = utils::extract_attention_backend(user_properties);
 
-    // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
-    if (utils::explicitly_requires_paged_attention(user_properties)) {
-        auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
-        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
-            tokenizer, scheduler_config, device, device_properties, generation_config);
-    } else if (device == "NPU") {
+    if (is_npu_requested(device, properties)) {
         m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(
             utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,
             properties,
             generation_config);
+    } else if (utils::explicitly_requires_paged_attention(user_properties)) {
+        // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
+        auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
+            tokenizer, scheduler_config, device, device_properties, generation_config);
     } else if (attention_backend == PA_BACKEND) {
         // try to call CB adapter one more time, but with safe guard to silent exception
         try {

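With this change, all three LLMPipeline constructors decide on NPU routing via is_npu_requested() before the paged-attention checks, so a pipeline whose draft model targets NPU is also dispatched to StatefulLLMPipelineNPU. A minimal construction sketch of such a pipeline follows; the model directory names are placeholders, and ov::genai::draft_model / ov::genai::max_new_tokens are the property helpers used in the OpenVINO GenAI samples:

// Minimal usage sketch (hypothetical model directories). With this commit,
// a draft model placed on "NPU" makes is_npu_requested() return true, so the
// NPU-specific stateful pipeline is selected instead of the CB adapter.
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::LLMPipeline pipe(
        "main_model_dir",                                   // hypothetical main model path
        "NPU",                                              // main device
        ov::genai::draft_model("draft_model_dir", "NPU"));  // hypothetical draft model on NPU

    std::cout << pipe.generate("Why is the Sun yellow?", ov::genai::max_new_tokens(64)) << std::endl;
    return 0;
}
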
src/cpp/src/llm/pipeline_stateful_npu.cpp

Lines changed: 0 additions & 12 deletions

@@ -14,18 +14,6 @@
 #include "openvino/core/parallel.hpp"
 #include "openvino/genai/text_streamer.hpp"
 
-namespace {
-ov::genai::ModelDesc
-extract_draft_model_from_config(ov::AnyMap& config) {
-    ov::genai::ModelDesc draft_model;
-    if (config.find(ov::genai::utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
-        draft_model = config.at(ov::genai::utils::DRAFT_MODEL_ARG_NAME).as<ov::genai::ModelDesc>();
-        config.erase(ov::genai::utils::DRAFT_MODEL_ARG_NAME);
-    }
-    return draft_model;
-}
-} // anonymous namespace
-
 namespace ov::genai {
 
 // NB: No constructor for creation of pipeline from infer request, as pipeline from infer request

src/cpp/src/speculative_decoding/speculative_decoding_npu.cpp

Lines changed: 20 additions & 7 deletions

@@ -509,6 +509,9 @@ EncodedResults SpeculativeLLMPipelineNPU::generate(
     */
     // Last generated token by draft model needs to be prepended before next run if it is accepted by the main model!
     // So it will get into context too.
+    // Remove debug lines.
+    // std::cout << std::endl << "Launching spec decode for " << config.get_max_new_tokens(prompt_len) << " max new tokens." << std::endl << std::endl;
+    // std::vector<std::pair<int,int>> accepted_tokens;
     int64_t draft_prefix_token = -1;
     while (m_main_request->can_infer() && (streaming_status == ov::genai::StreamingStatus::RUNNING)) {
         // Phase 1: Generation of candidates with the draft model:
@@ -534,7 +537,7 @@ EncodedResults SpeculativeLLMPipelineNPU::generate(
             candidate = m_draft_request->infer_next(candidate);
             candidates.push_back(candidate);
         }
-
+
         // Phase 2. Main inference.
         // For the main network, candidates_size + 1 tokens will be fed at once in a single infer request:
         // last token from previous main inference + all candidates from the draft stage
@@ -548,22 +551,23 @@ EncodedResults SpeculativeLLMPipelineNPU::generate(
         // that is generated based on subsequence [first token,...,`t`]
         // of the input prompt.
         // TODO: Handle OOM exception for static model here.
-        auto ref_out_tokens = m_main_request->infer_next_return_all(input_for_main);
+        auto ref_tokens = m_main_request->infer_next_return_all(input_for_main);
 
         // Phase 3. Check if main model produced the same tokens as input candidates:
         size_t accepted_tokens_number = 0u;
         // Last token is a new token from the main model, skip it:
-        for (size_t i = 0; i < ref_out_tokens.size() - 1; ++i) {
-            if (ref_out_tokens[i] != candidates[i]) {
+        for (size_t i = 0; i < ref_tokens.size() - 1; ++i) {
+            if (ref_tokens[i] != candidates[i]) {
                 break;
             }
             accepted_tokens_number++;
         }
 
+        // FIXME: Remove debug line
+        // accepted_tokens.push_back({accepted_tokens_number, candidates.size()});
         auto mismatched_candidates = candidates.size() - accepted_tokens_number;
-        std::vector<int64_t> validated_tokens(candidates.begin(), candidates.end() - mismatched_candidates);
-        out_token = ref_out_tokens.back();
-        validated_tokens.push_back(out_token);
+        std::vector<int64_t> validated_tokens(ref_tokens.begin(), ref_tokens.end() - mismatched_candidates);
+        out_token = validated_tokens.back();
 
         // Phase 4: Update inference wrappers based on found matches and mismatches
         // This is the case when main model accepted all candidates from draft model
@@ -573,6 +577,7 @@ EncodedResults SpeculativeLLMPipelineNPU::generate(
         } else {
            m_draft_request->trimm_kv_cache(mismatched_candidates - 1);
            m_main_request->trimm_kv_cache(mismatched_candidates);
+           draft_prefix_token = -1;
        }
 
        m_speculative_config.update_candidate_strategy(accepted_tokens_number);
@@ -587,6 +592,14 @@ EncodedResults SpeculativeLLMPipelineNPU::generate(
        streamer_ptr->end();
    }
 
+    // Remove debug lines
+    // std::cout << std::endl << std::endl << "Acceptance ratios for each iteration from total of " << accepted_tokens.size() << "." << std::endl;
+    // std::cout << "Format: n/m per iteration, `n` accepted tokens from `m` candidates." << std::endl;
+    // for (int i = 0; i < accepted_tokens.size(); ++i) {
+    //     std::cout << accepted_tokens[i].first << "/" << accepted_tokens[i].second << ", ";
+    // }
+    m_speculative_config.num_pred_tokens = 5;
+
     m_draft_request->reset_state();
     m_main_request->reset_state();
 

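The Phase 3 change above is the core accuracy fix: the validated prefix is now built from the main model's reference tokens (ref_tokens) rather than from the draft candidates, and out_token becomes the main model's token at the first divergence. A small self-contained sketch of that acceptance rule, using plain vectors in place of the pipeline's request wrappers:

// Standalone sketch of the Phase 3 acceptance rule (simplified; hypothetical
// token values, no OpenVINO types involved).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    std::vector<int64_t> candidates = {11, 22, 33, 44};      // draft-model proposals
    std::vector<int64_t> ref_tokens = {11, 22, 99, 55, 77};  // main-model outputs (candidates.size() + 1)

    // Count the longest matching prefix; the last reference token is always new, skip it.
    std::size_t accepted = 0;
    for (std::size_t i = 0; i < ref_tokens.size() - 1; ++i) {
        if (ref_tokens[i] != candidates[i]) {
            break;
        }
        ++accepted;
    }
    std::size_t mismatched = candidates.size() - accepted;

    // Keep the accepted prefix from the *main* model plus its first divergent token,
    // mirroring the new validated_tokens construction in this commit.
    std::vector<int64_t> validated(ref_tokens.begin(), ref_tokens.end() - mismatched);
    int64_t out_token = validated.back();

    std::cout << "accepted " << accepted << "/" << candidates.size()
              << ", next token " << out_token << "\n";  // prints: accepted 2/4, next token 99
    return 0;
}
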
src/cpp/src/utils.cpp

Lines changed: 22 additions & 1 deletion

@@ -101,7 +101,6 @@ inline bool is_paged_attention_available() {
     return false;
 #endif
 }
-
 } // anonymous
 
 namespace ov {
@@ -202,6 +201,28 @@ ProcessorConfig from_any_map(
     return extracted_config;
 }
 
+ov::genai::ModelDesc extract_draft_model_from_config(ov::AnyMap& config) {
+    ov::genai::ModelDesc draft_model;
+    if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
+        draft_model = config.at(utils::DRAFT_MODEL_ARG_NAME).as<ov::genai::ModelDesc>();
+        config.erase(utils::DRAFT_MODEL_ARG_NAME);
+    }
+    return draft_model;
+}
+
+bool is_npu_requested(const std::string& device, const ov::AnyMap& properties) {
+    if (device == "NPU") {
+        return true;
+    }
+
+    auto draft_model_descr = extract_draft_model_from_config(properties);
+    if (draft_model_descr.model != nullptr) {
+        return draft_model_descr.device == "NPU";
+    }
+
+    return false;
+}
+
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) {
     auto minuend_size = minuend.input_ids.get_size();
     auto subtrahend_size = subtrahend.input_ids.get_size();

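The new helper centralizes the routing rule: NPU is requested either through the device string or through a draft model whose target device is "NPU". A standalone illustration of that decision logic follows; plain std types stand in for ov::AnyMap and ov::genai::ModelDesc, and the "draft_model" key only mirrors utils::DRAFT_MODEL_ARG_NAME for the sake of the sketch:

// Simplified, self-contained illustration of the is_npu_requested() rule.
#include <iostream>
#include <map>
#include <string>

struct DraftDesc {
    std::string device;  // device the draft model should run on
};

bool is_npu_requested(const std::string& device,
                      const std::map<std::string, DraftDesc>& properties) {
    if (device == "NPU") {
        return true;  // main model explicitly targets NPU
    }
    auto it = properties.find("draft_model");  // stand-in for utils::DRAFT_MODEL_ARG_NAME
    if (it != properties.end()) {
        return it->second.device == "NPU";     // speculative decoding with an NPU draft
    }
    return false;
}

int main() {
    std::map<std::string, DraftDesc> props{{"draft_model", DraftDesc{"NPU"}}};
    std::cout << std::boolalpha
              << is_npu_requested("CPU", props) << "\n"  // true: draft model on NPU
              << is_npu_requested("GPU", {}) << "\n";    // false: no NPU anywhere
    return 0;
}
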
src/cpp/src/utils.hpp

Lines changed: 4 additions & 0 deletions

@@ -118,6 +118,10 @@ ProcessorConfig from_any_map(
     const ProcessorConfig& initial
 );
 
+ov::genai::ModelDesc extract_draft_model_from_config(ov::AnyMap& config);
+
+bool is_npu_requested(const std::string& device, const ov::AnyMap& properties);
+
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
 
 void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model);
