Commit 3cba904

Fixes to make pipe functional
1 parent a9dc0d0 commit 3cba904

2 files changed: +34 -21 lines changed

src/cpp/src/speculative_decoding/speculative_decoding_npu.cpp

Lines changed: 14 additions & 15 deletions
@@ -241,23 +241,24 @@ ov::Tensor LLMInferWrapper::infer_next_internal(const std::vector<int64_t> tokens)
     // }
 
     auto input_ids = m_request.get_tensor("input_ids");
-    input_ids.set_shape({BATCH_SIZE, tokens_size});
-    std::copy_n(tokens.begin(), tokens_size, input_ids.data<int64_t>());
+    ov::Tensor new_input_ids(input_ids.get_element_type(), ov::Shape{BATCH_SIZE, tokens_size});
+    std::copy_n(tokens.begin(), tokens_size, new_input_ids.data<int64_t>());
+    m_request.set_tensor("input_ids", new_input_ids);
 
     // FIXME: For model with static shapes we can just copy after
     // the prefilled tokens, no reshape is needed.
     auto attention_mask = m_request.get_tensor("attention_mask");
-    std::vector<int64_t> attention_mask_copy(attention_mask.data<int64_t>(),
-                                             attention_mask.data<int64_t>() + m_num_processed_tokens);
-    attention_mask.set_shape({BATCH_SIZE, m_num_processed_tokens + tokens_size});
-    std::copy_n(attention_mask_copy.begin(), m_num_processed_tokens, attention_mask.data<int64_t>());
-    std::fill_n(attention_mask.data<int64_t>() + m_num_processed_tokens, tokens_size, 1);
+    ov::Tensor new_attention_mask(attention_mask.get_element_type(), ov::Shape{BATCH_SIZE, m_num_processed_tokens + tokens_size});
+    std::copy_n(attention_mask.data<int64_t>(), m_num_processed_tokens, new_attention_mask.data<int64_t>());
+    std::fill_n(new_attention_mask.data<int64_t>() + m_num_processed_tokens, tokens_size, 1);
+    m_request.set_tensor("attention_mask", new_attention_mask);
 
     auto position_ids = m_request.get_tensor("position_ids");
-    position_ids.set_shape({BATCH_SIZE, tokens_size});
-    std::iota(position_ids.data<int64_t>(),
-              position_ids.data<int64_t>() + position_ids.get_size(),
+    ov::Tensor new_position_ids(position_ids.get_element_type(), ov::Shape{BATCH_SIZE, tokens_size});
+    std::iota(new_position_ids.data<int64_t>(),
+              new_position_ids.data<int64_t>() + new_position_ids.get_size(),
               m_num_processed_tokens);
+    m_request.set_tensor("position_ids", new_position_ids);
 
     m_request.get_tensor("beam_idx").set_shape({BATCH_SIZE});
     m_request.get_tensor("beam_idx").data<int32_t>()[0] = 0;
@@ -284,8 +285,7 @@ void LLMInferWrapper::set_already_allocated_input_for_1_token() {
     m_request.set_tensor("position_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&m_new_position_id)));
 }
 
-// FIXME: It is wrong way to sample tokens, or right because of set output_seq_len in the sequence?
-// get_generated_ids will return all ids?
+// FIXME: Need to use Sampler correctly. Sampler does all the validation itself! Just needs to configure it correctly.
 std::variant<int64_t, std::vector<int64_t>>
 LLMInferWrapper::sample_tokens(const ov::Tensor& logits, std::size_t num_tokens_to_return) {
     OPENVINO_ASSERT(m_sequence_group, "sample_tokens() can be called only after infer_first()!");
@@ -298,7 +298,6 @@ std::variant<int64_t, std::vector<int64_t>>
         return sampled_tokens.back();
     } else {
         // FIXME condition can be switched to boolean?
-        OPENVINO_ASSERT(num_tokens_to_return == sampled_tokens.size());
         return sampled_tokens;
     }
 }
@@ -585,8 +584,8 @@ EncodedResults SpeculativeLLMPipelineNPU::generate(
         // For the main network, candidates_size + 1 tokens will be fed at once in a single infer request:
         // last token from previous main inference + all candidates from the draft stage
         // FIXME: How max_seq_length will be handled?
-        auto input_for_main = candidates;
-        input_for_main.insert(candidates.begin(), out_token);
+        std::vector<int64_t> input_for_main(candidates.begin(), candidates.end());
+        input_for_main.insert(input_for_main.begin(), {out_token});
         // TODO: Handle OOM exception for static model here.
         auto ref_out_tokens = m_main_request->infer_next_return_all(input_for_main);
 
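The last hunk in this file fixes a real bug: the old code inserted out_token through candidates.begin(), an iterator into a different vector than the one being modified, which is undefined behavior for std::vector::insert and never prepended to input_for_main as intended. The new code builds input_for_main explicitly and inserts at its own begin(). An equivalent construction that avoids the front insertion entirely (illustration only, not the commit's code; the helper name is assumed):

    #include <cstdint>
    #include <vector>

    // Hypothetical helper: last main-model token first, then all draft candidates.
    std::vector<int64_t> build_main_input(int64_t out_token, const std::vector<int64_t>& candidates) {
        std::vector<int64_t> input_for_main;
        input_for_main.reserve(candidates.size() + 1);
        input_for_main.push_back(out_token);
        input_for_main.insert(input_for_main.end(), candidates.begin(), candidates.end());
        return input_for_main;
    }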
src/cpp/src/speculative_decoding/speculative_decoding_npu.hpp

Lines changed: 20 additions & 6 deletions
@@ -14,27 +14,44 @@ constexpr size_t BATCH_SIZE = 1;
 class LLMInferWrapper {
 public:
     LLMInferWrapper::LLMInferWrapper(const ov::genai::ModelDesc& model_desc);
+
     ov::genai::GenerationConfig get_generation_config() const;
+
     void set_generation_config(ov::genai::GenerationConfig config);
+
     int64_t infer_first(const ov::Tensor &input_ids,
                         const ov::Tensor &attention_mask,
                         const ov::Tensor &position_ids);
+
     bool can_infer();
+
     int64_t infer_next(const std::vector<int64_t> tokens);
+
     int64_t infer_next(int64_t out_token);
+
     std::vector<int64_t> infer_next_return_all(const std::vector<int64_t> tokens);
+
     ov::Tensor get_logits();
+
     std::size_t get_num_processed_tokens() const;
+
     ov::genai::GenerationHandle create_generation_handle();
+
     void remove_last_generated_tokens(const std::size_t tokens_to_remove);
+
     void trimm_kv_cache(const std::size_t tokens_to_remove);
+
     ov::genai::EncodedResults finalize();
+
     ov::genai::GenerationStatus get_generation_status() const;
+
     void reset_state();
 
 private:
     ov::Tensor infer_next_internal(const std::vector<int64_t> tokens);
+
     void set_already_allocated_input_for_1_token();
+
     std::variant<int64_t, std::vector<int64_t>> sample_tokens(
         const ov::Tensor& logits, std::size_t num_tokens_to_return);
 
@@ -59,6 +76,7 @@ class LLMInferWrapper {
     std::vector<int64_t> m_new_atten_mask_data;
 };
 
+// FIXME: Do we need this?
 struct SpeculativeConfig {
     void update_candidate_strategy(const size_t num_matches);
 
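For context on the FIXME above: update_candidate_strategy in speculative decoding implementations typically grows the number of draft candidates when all of them were accepted and shrinks it after a mismatch. A hedged sketch of one such heuristic (the struct, field names, and constants below are assumptions for illustration, not this repository's implementation):

    #include <algorithm>
    #include <cstddef>

    // Hypothetical adaptive candidate-count heuristic, not the repository's code.
    struct CandidateStrategySketch {
        std::size_t num_candidates = 5;
        std::size_t max_candidates = 10;

        void update_candidate_strategy(std::size_t num_matches) {
            if (num_matches == num_candidates) {
                // All draft tokens were accepted: speculate more aggressively next step.
                num_candidates = std::min(num_candidates + 2, max_candidates);
            } else {
                // Mismatch: fall back towards the number that was actually accepted.
                num_candidates = std::max<std::size_t>(num_matches, 1);
            }
        }
    };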
@@ -87,14 +105,10 @@ class SpeculativeLLMPipelineNPU : public ov::genai::LLMPipelineImplBase {
     ) override;
 
     void start_chat(const std::string& system_message) override;
+
     void finish_chat() override;
-    ~SpeculativeLLMPipelineNPU();
 
-private:
-    int64_t generate_next_token(const std::vector<int64_t> tokens);
-    std::vector<int64_t> generate_candidates(int64_t out_token);
-    void update_candidate_strategy(const size_t num_matches);
-    void update_kv_cache(const size_t seq_length);
+    ~SpeculativeLLMPipelineNPU();
 
 private:
     uint32_t m_max_prompt_len = 0u;
