@@ -241,23 +241,24 @@ ov::Tensor LLMInferWrapper::infer_next_internal(const std::vector<int64_t> token
     // }
 
     auto input_ids = m_request.get_tensor("input_ids");
-    input_ids.set_shape({BATCH_SIZE, tokens_size});
-    std::copy_n(tokens.begin(), tokens_size, input_ids.data<int64_t>());
+    ov::Tensor new_input_ids(input_ids.get_element_type(), ov::Shape{BATCH_SIZE, tokens_size});
+    std::copy_n(tokens.begin(), tokens_size, new_input_ids.data<int64_t>());
+    m_request.set_tensor("input_ids", new_input_ids);
 
     // FIXME: For model with static shapes we can just copy after
     // the prefilled tokens, no reshape is needed.
     auto attention_mask = m_request.get_tensor("attention_mask");
-    std::vector<int64_t> attention_mask_copy(attention_mask.data<int64_t>(),
-                                             attention_mask.data<int64_t>() + m_num_processed_tokens);
-    attention_mask.set_shape({BATCH_SIZE, m_num_processed_tokens + tokens_size});
-    std::copy_n(attention_mask_copy.begin(), m_num_processed_tokens, attention_mask.data<int64_t>());
-    std::fill_n(attention_mask.data<int64_t>() + m_num_processed_tokens, tokens_size, 1);
+    ov::Tensor new_attention_mask(attention_mask.get_element_type(), ov::Shape{BATCH_SIZE, m_num_processed_tokens + tokens_size});
+    std::copy_n(attention_mask.data<int64_t>(), m_num_processed_tokens, new_attention_mask.data<int64_t>());
+    std::fill_n(new_attention_mask.data<int64_t>() + m_num_processed_tokens, tokens_size, 1);
+    m_request.set_tensor("attention_mask", new_attention_mask);
 
     auto position_ids = m_request.get_tensor("position_ids");
-    position_ids.set_shape({BATCH_SIZE, tokens_size});
-    std::iota(position_ids.data<int64_t>(),
-              position_ids.data<int64_t>() + position_ids.get_size(),
+    ov::Tensor new_position_ids(position_ids.get_element_type(), ov::Shape{BATCH_SIZE, tokens_size});
+    std::iota(new_position_ids.data<int64_t>(),
+              new_position_ids.data<int64_t>() + new_position_ids.get_size(),
               m_num_processed_tokens);
+    m_request.set_tensor("position_ids", new_position_ids);
 
     m_request.get_tensor("beam_idx").set_shape({BATCH_SIZE});
     m_request.get_tensor("beam_idx").data<int32_t>()[0] = 0;
@@ -284,8 +285,7 @@ void LLMInferWrapper::set_already_allocated_input_for_1_token() {
     m_request.set_tensor("position_ids", ov::Tensor(ov::element::i64, ov::Shape{1, 1}, reinterpret_cast<void*>(&m_new_position_id)));
 }
 
-// FIXME: It is wrong way to sample tokens, or right because of set output_seq_len in the sequence?
-// get_generated_ids will return all ids?
+// FIXME: Need to use Sampler correctly. Sampler does all the validation itself! Just needs to configure it correctly.
 std::variant<int64_t, std::vector<int64_t>>
 LLMInferWrapper::sample_tokens(const ov::Tensor& logits, std::size_t num_tokens_to_return) {
     OPENVINO_ASSERT(m_sequence_group, "sample_tokens() can be called only after infer_first()!");
@@ -298,7 +298,6 @@ std::variant<int64_t, std::vector<int64_t>>
         return sampled_tokens.back();
     } else {
         // FIXME condition can be switched to boolean?
-        OPENVINO_ASSERT(num_tokens_to_return == sampled_tokens.size());
         return sampled_tokens;
     }
 }
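`sample_tokens()` returns a `std::variant<int64_t, std::vector<int64_t>>`: a single id for the ordinary one-token step, or the whole accepted sequence when several tokens are validated at once. A small sketch of how a caller might unpack that result; the `append_token` sink is hypothetical:

    #include <cstdint>
    #include <variant>
    #include <vector>

    void append_token(int64_t token);  // hypothetical sink for generated ids

    // Illustrative only: consume the variant returned by sample_tokens().
    void consume(const std::variant<int64_t, std::vector<int64_t>>& sampled) {
        if (std::holds_alternative<int64_t>(sampled)) {
            append_token(std::get<int64_t>(sampled));  // single sampled token
        } else {
            for (int64_t token : std::get<std::vector<int64_t>>(sampled)) {
                append_token(token);                   // several accepted tokens at once
            }
        }
    }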
@@ -585,8 +584,8 @@ EncodedResults SpeculativeLLMPipelineNPU::generate(
         // For the main network, candidates_size + 1 tokens will be fed at once in a single infer request:
         // last token from previous main inference + all candidates from the draft stage
         // FIXME: How max_seq_length will be handled?
-        auto input_for_main = candidates;
-        input_for_main.insert(candidates.begin(), out_token);
+        std::vector<int64_t> input_for_main(candidates.begin(), candidates.end());
+        input_for_main.insert(input_for_main.begin(), {out_token});
         // TODO: Handle OOM exception for static model here.
         auto ref_out_tokens = m_main_request->infer_next_return_all(input_for_main);
 
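The new lines build `input_for_main` as the last main-model token followed by every draft candidate; the removed lines inserted through `candidates.begin()`, an iterator into a different container than the one being modified. A sketch of the same construction, assuming `out_token` is an `int64_t` and `candidates` a `std::vector<int64_t>`; the helper name is illustrative:

    #include <cstdint>
    #include <vector>

    // Illustrative helper: produce [out_token, candidate_0, ..., candidate_n-1] for the main request.
    std::vector<int64_t> make_main_input(int64_t out_token, const std::vector<int64_t>& candidates) {
        std::vector<int64_t> input;
        input.reserve(candidates.size() + 1);
        input.push_back(out_token);                                       // last accepted token from the main model
        input.insert(input.end(), candidates.begin(), candidates.end());  // then all draft candidates
        return input;
    }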