Skip to content

Commit a9dc0d0

Browse files
committed
Initial version
1 parent 2affe91 commit a9dc0d0

File tree

7 files changed

+955
-40
lines changed

7 files changed

+955
-40
lines changed

src/cpp/src/llm/pipeline.cpp

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
#include "openvino/genai/llm_pipeline.hpp"
1010
#include "openvino/genai/perf_metrics.hpp"
1111

12-
#include "llm/pipeline_static.hpp"
1312
#include "llm/pipeline_stateful.hpp"
1413
#include "llm/pipeline_continuous_batching_adapter.hpp"
1514
#include "speculative_decoding/speculative_decoding_impl.hpp"
15+
#include "llm/pipeline_stateful_npu.hpp"
1616
#include "utils.hpp"
1717

1818
namespace ov {
@@ -85,9 +85,7 @@ ov::genai::LLMPipeline::LLMPipeline(
8585
auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
8686
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, device_properties);
8787
} else if (device == "NPU") {
88-
m_pimpl = properties.count("STATIC_PIPELINE")
89-
? static_llm::LLMPipelineFactory::create(models_path, tokenizer, properties)
90-
: std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
88+
m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, tokenizer, properties);
9189
} else if (attention_backend == PA_BACKEND) {
9290
// try to call CB adapter one more time, but with safe guard to silent exception
9391
try {
@@ -122,9 +120,7 @@ ov::genai::LLMPipeline::LLMPipeline(
122120
auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
123121
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
124122
} else if (device == "NPU") {
125-
m_pimpl = properties.count("STATIC_PIPELINE")
126-
? static_llm::LLMPipelineFactory::create(models_path, properties)
127-
: std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
123+
m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, properties);
128124
} else if (attention_backend == PA_BACKEND) {
129125
// try to call CB adapter one more time, but with safe guard to silent exception
130126
try {
@@ -163,16 +159,9 @@ ov::genai::LLMPipeline::LLMPipeline(
163159
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
164160
tokenizer, scheduler_config, device, device_properties, generation_config);
165161
} else if (device == "NPU") {
166-
m_pimpl = properties.count("STATIC_PIPELINE")
167-
? static_llm::LLMPipelineFactory::create(
168-
utils::singleton_core().read_model(model_str, weights_tensor),
169-
tokenizer,
170-
properties,
171-
generation_config)
172-
: std::make_unique<StatefulLLMPipeline>(
162+
m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(
173163
utils::singleton_core().read_model(model_str, weights_tensor),
174164
tokenizer,
175-
device,
176165
properties,
177166
generation_config);
178167
} else if (attention_backend == PA_BACKEND) {
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
2+
// Copyright (C) 2025 Intel Corporation
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include "pipeline_stateful_npu.hpp"
6+
#include "speculative_decoding/speculative_decoding_npu.hpp"
7+
#include "llm/pipeline_stateful.hpp"
8+
#include "llm/pipeline_static.hpp"
9+
#include "utils.hpp"
10+
11+
#include <fstream>
12+
13+
#include "openvino/runtime/core.hpp"
14+
#include "openvino/core/parallel.hpp"
15+
#include "openvino/genai/text_streamer.hpp"
16+
17+
namespace {
18+
ov::genai::ModelDesc
19+
extract_draft_model_from_config(ov::AnyMap& config) {
20+
ov::genai::ModelDesc draft_model;
21+
if (config.find(ov::genai::utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
22+
draft_model = config.at(ov::genai::utils::DRAFT_MODEL_ARG_NAME).as<ov::genai::ModelDesc>();
23+
config.erase(ov::genai::utils::DRAFT_MODEL_ARG_NAME);
24+
}
25+
return draft_model;
26+
}
27+
} // anonymous namespace
28+
29+
namespace ov::genai {
30+
31+
// NB: No constructor for creation of pipeline from infer request, as pipeline from infer request
32+
// for NPU is handled inside of ov::genai::StatefulLLMPipeline class iself.
33+
StatefulLLMPipelineNPU::StatefulLLMPipelineNPU(
34+
const std::filesystem::path& models_path,
35+
const ov::genai::Tokenizer& tokenizer,
36+
const ov::AnyMap& properties)
37+
: StatefulLLMPipelineNPU(
38+
utils::read_model(models_path, properties),
39+
tokenizer,
40+
properties,
41+
utils::from_config_json_if_exists(models_path)
42+
) {}
43+
44+
/// Convenience overload: constructs the Tokenizer from the same model directory,
/// then delegates to the (path, tokenizer, properties) constructor.
StatefulLLMPipelineNPU::StatefulLLMPipelineNPU(
    const std::filesystem::path& models_path,
    const ov::AnyMap& plugin_config)
    : StatefulLLMPipelineNPU(models_path,
                             Tokenizer(models_path, plugin_config),
                             plugin_config) {}
48+
49+
/// Main constructor: selects the concrete NPU implementation.
///
/// Selection order:
///   1. A draft model in @p properties -> speculative decoding pipeline;
///   2. the "STATIC_PIPELINE" property   -> static pipeline factory;
///   3. otherwise                        -> the regular stateful pipeline on "NPU".
StatefulLLMPipelineNPU::StatefulLLMPipelineNPU(
    const std::shared_ptr<ov::Model>& model,
    const ov::genai::Tokenizer& tokenizer,
    const ov::AnyMap& properties,
    const ov::genai::GenerationConfig& generation_config)
    : LLMPipelineImplBase(tokenizer, generation_config) {
    // The draft-model entry must not reach the underlying pipeline's plugin
    // properties, so strip it out of a local copy first.
    auto main_properties = properties;
    auto draft_descr = extract_draft_model_from_config(main_properties);
    if (draft_descr.model) {
        // A draft model was supplied -> run speculative decoding on NPU.
        auto main_descr = ov::genai::ModelDesc(model, tokenizer, "NPU", main_properties, {}, generation_config);
        m_pimpl = std::make_unique<SpeculativeLLMPipelineNPU>(main_descr, draft_descr);
        return;
    }
    if (main_properties.count("STATIC_PIPELINE")) {
        // Caller explicitly requested the static (ahead-of-time shaped) pipeline.
        m_pimpl = static_llm::LLMPipelineFactory::create(model, tokenizer,
                                                         main_properties, generation_config);
        return;
    }
    m_pimpl = std::make_unique<StatefulLLMPipeline>(model, tokenizer, "NPU",
                                                    main_properties, generation_config);
}
68+
69+
/// Text-in/text-out generation: forwards to the concrete pipeline selected at
/// construction time.
DecodedResults StatefulLLMPipelineNPU::generate(StringInputs inputs,
                                                OptionalGenerationConfig generation_config,
                                                StreamerVariant streamer) {
    return m_pimpl->generate(inputs, generation_config, streamer);
}
75+
76+
/// Token-in/token-out generation: forwards to the concrete pipeline selected at
/// construction time.
EncodedResults StatefulLLMPipelineNPU::generate(const EncodedInputs& inputs,
                                                OptionalGenerationConfig generation_config,
                                                StreamerVariant streamer) {
    return m_pimpl->generate(inputs, generation_config, streamer);
}
82+
83+
/// Starts a chat session (with an optional system prompt) on the underlying pipeline.
void StatefulLLMPipelineNPU::start_chat(const std::string& system_message) {
    m_pimpl->start_chat(system_message);
}
86+
87+
// FIXME: Do we need it?
88+
// void StatefulLLMPipelineNPU::reset_kv_state() {
89+
// m_pimpl->reset_kv_state();
90+
// }
91+
92+
void StatefulLLMPipelineNPU::finish_chat() {
93+
m_pimpl->finish_chat();
94+
}
95+
96+
} // namespace ov::genai
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Copyright (C) 2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
5+
#include <limits>
6+
7+
#include "llm/pipeline_base.hpp"
8+
9+
namespace ov::genai {
10+
11+
class StatefulLLMPipelineNPU final : public LLMPipelineImplBase {
12+
public:
13+
StatefulLLMPipelineNPU(
14+
const std::filesystem::path& models_path,
15+
const ov::genai::Tokenizer& tokenizer,
16+
const ov::AnyMap& plugin_config
17+
);
18+
19+
StatefulLLMPipelineNPU(
20+
const std::filesystem::path& models_path,
21+
const ov::AnyMap& plugin_config
22+
);
23+
24+
StatefulLLMPipelineNPU(
25+
const std::shared_ptr<ov::Model>& model,
26+
const ov::genai::Tokenizer& tokenizer,
27+
const ov::AnyMap& config,
28+
const ov::genai::GenerationConfig& generation_config
29+
);
30+
31+
DecodedResults generate(
32+
StringInputs inputs,
33+
OptionalGenerationConfig generation_config,
34+
StreamerVariant streamer
35+
) override;
36+
37+
EncodedResults generate(
38+
const EncodedInputs& inputs,
39+
OptionalGenerationConfig generation_config,
40+
StreamerVariant streamer
41+
) override;
42+
43+
void start_chat(const std::string& system_message) override;
44+
45+
void finish_chat() override;
46+
47+
~StatefulLLMPipelineNPU() = default;
48+
49+
private:
50+
std::unique_ptr<LLMPipelineImplBase> m_pimpl;
51+
};
52+
53+
} // namespace ov::genai

src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,33 +8,10 @@
88
#include "speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp"
99
#include "speculative_decoding/speculative_decoding_metrics.hpp"
1010
#include "openvino/genai/speculative_decoding/perf_metrics.hpp"
11+
#include "utils.hpp"
1112

1213
namespace ov::genai {
1314

14-
struct ModelDesc {
15-
std::string device;
16-
ov::genai::SchedulerConfig scheduler_config;
17-
ov::AnyMap properties;
18-
ov::genai::GenerationConfig generation_config;
19-
std::shared_ptr<ov::Model> model = nullptr;
20-
ov::genai::Tokenizer tokenizer;
21-
22-
ModelDesc(const std::shared_ptr<ov::Model>& model,
23-
const ov::genai::Tokenizer& tokenizer,
24-
const std::string& device = {},
25-
const ov::AnyMap& properties = {},
26-
const ov::genai::SchedulerConfig& scheduler_config = {},
27-
const ov::genai::GenerationConfig& generation_config = {}) :
28-
model(model),
29-
tokenizer(tokenizer),
30-
device(device),
31-
properties(properties),
32-
scheduler_config(scheduler_config),
33-
generation_config(generation_config) {}
34-
35-
ModelDesc() = default;
36-
};
37-
3815
class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::IContinuousBatchingPipeline {
3916
protected:
4017
std::shared_ptr<ContinuousBatchingForSpeculativeDecodingImpl> m_main_pipeline, m_draft_pipeline;

0 commit comments

Comments
 (0)