@@ -80,12 +80,12 @@ ov::genai::LLMPipeline::LLMPipeline(
80
80
auto start_time = std::chrono::steady_clock::now ();
81
81
auto [properties, attention_backend] = utils::extract_attention_backend (user_properties);
82
82
83
- // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
84
- if (utils::explicitly_requires_paged_attention (user_properties)) {
83
+ if (is_npu_requested (device, properties)) {
84
+ m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, tokenizer, properties);
85
+ } else if (utils::explicitly_requires_paged_attention (user_properties)) {
86
+ // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
85
87
auto [device_properties, scheduler_config] = utils::extract_scheduler_config (properties, utils::get_latency_oriented_scheduler_config ());
86
88
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, device_properties);
87
- } else if (device == " NPU" ) {
88
- m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, tokenizer, properties);
89
89
} else if (attention_backend == PA_BACKEND) {
90
90
// try to call CB adapter one more time, but with safe guard to silent exception
91
91
try {
@@ -115,13 +115,10 @@ ov::genai::LLMPipeline::LLMPipeline(
115
115
116
116
auto [properties, attention_backend] = utils::extract_attention_backend (user_properties);
117
117
118
- // First -> check draft model. for NPU leave it as is for the main model.
119
- // if NPU
120
- // if draft model is on NPU
121
- // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
122
- if (device == " NPU" ) {
118
+ if (is_npu_requested (device, properties)) {
123
119
m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(models_path, properties);
124
120
} else if (utils::explicitly_requires_paged_attention (user_properties)) {
121
+ // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
125
122
auto [device_properties, scheduler_config] = utils::extract_scheduler_config (properties, utils::get_latency_oriented_scheduler_config ());
126
123
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
127
124
@@ -157,17 +154,17 @@ ov::genai::LLMPipeline::LLMPipeline(
157
154
158
155
auto [properties, attention_backend] = utils::extract_attention_backend (user_properties);
159
156
160
- // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
161
- if (utils::explicitly_requires_paged_attention (user_properties)) {
162
- auto [device_properties, scheduler_config] = utils::extract_scheduler_config (properties, utils::get_latency_oriented_scheduler_config ());
163
- m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
164
- tokenizer, scheduler_config, device, device_properties, generation_config);
165
- } else if (device == " NPU" ) {
157
+ if (is_npu_requested (device, properties)) {
166
158
m_pimpl = std::make_unique<StatefulLLMPipelineNPU>(
167
159
utils::singleton_core ().read_model (model_str, weights_tensor),
168
160
tokenizer,
169
161
properties,
170
162
generation_config);
163
+ } else if (utils::explicitly_requires_paged_attention (user_properties)) {
164
+ // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
165
+ auto [device_properties, scheduler_config] = utils::extract_scheduler_config (properties, utils::get_latency_oriented_scheduler_config ());
166
+ m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
167
+ tokenizer, scheduler_config, device, device_properties, generation_config);
171
168
} else if (attention_backend == PA_BACKEND) {
172
169
// try to call CB adapter one more time, but with safe guard to silent exception
173
170
try {
0 commit comments