Adding non-CB pipeline for Speculative Decoding #2544
base: master
The accompanying Accuracy_Performance.xlsx doesn't contain a performance comparison with greedy decoding. The point of speculative decoding is to outperform the corresponding single-model sampling implementation. If you are going to collect that data, it is worth paying attention to the text generated with greedy decoding: it should match speculative decoding exactly (if the device is the same), and you can cover that in tests as well. Does random sampling work? I won't be able to review in time, so I'd rely on @as-suvorov, @popovaan, @sbalandi.

I will share results of validation against the stateful pipeline with greedy decoding, thanks! Random sampling doesn't work for now.

Add a test verifying that optimum-intel greedily generated text exactly matches the new speculative decoding output. That should cover the alignment with optimum and the case I described earlier. There are tests validating perf metrics for LLMs:

Done, thanks!

Please review it with the GenAI team.
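For illustration only (not part of this PR; the model paths, prompt, and device below are placeholders), a minimal sketch of that kind of check against the public C++ API. With greedy settings, the speculative pipeline is expected to reproduce the reference pipeline's text exactly; the same comparison applies when the reference text comes from optimum-intel instead.

// Illustrative check, not from this PR: with greedy decoding, the speculative
// pipeline should reproduce the single-model output exactly on the same device.
#include <cassert>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    const std::string prompt = "Why is the Sun yellow?";          // placeholder prompt
    const std::string main_model_path  = "path/to/main_model";    // placeholder paths
    const std::string draft_model_path = "path/to/draft_model";

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;
    config.do_sample = false;  // greedy decoding

    // Reference: plain stateful pipeline, greedy decoding.
    ov::genai::LLMPipeline reference(main_model_path, "NPU");
    std::string expected = reference.generate(prompt, config);

    // Same main model plus a draft model -> speculative decoding path.
    ov::genai::LLMPipeline speculative(main_model_path, "NPU",
                                       ov::genai::draft_model(draft_model_path, "NPU"));
    std::string actual = speculative.generate(prompt, config);

    assert(expected == actual);  // greedy output must match token for token
    return 0;
}

The key point is that the reference and speculative runs share the device and the generation config; a Python version of the same comparison would fit the existing LLM test suite.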
@@ -9,10 +9,10 @@
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"

#include "llm/pipeline_static.hpp"
#include "llm/pipeline_stateful.hpp"
#include "llm/pipeline_continuous_batching_adapter.hpp"
#include "speculative_decoding/speculative_decoding_impl.hpp"
#include "speculative_decoding/speculative_decoding_stateful.hpp"
#include "utils.hpp"

namespace ov {
@@ -60,6 +60,47 @@
    return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}

class StatefulPipeline {
public:
    static std::unique_ptr<LLMPipelineImplBase> create(
        const std::filesystem::path& models_path,
        const ov::genai::Tokenizer& tokenizer,
        const std::string& device,
        const ov::AnyMap& properties) {
        return create(
            ov::genai::utils::read_model(models_path, properties),
            tokenizer,
            device,
            properties,
            utils::from_config_json_if_exists(models_path));
    }

    static std::unique_ptr<LLMPipelineImplBase> create(
        const std::filesystem::path& models_path,
        const std::string& device,
        const ov::AnyMap& plugin_config) {
        return create(models_path, Tokenizer(models_path, plugin_config), device, plugin_config);
    }

    static std::unique_ptr<LLMPipelineImplBase> create(
        const std::shared_ptr<ov::Model>& model,
        const ov::genai::Tokenizer& tokenizer,
        const std::string& device,
        const ov::AnyMap& properties,
        const ov::genai::GenerationConfig& generation_config) {

        auto properties_without_draft_model = properties;
        auto draft_model_descr = ov::genai::utils::extract_draft_model_from_config(properties_without_draft_model);
        if (draft_model_descr.model != nullptr) {
            auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, {}, generation_config);
            return std::make_unique<StatefulSpeculativeLLMPipeline>(main_model_descr, draft_model_descr);
        }

        return std::make_unique<StatefulLLMPipeline>(model, tokenizer, device,
                                                     properties_without_draft_model, generation_config);
    }
};

// Public LLMPipeline

ov::genai::LLMPipeline::LLMPipeline(
@@ -80,14 +121,12 @@
    auto start_time = std::chrono::steady_clock::now();
    auto [properties, attention_backend] = utils::extract_attention_backend(user_properties);

    // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
    if (utils::explicitly_requires_paged_attention(user_properties)) {
    if (ov::genai::utils::is_npu_requested(device, properties)) {

So we basically have 3 constructors with different parameters. We could try to use constructor delegation, move the logic into a separate function with the most generic parameters, or (least preferable) use a template function with a parameter pack and forwarding (a minimal sketch of the delegation option follows the diff below).

Discussed internally; we will create a follow-up refactoring ticket.

        m_pimpl = StatefulPipeline::create(models_path, tokenizer, device, properties);
    } else if (utils::explicitly_requires_paged_attention(user_properties)) {
        // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
        auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, device_properties);
    } else if (device == "NPU") {
        m_pimpl = properties.count("STATIC_PIPELINE")
            ? static_llm::LLMPipelineFactory::create(models_path, tokenizer, properties)
            : std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
    } else if (attention_backend == PA_BACKEND) {
        // try to call CB adapter one more time, but with safe guard to silent exception
        try {
@@ -102,7 +141,7 @@
    }

    if (m_pimpl == nullptr) {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
        m_pimpl = StatefulPipeline::create(models_path, tokenizer, device, properties);
    }

    m_pimpl->save_load_time(start_time);
@@ -117,14 +156,12 @@

    auto [properties, attention_backend] = utils::extract_attention_backend(user_properties);

    // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
    if (utils::explicitly_requires_paged_attention(user_properties)) {
    if (ov::genai::utils::is_npu_requested(device, properties)) {
        m_pimpl = StatefulPipeline::create(models_path, device, properties);
    } else if (utils::explicitly_requires_paged_attention(user_properties)) {
        // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
        auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
    } else if (device == "NPU") {
        m_pimpl = properties.count("STATIC_PIPELINE")
            ? static_llm::LLMPipelineFactory::create(models_path, properties)
            : std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
    } else if (attention_backend == PA_BACKEND) {
        // try to call CB adapter one more time, but with safe guard to silent exception
        try {
@@ -139,7 +176,7 @@
    }

    if (m_pimpl == nullptr) {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
        m_pimpl = StatefulPipeline::create(models_path, device, properties);
    }

    m_pimpl->save_load_time(start_time);
@@ -157,24 +194,18 @@

    auto [properties, attention_backend] = utils::extract_attention_backend(user_properties);

    // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
    if (utils::explicitly_requires_paged_attention(user_properties)) {
    if (ov::genai::utils::is_npu_requested(device, properties)) {
        m_pimpl = StatefulPipeline::create(
            utils::singleton_core().read_model(model_str, weights_tensor),
            tokenizer,
            device,
            properties,
            generation_config);
    } else if (utils::explicitly_requires_paged_attention(user_properties)) {
        // If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues
        auto [device_properties, scheduler_config] = utils::extract_scheduler_config(properties, utils::get_latency_oriented_scheduler_config());
        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
            tokenizer, scheduler_config, device, device_properties, generation_config);
    } else if (device == "NPU") {
        m_pimpl = properties.count("STATIC_PIPELINE")
            ? static_llm::LLMPipelineFactory::create(
                utils::singleton_core().read_model(model_str, weights_tensor),
                tokenizer,
                properties,
                generation_config)
            : std::make_unique<StatefulLLMPipeline>(
                utils::singleton_core().read_model(model_str, weights_tensor),
                tokenizer,
                device,
                properties,
                generation_config);
    } else if (attention_backend == PA_BACKEND) {
        // try to call CB adapter one more time, but with safe guard to silent exception
        try {
@@ -190,7 +221,7 @@
    }

    if (m_pimpl == nullptr) {
        m_pimpl = std::make_unique<StatefulLLMPipeline>(
        m_pimpl = StatefulPipeline::create(
            utils::singleton_core().read_model(model_str, weights_tensor),
            tokenizer,
            device,
Please make sure to review it with the GenAI team.
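On the constructor-duplication comment above, a minimal sketch of the delegation option (hypothetical code, not the actual LLMPipeline; the class name, member layout, and dispatch placement are illustrative only): the thinner constructor forwards to the most generic one, so the backend-selection logic is written exactly once.

// Hypothetical illustration of constructor delegation; not the real GenAI classes.
#include <filesystem>
#include <string>

#include "openvino/genai/tokenizer.hpp"

class ExamplePipeline {
public:
    // Most generic constructor: owns the single copy of the selection logic.
    ExamplePipeline(const std::filesystem::path& models_path,
                    const ov::genai::Tokenizer& tokenizer,
                    const std::string& device,
                    const ov::AnyMap& properties) {
        // ... dispatch to NPU / continuous-batching / stateful implementations here ...
    }

    // Delegating constructor: builds the tokenizer, then reuses the generic one.
    ExamplePipeline(const std::filesystem::path& models_path,
                    const std::string& device,
                    const ov::AnyMap& properties)
        : ExamplePipeline(models_path,
                          ov::genai::Tokenizer(models_path, properties),
                          device,
                          properties) {}
};

Whether delegation or a shared free function fits best depends on the third constructor, which takes a model string and weights tensor rather than a path; that trade-off can be settled in the follow-up refactoring ticket.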