diff --git a/src/plugins/intel_npu/src/al/CMakeLists.txt b/src/plugins/intel_npu/src/al/CMakeLists.txt
index 7bdb9ccd7a1b6b..8e5383728338d7 100644
--- a/src/plugins/intel_npu/src/al/CMakeLists.txt
+++ b/src/plugins/intel_npu/src/al/CMakeLists.txt
@@ -24,6 +24,7 @@ target_link_libraries(${TARGET_NAME}
     PUBLIC
         openvino::npu_logger_utils
         openvino::runtime::dev
+        openvino_xml_util
 )
 
 set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/options.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/options.hpp
index 50d7b85607ca21..dad4a271fadbd4 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/options.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/options.hpp
@@ -1426,4 +1426,18 @@ struct USE_BASE_MODEL_SERIALIZER final : OptionBase<USE_BASE_MODEL_SERIALIZER, bool> {
 
+struct SERIALIZATION_WEIGHTS_SIZE_THRESHOLD final : OptionBase<SERIALIZATION_WEIGHTS_SIZE_THRESHOLD, size_t> {
+    static std::string_view key() {
+        return ov::intel_npu::serialization_weights_size_threshold.name();
+    }
+
+    static size_t defaultValue() {
+        return 0;
+    }
+
+    static OptionMode mode() {
+        return OptionMode::RunTime;
+    }
+};
+
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
index c485af1e0a3d1a..c0f727f307aea2 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
@@ -357,12 +357,21 @@ static constexpr ov::Property<bool> weightless_blob{"NPU_WEIGHTLESS_BLOB"};
  *
  * The base serializer is the OV implementation of the "XmlSerializer" without any extensions. All weights are copied in
  * a separate buffer. By turning this off, the NPU extension of the serializer is enabled. This allows optimizing the
- * process by avoiding copies into a separate weights buffer. However, this solution may be less reliable.
- *
- * @note This option doesn't actually do anything right now, it has been registered in advance.
+ * process by reducing the amount of weights that will be copied into a separate buffer. However, this solution may be
+ * less reliable.
  */
 static constexpr ov::Property<bool> use_base_model_serializer{"NPU_USE_BASE_MODEL_SERIALIZER"};
 
+/**
+ * @brief [Only for NPU Plugin]
+ * Type: size_t. Default is 0.
+ *
+ * Effective only if "use_base_model_serializer" is set to false. All "ov::Constant" buffers smaller than this value
+ * (byte size) will be copied into a separate buffer. The rest of the weights will be reconstructed at deserialization
+ * time using buffer pointers.
+ */
+static constexpr ov::Property<size_t> serialization_weights_size_threshold{"NPU_SERIALIZATION_WEIGHTS_SIZE_THRESHOLD"};
+
 /**
  * @brief [Experimental, only for NPU Plugin]
  * Type: integer.
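The two options above are designed to be used together. A minimal usage sketch (not part of the patch — the property name strings come from this header, while the model path and the 1 MiB threshold are arbitrary placeholders):

    #include "openvino/runtime/core.hpp"

    int main() {
        ov::Core core;
        const auto model = core.read_model("model.xml");

        // Opt out of the base serializer; constants of at least 1 MiB keep only
        // pointer metadata, while smaller constants are still copied.
        auto compiled = core.compile_model(model,
                                           "NPU",
                                           {{"NPU_USE_BASE_MODEL_SERIALIZER", false},
                                            {"NPU_SERIALIZATION_WEIGHTS_SIZE_THRESHOLD", static_cast<size_t>(1 << 20)}});
        return 0;
    }

Leaving the threshold at 0 (the default) keeps pointer metadata for every constant, which maximizes the memory savings but also maximizes the reliance on the weights buffers staying alive and unmoved while the compiler consumes them.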
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/weights_pointer_attribute.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/weights_pointer_attribute.hpp
new file mode 100644
index 00000000000000..e71923d7ee3f70
--- /dev/null
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/weights_pointer_attribute.hpp
@@ -0,0 +1,46 @@
+//
+// Copyright (C) 2025 Intel Corporation.
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+
+#include "openvino/core/runtime_attribute.hpp"
+
+namespace intel_npu {
+
+/**
+ * @brief Attribute containing the memory address of a weights buffer and the size of the buffer in bytes.
+ * @details Used as part of the serialization/deserialization algorithms in order to allow processing models without
+ * copying weights.
+ */
+class WeightsPointerAttribute : public ov::RuntimeAttribute {
+public:
+    OPENVINO_RTTI("WeightsPointerAttribute", "0", RuntimeAttribute);
+
+    WeightsPointerAttribute() = delete;
+
+    WeightsPointerAttribute(const void* pointer, const size_t size)
+        : memory_pointer(reinterpret_cast<size_t>(pointer)),
+          byte_size(size) {}
+
+    /**
+     * @note The names of the attributes have been kept short in order to save some memory (there may be a lot of
+     * "ov::Constant" nodes in a model). Also, three characters should be sufficient to avoid collisions.
+     */
+    static constexpr const std::string_view POINTER_KEY = "mpZ";
+    static constexpr const std::string_view BYTE_SIZE_KEY = "msZ";
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override {
+        visitor.on_attribute(POINTER_KEY.data(), memory_pointer);
+        visitor.on_attribute(BYTE_SIZE_KEY.data(), byte_size);
+        return true;
+    }
+
+    size_t memory_pointer;
+    size_t byte_size;
+};
+
+}  // namespace intel_npu
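To make the mechanism concrete, here is how the attribute round-trips through a constant's runtime info (illustrative sketch, not part of the patch; it mirrors the storeWeightsPointerAttribute() helper added in vcl_serializer.cpp further below, and the constant's values are placeholders):

    #include <cassert>

    #include "intel_npu/weights_pointer_attribute.hpp"
    #include "openvino/op/constant.hpp"

    void weights_pointer_round_trip() {
        const auto constant = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
                                                                     ov::Shape{2, 2},
                                                                     std::vector<float>{1.f, 2.f, 3.f, 4.f});

        // Record where the weights buffer lives instead of copying its contents.
        constant->get_rt_info()[intel_npu::WeightsPointerAttribute::get_type_info_static()] =
            intel_npu::WeightsPointerAttribute(constant->get_data_ptr(), constant->get_byte_size());

        // A deserializer can later rebuild the buffer from this metadata alone.
        const auto& attribute = constant->get_rt_info()
                                    .at(intel_npu::WeightsPointerAttribute::get_type_info_static())
                                    .as<intel_npu::WeightsPointerAttribute>();
        assert(attribute.byte_size == constant->get_byte_size());
    }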
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp
index 9bf0cc8ce2bb20..f41ceef203ad60 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "intel_npu/common/filtered_config.hpp"
 #include "intel_npu/common/igraph.hpp"
 
 namespace intel_npu {
@@ -11,7 +12,7 @@ namespace intel_npu {
 class ICompilerAdapter {
 public:
     virtual std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
-                                            const Config& config) const = 0;
+                                            const FilteredConfig& config) const = 0;
 
     /**
      * @brief Compiles the model, weights separation enabled.
@@ -27,7 +28,8 @@ class ICompilerAdapter {
      * "icompiler.hpp".
      * @return A "WeightlessGraph" type of object.
      */
-    virtual std::shared_ptr<IGraph> compileWS(const std::shared_ptr<ov::Model>& model, const Config& config) const = 0;
+    virtual std::shared_ptr<IGraph> compileWS(const std::shared_ptr<ov::Model>& model,
+                                              const FilteredConfig& config) const = 0;
 
     /**
      * @brief Parses the provided binary objects and returns a wrapper over the resulted L0 handles. The model may also
@@ -44,11 +46,12 @@ class ICompilerAdapter {
      */
     virtual std::shared_ptr<IGraph> parse(
         ov::Tensor mainBlob,
-        const Config& config,
+        const FilteredConfig& config,
         std::optional<std::vector<ov::Tensor>> initBlobs = std::nullopt,
         const std::optional<std::shared_ptr<const ov::Model>>& model = std::nullopt) const = 0;
 
-    virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const = 0;
+    virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model,
+                                      const FilteredConfig& config) const = 0;
     virtual uint32_t get_version() const = 0;
     virtual std::vector<std::string> get_supported_options() const = 0;
     virtual bool is_option_supported(std::string optname) const = 0;
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/custom_stream_buffer.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/custom_stream_buffer.hpp
index 727132779f703f..bf7320368426c0 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/custom_stream_buffer.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/custom_stream_buffer.hpp
@@ -75,6 +75,11 @@ class writer_streambuf final : public std::streambuf {
         }
     }
 
+    pos_type seekpos(pos_type pos, std::ios_base::openmode which) override {
+        writeIt = startIt + pos;
+        return pos;
+    }
+
     OutputIt startIt;
     OutputIt writeIt;
 };
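The seekpos() override matters because ov::pass::StreamSerialize first writes a placeholder header and only seeks back to patch it once the body has been written; without the override, seeking on an ostream backed by writer_streambuf would fail. A sketch of that interaction (illustrative only — the header bytes and buffer size are placeholders, and the writer_streambuf<uint8_t*> instantiation assumes the OutputIt template parameter visible in the members above):

    #include <ostream>

    #include "custom_stream_buffer.hpp"

    void seek_back_and_patch_header(uint8_t* data) {
        writer_streambuf<uint8_t*> streamBuf(data);
        std::ostream stream(&streamBuf);

        stream.write("\0\0\0\0\0\0\0\0", 8);  // placeholder header
        stream << "<model body>";             // serialized payload
        stream.seekp(0);                      // dispatched to seekpos() above
        stream.write("HEADER00", 8);          // real header, written in place
    }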
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp
index f1c1c302f2265c..89f0d6feb8b2e3 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp
@@ -10,6 +10,7 @@
 #include "intel_npu/config/config.hpp"
 #include "intel_npu/utils/logger/logger.hpp"
 #include "intel_npu/utils/zero/zero_init.hpp"
+#include "vcl_serializer.hpp"
 #include "ze_graph_ext_wrappers.hpp"
 
 namespace intel_npu {
@@ -18,17 +19,20 @@ class DriverCompilerAdapter final : public ICompilerAdapter {
 public:
     DriverCompilerAdapter(const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct);
 
-    std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;
+    std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
+                                    const FilteredConfig& config) const override;
 
-    std::shared_ptr<IGraph> compileWS(const std::shared_ptr<ov::Model>& model, const Config& config) const override;
+    std::shared_ptr<IGraph> compileWS(const std::shared_ptr<ov::Model>& model,
+                                      const FilteredConfig& config) const override;
 
     std::shared_ptr<IGraph> parse(
         ov::Tensor mainBlob,
-        const Config& config,
+        const FilteredConfig& config,
         std::optional<std::vector<ov::Tensor>> initBlobs = std::nullopt,
         const std::optional<std::shared_ptr<const ov::Model>>& model = std::nullopt) const override;
 
-    ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;
+    ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model,
+                              const FilteredConfig& config) const override;
 
     std::vector<std::string> get_supported_options() const override;
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ir_serializer.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ir_serializer.hpp
deleted file mode 100644
index e97c9b5aee22e1..00000000000000
--- a/src/plugins/intel_npu/src/compiler_adapter/include/ir_serializer.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (C) 2018-2025 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-#include
-#include
-
-#include "intel_npu/config/config.hpp"
-#include "intel_npu/utils/logger/logger.hpp"
-#include "ze_graph_ext.h"
-
-namespace intel_npu {
-
-using SerializedIR = std::pair<size_t, std::shared_ptr<uint8_t>>;
-
-/**
- * @brief Contain all required transformation on OpenVINO model in case for external compiler usage and
- * providing forward compatibility (OV model with opset N+M, external compiler with opset N)
- */
-namespace driver_compiler_utils {
-
-class IRSerializer {
-public:
-    IRSerializer(const std::shared_ptr<const ov::Model>& origModel, const uint32_t supportedOpset = 11);
-
-    size_t getXmlSize() const {
-        return _xmlSize;
-    }
-
-    size_t getWeightsSize() const {
-        return _weightsSize;
-    }
-
-    /**
-     * @brief Serialize OpenVINO model to target buffer
-     */
-    void serializeModelToBuffer(uint8_t* xml, uint8_t* weights);
-
-    /**
-     * @brief Serialize input / output information to string format.
-     * @details Format:
-     * --inputs_precisions="0: [1:]"
-     * --inputs_layouts="0: [1:]"
-     * --outputs_precisions="0:"
-     * --outputs_layouts="0:"
-     *
-     * For older compiler versions, the name of the inputs/outputs may be used instead of their indices.
-     *
-     * Since the layout information is no longer an important part of the metadata values when using the 2.0 OV
-     * API, the layout fields shall be filled with default values in order to assure the backward compatibility
-     * with the driver.
-     */
-    SerializedIR serializeIR(const std::shared_ptr<const ov::Model>& model,
-                             ze_graph_compiler_version_info_t compilerVersion,
-                             const uint32_t supportedOpsetVersion);
-
-    std::string serializeIOInfo(const std::shared_ptr<const ov::Model>& model, const bool useIndices);
-
-    std::string serializeConfig(const Config& config,
-                                ze_graph_compiler_version_info_t compilerVersion,
-                                bool turboSupported = false);
-
-private:
-    /**
-     * @brief Serialize OpenVINO model to target stream
-     */
-    void serializeModelToStream(std::ostream& xml, std::ostream& weights);
-
-    /**
-     * @brief Get size of xml and weights from model
-     */
-    void countModelSize();
-
-    Logger _logger;
-    std::shared_ptr<ov::Model> _model = nullptr;
-    uint32_t _supportedOpset = 11;
-    size_t _xmlSize = 0;
-    size_t _weightsSize = 0;
-};
-}  // namespace driver_compiler_utils
-}  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp
index b62cb91b29791f..0675d964565947 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp
@@ -19,17 +19,20 @@ class PluginCompilerAdapter final : public ICompilerAdapter {
 public:
     PluginCompilerAdapter(const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct);
 
-    std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;
+    std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
+                                    const FilteredConfig& config) const override;
 
-    std::shared_ptr<IGraph> compileWS(const std::shared_ptr<ov::Model>& model, const Config& config) const override;
+    std::shared_ptr<IGraph> compileWS(const std::shared_ptr<ov::Model>& model,
+                                      const FilteredConfig& config) const override;
 
     std::shared_ptr<IGraph> parse(
         ov::Tensor mainBlob,
-        const Config& config,
+        const FilteredConfig& config,
         std::optional<std::vector<ov::Tensor>> initBlobs = std::nullopt,
         const std::optional<std::shared_ptr<const ov::Model>>& model = std::nullopt) const override;
 
-    ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;
+    ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model,
+                              const FilteredConfig& config) const override;
 
     std::vector<std::string> get_supported_options() const override;
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/vcl_serializer.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/vcl_serializer.hpp
new file mode 100644
index 00000000000000..6b8f01b373a831
--- /dev/null
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/vcl_serializer.hpp
@@ -0,0 +1,142 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include
+
+#include
+#include
+
+#include "intel_npu/config/config.hpp"
+#include "intel_npu/utils/logger/logger.hpp"
+#include "openvino/core/model.hpp"
+#include "openvino/pass/manager.hpp"
+#include "ze_graph_ext.h"
+
+namespace intel_npu {
+
+using SerializedIR = std::pair<size_t, std::shared_ptr<uint8_t>>;
+
+/**
+ * @brief Contains all required transformations on the OpenVINO model for external compiler usage, while
+ * providing forward compatibility (OV model with opset N+M, external compiler with opset N).
+ */
+namespace driver_compiler_utils {
+
+/**
+ * @brief Interface to be used by the serialization algorithms.
+ * @details The "VCL" serializer is meant to integrate an OV serializer and add any additional model metadata in order
+ * to feed the compilation method of the "VCL" interface.
+ */
+class VCLSerializerBase {
+public:
+    VCLSerializerBase(const std::shared_ptr<const ov::Model>& origModel,
+                      const ze_graph_compiler_version_info_t compilerVersion,
+                      const uint32_t supportedOpset = 11);
+
+    virtual SerializedIR serialize() = 0;
+
+    virtual ~VCLSerializerBase();
+
+protected:
+    void serialize_model_to_stream(const std::function<void(ov::pass::Manager&)>& register_serialization_pass);
+
+    Logger _logger;
+    std::shared_ptr<ov::Model> _model = nullptr;
+    ze_graph_compiler_version_info_t _compilerVersion;
+    uint32_t _supportedOpset = 11;
+};
+
+/**
+ * @brief Class implementing the legacy serialization algorithms. All weights are copied into a separate buffer.
+ */
+class VCLSerializerWithWeightsCopy : public VCLSerializerBase {
+public:
+    VCLSerializerWithWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
+                                 const ze_graph_compiler_version_info_t compilerVersion,
+                                 const uint32_t supportedOpset = 11);
+
+    SerializedIR serialize() override;
+
+private:
+    /**
+     * @brief Serialize OpenVINO model to target buffer
+     */
+    void serialize_model_to_buffer(uint8_t* xml, uint8_t* weights);
+
+    /**
+     * @brief Serialize OpenVINO model to target stream
+     */
+    void serialize_model_to_stream(std::ostream& xml, std::ostream& weights);
+
+    /**
+     * @brief Get size of xml and weights from model
+     */
+    void count_model_size();
+
+    size_t _xmlSize = 0;
+    size_t _weightsSize = 0;
+};
+
+/**
+ * @brief Class implementing the optimized serialization algorithm.
+ * @details Weights will be stored either as metadata (memory location & size in bytes) or as whole buffers (just like
+ * the legacy algorithm). The amount of weights that will be copied can be controlled by leveraging the
+ * "intel_npu::serialization_weights_size_threshold" config option.
+ */
+class VCLSerializerWithoutWeightsCopy : public VCLSerializerBase {
+public:
+    VCLSerializerWithoutWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
+                                    const ze_graph_compiler_version_info_t compilerVersion,
+                                    const uint32_t supportedOpset = 11);
+
+    SerializedIR serialize() override;
+
+private:
+    void serialize_model_to_buffer(uint8_t* buffer);
+
+    void serialize_model_to_stream(std::ostream& stream);
+
+    void count_model_size();
+
+    uint64_t _serializedModelSize = 0;
+};
+
+/**
+ * @brief Serializes the model using a format supported by the "VCL" interface.
+ *
+ * @param supportedOpsetVersion The latest operator set version supported by the compiler.
+ * @param useBaseModelSerializer "true" means the legacy serializer will be used (weights will be copied), "false" means
+ * the optimized one is used instead (weights pointers are stored).
+ * @param weightsSizeThreshold Relevant only if "useBaseModelSerializer" is false. The weights smaller than this value
+ * will be copied into a separate buffer. The rest will have only their memory location stored.
+ */
+SerializedIR serializeIR(const std::shared_ptr<const ov::Model>& model,
+                         ze_graph_compiler_version_info_t compilerVersion,
+                         const uint32_t supportedOpsetVersion,
+                         const bool useBaseModelSerializer = true,
+                         const size_t weightsSizeThreshold = 0);
+
+/**
+ * @brief Serialize input / output information to string format.
+ * @details Format:
+ * --inputs_precisions="0: [1:]"
+ * --inputs_layouts="0: [1:]"
+ * --outputs_precisions="0:"
+ * --outputs_layouts="0:"
+ *
+ * For older compiler versions, the name of the inputs/outputs may be used instead of their indices.
+ *
+ * Since the layout information is no longer an important part of the metadata values when using the 2.0 OV
+ * API, the layout fields shall be filled with default values in order to assure the backward compatibility
+ * with the driver.
+ */
+std::string serializeIOInfo(const std::shared_ptr<const ov::Model>& model, const bool useIndices);
+
+std::string serializeConfig(const Config& config,
+                            ze_graph_compiler_version_info_t compilerVersion,
+                            bool turboSupported = false);
+
+}  // namespace driver_compiler_utils
+}  // namespace intel_npu
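Since both concrete serializers share the VCLSerializerBase interface, the caller can pick one up front and use it polymorphically — this is essentially what the serializeIR() helper declared above does. A sketch (the model, compilerVersion and flag values are placeholders):

    using namespace intel_npu::driver_compiler_utils;

    std::unique_ptr<VCLSerializerBase> makeSerializer(const std::shared_ptr<const ov::Model>& model,
                                                      const ze_graph_compiler_version_info_t compilerVersion,
                                                      const bool useBaseModelSerializer) {
        if (useBaseModelSerializer) {
            return std::make_unique<VCLSerializerWithWeightsCopy>(model, compilerVersion, 11);
        }
        return std::make_unique<VCLSerializerWithoutWeightsCopy>(model, compilerVersion, 11);
    }

    // intel_npu::SerializedIR ir = makeSerializer(model, version, false)->serialize();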
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/xml_serializer.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/xml_serializer.hpp
new file mode 100644
index 00000000000000..689114fbdf8543
--- /dev/null
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/xml_serializer.hpp
@@ -0,0 +1,106 @@
+//
+// Copyright (C) 2025 Intel Corporation.
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/serialize.hpp"
+#include "openvino/xml_util/xml_serialize_util.hpp"
+
+namespace intel_npu {
+
+/**
+ * @brief Nothing is stored. The weights are expected to be reconstructed in some other way.
+ */
+class WeightlessWriter : public ov::util::ConstantWriter {
+public:
+    explicit WeightlessWriter(ov::util::ConstantWriter& other) : ov::util::ConstantWriter(other) {}
+
+    FilePosition write(const char*, size_t, size_t&, bool, ov::element::Type, bool) override {
+        return 0;
+    }
+};
+
+/**
+ * @brief Overridden in order to allow serializing models without copying weights.
+ * @details Weights can be stored either as values (buffer copies, just like the parent algorithm), or as metadata
+ * (memory location + buffer size in bytes). The amount of weights that are copied as values can be controlled by
+ * configuring the "intel_npu::serialization_weights_size_threshold" option.
+ */
+class XmlSerializer : public ov::util::XmlSerializer {
+public:
+    XmlSerializer(pugi::xml_node& data,
+                  const std::string& node_type_name,
+                  ov::util::ConstantWriter& constant_write_handler,
+                  int64_t version,
+                  std::shared_ptr<WeightlessWriter> weightless_constant_writer = nullptr)
+        : ov::util::XmlSerializer(data,
+                                  node_type_name,
+                                  constant_write_handler,
+                                  version,
+                                  false,
+                                  false,
+                                  ov::element::dynamic,
+                                  false),
+          m_base_constant_writer(std::ref(constant_write_handler)),
+          m_weightless_constant_writer(weightless_constant_writer
+                                           ? weightless_constant_writer
+                                           : std::make_shared<WeightlessWriter>(constant_write_handler)) {}
+
+private:
+    /**
+     * @brief Toggles between the two writers.
+     */
+    ov::util::ConstantWriter& get_constant_write_handler() override;
+
+    /**
+     * @brief Overridden in order to choose which weights writer will be used based on the occurrence of the
+     * "WeightsPointerAttribute".
+     */
+    bool append_node_attributes(ov::Node& node) override;
+
+    std::unique_ptr<ov::util::XmlSerializer> make_visitor(pugi::xml_node& data,
+                                                          const std::string& node_type_name,
+                                                          ov::util::ConstantWriter& constant_write_handler,
+                                                          int64_t version,
+                                                          bool,
+                                                          bool,
+                                                          ov::element::Type,
+                                                          bool) const override;
+
+    /**
+     * @brief The base OV writer, copies the weights in a dedicated buffer.
+     */
+    std::reference_wrapper<ov::util::ConstantWriter> m_base_constant_writer;
+    /**
+     * @brief Writes nothing. The visitor pattern will be used in order to store weights metadata instead.
+     */
+    std::shared_ptr<WeightlessWriter> m_weightless_constant_writer = nullptr;
+    bool m_use_weightless_writer = false;
+};
+
+/**
+ * @brief Leverages the "intel_npu::XmlSerializer" in order to allow serializing models without copying weights.
+ */
+class StreamSerialize : public ov::pass::StreamSerialize {
+public:
+    StreamSerialize(std::ostream& stream,
+                    const std::function<void(std::ostream&)>& custom_data_serializer,
+                    ov::pass::Serialize::Version version = ov::pass::Serialize::Version::UNSPECIFIED)
+        : ov::pass::StreamSerialize(stream, custom_data_serializer, {}, version) {}
+
+private:
+    std::unique_ptr<ov::util::XmlSerializer> make_serializer(pugi::xml_node& data,
+                                                             const std::string& node_type_name,
+                                                             ov::util::ConstantWriter& constant_write_handler,
+                                                             int64_t version,
+                                                             bool,
+                                                             bool,
+                                                             ov::element::Type,
+                                                             bool) const override {
+        return std::make_unique<XmlSerializer>(data, node_type_name, constant_write_handler, version);
+    }
+};
+
+}  // namespace intel_npu
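Wiring it together: the custom StreamSerialize pass can be run through a regular pass manager, which is how the "without weights copy" serializer drives it in vcl_serializer.cpp below. A sketch (the empty custom-data callback and the string stream are placeholders):

    #include <sstream>

    #include "openvino/pass/manager.hpp"
    #include "xml_serializer.hpp"

    void serialize_without_weight_copies(const std::shared_ptr<ov::Model>& model) {
        std::ostringstream stream;
        ov::pass::Manager manager;
        // Constants tagged with WeightsPointerAttribute are written as pointer
        // metadata only; untagged ones still go through the base ConstantWriter.
        manager.register_pass<intel_npu::StreamSerialize>(stream, nullptr);
        manager.run_passes(model);
    }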
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp
index 94a058650019e0..505c988e41151c 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp
@@ -10,7 +10,7 @@
 #include "intel_npu/network_metadata.hpp"
 #include "intel_npu/utils/logger/logger.hpp"
 #include "intel_npu/utils/zero/zero_init.hpp"
-#include "ir_serializer.hpp"
+#include "vcl_serializer.hpp"
 
 namespace intel_npu {
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
index cd459b604f3f6c..df8209a23e39fd 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
@@ -11,13 +11,14 @@
 #include "intel_npu/common/itt.hpp"
 #include "intel_npu/config/options.hpp"
 #include "intel_npu/utils/logger/logger.hpp"
-#include "ir_serializer.hpp"
 #include "mem_usage.hpp"
 #include "openvino/core/model.hpp"
 #include "openvino/core/rt_info/weightless_caching_attributes.hpp"
+#include "vcl_serializer.hpp"
 #include "weightless_graph.hpp"
 
 namespace {
+
 bool isInitMetadata(const intel_npu::NetworkMetadata& networkMetadata) {
     if (networkMetadata.inputs.size() == 0) {
         return false;
@@ -80,7 +81,7 @@ DriverCompilerAdapter::DriverCompilerAdapter(const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct)
 }
 
 std::shared_ptr<IGraph> DriverCompilerAdapter::compile(const std::shared_ptr<const ov::Model>& model,
-                                                       const Config& config) const {
+                                                       const FilteredConfig& config) const {
     OV_ITT_TASK_CHAIN(COMPILE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "compile");
 
     const ze_graph_compiler_version_info_t& compilerVersion = _compilerProperties.compilerVersion;
@@ -88,17 +89,24 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::compile(const std::shared_ptr<const ov::Model>& model,
 
-    IRSerializer irSerializer(model, maxOpsetVersion);
+    auto serializedIR = driver_compiler_utils::serializeIR(model,
+                                                           compilerVersion,
+                                                           maxOpsetVersion,
+                                                           config.has<USE_BASE_MODEL_SERIALIZER>() ? config.get<USE_BASE_MODEL_SERIALIZER>()
+                                                                                                   : true,
+                                                           config.get<SERIALIZATION_WEIGHTS_SIZE_THRESHOLD>());
 
     std::string buildFlags;
     const bool useIndices = !((compilerVersion.major < 5) || (compilerVersion.major == 5 && compilerVersion.minor < 9));
 
     _logger.debug("build flags");
-    buildFlags += irSerializer.serializeIOInfo(model, useIndices);
+    buildFlags += driver_compiler_utils::serializeIOInfo(model, useIndices);
     buildFlags += " ";
-    buildFlags +=
-        irSerializer.serializeConfig(config, compilerVersion, _zeGraphExt->isTurboOptionSupported(compilerVersion));
+    buildFlags += driver_compiler_utils::serializeConfig(config,
+                                                         compilerVersion,
+                                                         _zeGraphExt->isTurboOptionSupported(compilerVersion));
 
     _logger.debug("compileIR Build flags : %s", buildFlags.c_str());
@@ -121,7 +129,7 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::compile(const std::shared_ptr<const ov::Model>& model,
 }
 
 std::shared_ptr<IGraph> DriverCompilerAdapter::compileWS(const std::shared_ptr<ov::Model>& model,
-                                                         const Config& config) const {
+                                                         const FilteredConfig& config) const {
     OV_ITT_TASK_CHAIN(COMPILE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "compileWS");
 
     storeWeightlessCacheAttribute(model);
@@ -144,13 +152,18 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::compileWS(const std::shared_ptr<ov::Model>& model,
 
-    IRSerializer irSerializer(model, maxOpsetVersion);
+    auto serializedIR = driver_compiler_utils::serializeIR(model,
+                                                           compilerVersion,
+                                                           maxOpsetVersion,
+                                                           config.has<USE_BASE_MODEL_SERIALIZER>() ? config.get<USE_BASE_MODEL_SERIALIZER>()
+                                                                                                   : true,
+                                                           config.get<SERIALIZATION_WEIGHTS_SIZE_THRESHOLD>());
 
     std::string buildFlags;
     const bool useIndices = !((compilerVersion.major < 5) || (compilerVersion.major == 5 && compilerVersion.minor < 9));
 
-    const std::string serializedIOInfo = irSerializer.serializeIOInfo(model, useIndices);
+    const std::string serializedIOInfo = driver_compiler_utils::serializeIOInfo(model, useIndices);
 
     const FilteredConfig* plgConfig = dynamic_cast<const FilteredConfig*>(&config);
     if (plgConfig == nullptr) {
         OPENVINO_THROW("config is not FilteredConfig");
     }
@@ -177,7 +190,7 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::compileWS(const std::shared_ptr<ov::Model>& model,
 
 std::shared_ptr<IGraph> DriverCompilerAdapter::parse(
     ov::Tensor mainBlob,
-    const Config& config,
+    const FilteredConfig& config,
     std::optional<std::vector<ov::Tensor>> initBlobs,
     const std::optional<std::shared_ptr<const ov::Model>>& model) const {
     OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "parse");
@@ -278,7 +291,7 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::parse(
 }
 
 ov::SupportedOpsMap DriverCompilerAdapter::query(const std::shared_ptr<const ov::Model>& model,
-                                                 const Config& config) const {
+                                                 const FilteredConfig& config) const {
     OV_ITT_TASK_CHAIN(query_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "query");
 
     const ze_graph_compiler_version_info_t& compilerVersion = _compilerProperties.compilerVersion;
@@ -286,11 +299,16 @@ ov::SupportedOpsMap DriverCompilerAdapter::query(const std::shared_ptr<const ov::Model>& model,
 
-    IRSerializer irSerializer(model, maxOpsetVersion);
+    auto serializedIR = driver_compiler_utils::serializeIR(model,
+                                                           compilerVersion,
+                                                           maxOpsetVersion,
+                                                           config.has<USE_BASE_MODEL_SERIALIZER>() ? config.get<USE_BASE_MODEL_SERIALIZER>()
+                                                                                                   : true,
+                                                           config.get<SERIALIZATION_WEIGHTS_SIZE_THRESHOLD>());
 
     std::string buildFlags;
-    buildFlags += irSerializer.serializeConfig(config, compilerVersion);
+    buildFlags += driver_compiler_utils::serializeConfig(config, compilerVersion);
 
     _logger.debug("queryImpl build flags : %s", buildFlags.c_str());
 
     ov::SupportedOpsMap result;
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
index 1a3a1a51ac6751..c58ede410e9b23 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
@@ -91,7 +91,7 @@ PluginCompilerAdapter::PluginCompilerAdapter(const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct)
 }
 
 std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<const ov::Model>& model,
-                                                       const Config& config) const {
+                                                       const FilteredConfig& config) const {
     OV_ITT_TASK_CHAIN(COMPILE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "compile");
 
     _logger.debug("compile start");
@@ -124,7 +124,7 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<const ov::Model>& model,
 }
 
 std::shared_ptr<IGraph> PluginCompilerAdapter::compileWS(const std::shared_ptr<ov::Model>& model,
-                                                         const Config& config) const {
+                                                         const FilteredConfig& config) const {
     OV_ITT_TASK_CHAIN(COMPILE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "compileWS");
 
     std::vector<std::shared_ptr<NetworkDescription>> initNetworkDescriptions;
@@ -257,7 +257,7 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compileWS(const std::shared_ptr<ov::Model>& model,
 
 std::shared_ptr<IGraph> PluginCompilerAdapter::parse(
     ov::Tensor mainBlob,
-    const Config& config,
+    const FilteredConfig& config,
     std::optional<std::vector<ov::Tensor>> initBlobs,
     const std::optional<std::shared_ptr<const ov::Model>>& model) const {
     OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "parse");
@@ -331,7 +331,7 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::parse(
 }
 
 ov::SupportedOpsMap PluginCompilerAdapter::query(const std::shared_ptr<const ov::Model>& model,
-                                                 const Config& config) const {
+                                                 const FilteredConfig& config) const {
     OV_ITT_TASK_CHAIN(QUERY_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "query");
 
     return _compiler->query(model, config);
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/ir_serializer.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/vcl_serializer.cpp
similarity index 65%
rename from src/plugins/intel_npu/src/compiler_adapter/src/ir_serializer.cpp
rename to src/plugins/intel_npu/src/compiler_adapter/src/vcl_serializer.cpp
index 723c27bc1cf38d..731e6ce06dde3f 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/ir_serializer.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/vcl_serializer.cpp
@@ -2,20 +2,24 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include "ir_serializer.hpp"
+#include "vcl_serializer.hpp"
 
 #include
+#include
 #include
 #include
+#include
 
 #include "custom_stream_buffer.hpp"
 #include "intel_npu/common/filtered_config.hpp"
 #include "intel_npu/config/options.hpp"
-#include "openvino/pass/manager.hpp"
+#include "intel_npu/weights_pointer_attribute.hpp"
 #include "openvino/pass/serialize.hpp"
 #include "transformations/op_conversions/convert_interpolate11_downgrade.hpp"
+#include "xml_serializer.hpp"
 
 namespace {
+
 constexpr std::string_view INPUTS_PRECISIONS_KEY = "--inputs_precisions";
 constexpr std::string_view INPUTS_LAYOUTS_KEY = "--inputs_layouts";
 constexpr std::string_view OUTPUTS_PRECISIONS_KEY = "--outputs_precisions";
@@ -136,12 +140,63 @@ std::string rankToLegacyLayoutString(const size_t rank) {
     }
 }
 
+/**
+ * @brief Stores weights metadata (memory location & buffer size in bytes) as runtime attributes of "ov::Constant"
+ * nodes.
+ * @details The presence of these attributes determines which weights are copied in a separate buffer by the
+ * serialization algorithm. If the attribute is found, the metadata required to reconstruct the weights buffer is
+ * present, therefore copying the buffer is omitted.
+ *
+ * @param model The target model, the attributes will be stored within it.
+ * @param weightSizeThreshold Determines which constant nodes will have this attribute stored within them. Implicitly,
+ * this determines which weights will be copied at serialization time. The weights smaller than this value will not get
+ * this attribute.
+ */
+void storeWeightsPointerAttribute(const std::shared_ptr<ov::Model>& model, const size_t weightSizeThreshold) {
+    for (auto&& node : model->get_ordered_ops()) {
+        if (!ov::is_type<ov::op::v0::Constant>(node)) {
+            continue;
+        }
+
+        auto constantNode = std::static_pointer_cast<ov::op::v0::Constant>(node);
+        if (constantNode->get_byte_size() < weightSizeThreshold) {
+            continue;
+        }
+
+        ov::RTMap& runtimeInfoMap = constantNode->get_rt_info();
+        runtimeInfoMap[intel_npu::WeightsPointerAttribute::get_type_info_static()] =
+            intel_npu::WeightsPointerAttribute(constantNode->get_data_ptr(), constantNode->get_byte_size());
+    }
+}
+
+/**
+ * @brief Removes the attributes stored by "storeWeightsPointerAttribute" in order to restore the model to its original
+ * state.
+ * @see storeWeightsPointerAttribute for details.
+ */
+void removeWeightsPointerAttribute(const std::shared_ptr<ov::Model>& model) {
+    for (auto&& node : model->get_ordered_ops()) {
+        if (!ov::is_type<ov::op::v0::Constant>(node)) {
+            continue;
+        }
+
+        ov::RTMap& runtimeInfoMap = node->get_rt_info();
+        const auto& resultIt = runtimeInfoMap.find(intel_npu::WeightsPointerAttribute::get_type_info_static());
+        if (resultIt != runtimeInfoMap.end()) {
+            runtimeInfoMap.erase(resultIt);
+        }
+    }
+}
+
 }  // namespace
 
 namespace intel_npu::driver_compiler_utils {
 
-IRSerializer::IRSerializer(const std::shared_ptr<const ov::Model>& origModel, const uint32_t supportedOpset)
-    : _logger("IRSerializer", Logger::global().level()),
+VCLSerializerBase::VCLSerializerBase(const std::shared_ptr<const ov::Model>& origModel,
+                                     const ze_graph_compiler_version_info_t compilerVersion,
+                                     const uint32_t supportedOpset)
+    : _logger("VCLSerializerBase", Logger::global().level()),
+      _compilerVersion(compilerVersion),
       _supportedOpset(supportedOpset) {
     // There is no const variant of run_passes so use const_cast here
     // as model serialization does not mutate the model
@@ -152,14 +207,29 @@ IRSerializer::IRSerializer(const std::shared_ptr<const ov::Model>& origModel, const uint32_t supportedOpset)
         _model = _model->clone();
         _logger.info("Clone model for offset smaller than 11");
     }
-
-    countModelSize();
 }
 
-void IRSerializer::serializeModelToStream(std::ostream& xml, std::ostream& weights) {
-    _logger.debug("serializeModelToStream");
+VCLSerializerBase::~VCLSerializerBase() = default;
+
+VCLSerializerWithWeightsCopy::VCLSerializerWithWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
+                                                           const ze_graph_compiler_version_info_t compilerVersion,
+                                                           const uint32_t supportedOpset)
+    : VCLSerializerBase(origModel, compilerVersion, supportedOpset) {
+    _logger.setName("VCLSerializerWithWeightsCopy");
+};
+
+VCLSerializerWithoutWeightsCopy::VCLSerializerWithoutWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
+                                                                 const ze_graph_compiler_version_info_t compilerVersion,
+                                                                 const uint32_t supportedOpset)
+    : VCLSerializerBase(origModel, compilerVersion, supportedOpset) {
+    _logger.setName("VCLSerializerWithoutWeightsCopy");
+};
+
+void VCLSerializerBase::serialize_model_to_stream(
+    const std::function<void(ov::pass::Manager&)>& register_serialization_pass) {
+    _logger.debug("serialize_model_to_stream");
     const auto passConfig = std::make_shared<ov::pass::PassConfig>();
-    ov::pass::Manager manager(std::move(passConfig), "NPU:serializeModelToStream");
+    ov::pass::Manager manager(std::move(passConfig), "NPU:serialize_model_to_stream");
 
     if (_supportedOpset < 11) {
         // Downgrade to opset10
@@ -167,18 +237,7 @@ void IRSerializer::serializeModelToStream(std::ostream& xml, std::ostream& weights)
         _logger.info("Downgrade op for opset smaller than 11");
     }
 
-    manager.register_pass<ov::pass::Serialize>(xml, weights);
-
-    // Depending on the driver version, the compiler attached to it may request this information as an indicator of the
-    // precision/layout preprocessing requirement. We are setting this value to "true" since the API version is no
-    // longer a cause for altering the metadata. This is due to the preprocessing performed in the OpenVINO framework's
-    // implementaion, the "ov::Model" object is preprocessed before reaching the NPU plugin.
-    const auto newAPIKey = "is_new_api";
-
-    // Flag used for indicating an NPU plugin version which switched the I/O identification convention from names to
-    // indices. The flag is required in order to inform the driver-compiler adapter to expect indices when attempting to
-    // deserialize the I/O metadata.
-    const auto useIndicesForIOMetadata = "use_indices_for_io_metadata";
+    register_serialization_pass(manager);
 
     // We modify the original model object here therefore a mutex is required
     static std::mutex rtInfoMutex;
 
@@ -186,58 +245,107 @@ void IRSerializer::serializeModelToStream(std::ostream& xml, std::ostream& weights)
     {
        std::lock_guard<std::mutex> lock(rtInfoMutex);
 
-        _model->set_rt_info(true, newAPIKey);
-        _model->set_rt_info(true, useIndicesForIOMetadata);
+        // Depending on the driver version, the compiler attached to it may request this information as an indicator of
+        // the precision/layout preprocessing requirement. We are setting this value to "true" since the API version is
+        // no longer a cause for altering the metadata. This is due to the preprocessing performed in the OpenVINO
+        // framework's implementation, the "ov::Model" object is preprocessed before reaching the NPU plugin.
+        _model->set_rt_info(true, "is_new_api");
+        // Flag used for indicating an NPU plugin version which switched the I/O identification convention from names to
+        // indices. The flag is required in order to inform the driver-compiler adapter to expect indices when
+        // attempting to deserialize the I/O metadata.
+        _model->set_rt_info(true, "use_indices_for_io_metadata");
 
         manager.run_passes(_model);
 
         auto& rtInfo = _model->get_rt_info();
-        rtInfo.erase(newAPIKey);
-        rtInfo.erase(useIndicesForIOMetadata);
+        rtInfo.erase("is_new_api");
+        rtInfo.erase("use_indices_for_io_metadata");
     }
 
-    _logger.debug("serializeModelToStream end");
+    _logger.debug("serialize_model_to_stream end");
 }
 
-void IRSerializer::countModelSize() {
-    _logger.debug("countModelSize");
+void VCLSerializerWithWeightsCopy::serialize_model_to_stream(std::ostream& xml, std::ostream& weights) {
+    const std::function<void(ov::pass::Manager&)>& register_serialization_pass = [&](ov::pass::Manager& manager) {
+        manager.register_pass<ov::pass::Serialize>(xml, weights);
+    };
+    VCLSerializerBase::serialize_model_to_stream(register_serialization_pass);
+}
+
+void VCLSerializerWithoutWeightsCopy::serialize_model_to_stream(std::ostream& stream) {
+    const std::function<void(std::ostream&)>& compiler_version_serializer = [&](std::ostream& stream) {
+        stream.write(reinterpret_cast<const char*>(&_compilerVersion), sizeof(_compilerVersion));
+    };
+    const std::function<void(ov::pass::Manager&)>& register_serialization_pass = [&](ov::pass::Manager& manager) {
+        manager.register_pass<StreamSerialize>(stream, compiler_version_serializer);
+    };
+    VCLSerializerBase::serialize_model_to_stream(register_serialization_pass);
+}
+
+void VCLSerializerWithWeightsCopy::count_model_size() {
+    _logger.debug("count_model_size");
 
     counter_streambuf xmlStreamBuf;
     counter_streambuf weightsStreamBuf;
     std::ostream xmlStream(&xmlStreamBuf);
     std::ostream weightsStream(&weightsStreamBuf);
 
-    serializeModelToStream(xmlStream, weightsStream);
+    serialize_model_to_stream(xmlStream, weightsStream);
 
     _xmlSize = xmlStreamBuf.size();
     _weightsSize = weightsStreamBuf.size();
 
-    _logger.debug("countModelSize completed, xml size: %d, weights size: %d", _xmlSize, _weightsSize);
+    _logger.debug("count_model_size completed, xml size: %d, weights size: %d", _xmlSize, _weightsSize);
+}
+
+void VCLSerializerWithoutWeightsCopy::count_model_size() {
+    _logger.debug("count_model_size");
+
+    counter_streambuf streamBuf;
+    std::ostream stream(&streamBuf);
+
+    serialize_model_to_stream(stream);
+
+    _serializedModelSize = streamBuf.size();
+
+    _logger.debug("count_model_size completed, serialized model size: %d", _serializedModelSize);
 }
 
-void IRSerializer::serializeModelToBuffer(uint8_t* xml, uint8_t* weights) {
-    _logger.debug("serializeModelToBuffer");
+void VCLSerializerWithWeightsCopy::serialize_model_to_buffer(uint8_t* xml, uint8_t* weights) {
+    _logger.debug("serialize_model_to_buffer");
 
     writer_streambuf xmlStreamBuf(xml);
     writer_streambuf weightsStreamBuf(weights);
     std::ostream xmlStream(&xmlStreamBuf);
     std::ostream weightsStream(&weightsStreamBuf);
 
-    serializeModelToStream(xmlStream, weightsStream);
+    serialize_model_to_stream(xmlStream, weightsStream);
+
+    _logger.debug("serialize_model_to_buffer end");
+}
+
+void VCLSerializerWithoutWeightsCopy::serialize_model_to_buffer(uint8_t* buffer) {
+    _logger.debug("serialize_model_to_buffer");
 
-    _logger.debug("serializeModelToBuffer end");
+    writer_streambuf streamBuf(buffer);
+    std::ostream stream(&streamBuf);
+
+    serialize_model_to_stream(stream);
+
+    _logger.debug("serialize_model_to_buffer end");
 }
 
-SerializedIR IRSerializer::serializeIR(const std::shared_ptr<const ov::Model>& model,
-                                       ze_graph_compiler_version_info_t compilerVersion,
-                                       const uint32_t supportedOpsetVersion) {
+SerializedIR VCLSerializerWithWeightsCopy::serialize() {
+    count_model_size();
+
     // Contract between adapter and compiler in driver
     const uint32_t maxNumberOfElements = 10;
     const uint64_t maxSizeOfXML = std::numeric_limits<uint64_t>::max() / 3;
     const uint64_t maxSizeOfWeights = maxSizeOfXML * 2;
     const uint32_t numberOfInputData = 2;
 
-    const uint64_t xmlSize = static_cast<uint64_t>(getXmlSize());
-    const uint64_t weightsSize = static_cast<uint64_t>(getWeightsSize());
+    const uint64_t xmlSize = static_cast<uint64_t>(_xmlSize);
+    const uint64_t weightsSize = static_cast<uint64_t>(_weightsSize);
 
     OPENVINO_ASSERT(numberOfInputData < maxNumberOfElements);
     if (xmlSize >= maxSizeOfXML) {
@@ -250,7 +358,7 @@ SerializedIR IRSerializer::serializeIR(const std::shared_ptr<const ov::Model>& model,
                        maxSizeOfWeights);
     }
 
-    const uint64_t sizeOfSerializedIR = sizeof(compilerVersion) + sizeof(numberOfInputData) + sizeof(xmlSize) +
+    const uint64_t sizeOfSerializedIR = sizeof(_compilerVersion) + sizeof(numberOfInputData) + sizeof(xmlSize) +
                                         xmlSize + sizeof(weightsSize) + weightsSize;
 
     // use array to avoid vector's memory zeroing overhead
@@ -258,8 +366,8 @@ SerializedIR IRSerializer::serializeIR(const std::shared_ptr<const ov::Model>& model,
     uint8_t* serializedIR = buffer.get();
 
     uint64_t offset = 0;
-    checkedMemcpy(serializedIR + offset, sizeOfSerializedIR - offset, &compilerVersion, sizeof(compilerVersion));
-    offset += sizeof(compilerVersion);
+    checkedMemcpy(serializedIR + offset, sizeOfSerializedIR - offset, &_compilerVersion, sizeof(_compilerVersion));
+    offset += sizeof(_compilerVersion);
     checkedMemcpy(serializedIR + offset, sizeOfSerializedIR - offset, &numberOfInputData, sizeof(numberOfInputData));
     offset += sizeof(numberOfInputData);
@@ -274,14 +382,50 @@ SerializedIR IRSerializer::serializeIR(const std::shared_ptr<const ov::Model>& model,
     uint64_t weightsOffset = offset;
     offset += weightsSize;
 
-    serializeModelToBuffer(serializedIR + xmlOffset, serializedIR + weightsOffset);
+    serialize_model_to_buffer(serializedIR + xmlOffset, serializedIR + weightsOffset);
 
     OPENVINO_ASSERT(offset == sizeOfSerializedIR);
 
     return std::make_pair(sizeOfSerializedIR, buffer);
 }
 
-std::string IRSerializer::serializeIOInfo(const std::shared_ptr<const ov::Model>& model, const bool useIndices) {
+SerializedIR VCLSerializerWithoutWeightsCopy::serialize() {
+    count_model_size();
+
+    if (_serializedModelSize >= std::numeric_limits<uint64_t>::max()) {
+        OPENVINO_THROW("The serialized model is too big to process. Size: ",
+                       _serializedModelSize,
+                       " >= ",
+                       std::numeric_limits<uint64_t>::max());
+    }
+
+    // use array to avoid vector's memory zeroing overhead
+    std::shared_ptr<uint8_t> buffer(new uint8_t[_serializedModelSize], std::default_delete<uint8_t[]>());
+    serialize_model_to_buffer(buffer.get());
+
+    return SerializedIR(_serializedModelSize, buffer);
+}
+
+SerializedIR serializeIR(const std::shared_ptr<const ov::Model>& model,
+                         const ze_graph_compiler_version_info_t compilerVersion,
+                         const uint32_t supportedOpsetVersion,
+                         const bool useBaseModelSerializer,
+                         const size_t weightsSizeThreshold) {
+    if (!useBaseModelSerializer) {
+        // Non-constness required for adding & removing weights pointer attributes. The current instance is already a
+        // clone (or should be one), we are not modifying the original model.
+        const std::shared_ptr<ov::Model> nonConstantModel = std::const_pointer_cast<ov::Model>(model);
+        storeWeightsPointerAttribute(nonConstantModel, weightsSizeThreshold);
+
+        SerializedIR serializedIR =
+            VCLSerializerWithoutWeightsCopy(model, compilerVersion, supportedOpsetVersion).serialize();
+        removeWeightsPointerAttribute(nonConstantModel);
+        return serializedIR;
+    }
+
+    return VCLSerializerWithWeightsCopy(model, compilerVersion, supportedOpsetVersion).serialize();
+}
+
+std::string serializeIOInfo(const std::shared_ptr<const ov::Model>& model, const bool useIndices) {
     const ov::ParameterVector& parameters = model->get_parameters();
     const ov::ResultVector& results = model->get_results();
 
@@ -369,9 +513,9 @@ std::string serializeIOInfo(const std::shared_ptr<const ov::Model>& model, const bool useIndices)
            outputsPrecisionSS.str() + VALUES_SEPARATOR.data() + outputsLayoutSS.str();
 }
 
-std::string IRSerializer::serializeConfig(const Config& config,
-                                          ze_graph_compiler_version_info_t compilerVersion,
-                                          bool turboSupported) {
+std::string serializeConfig(const Config& config,
+                            ze_graph_compiler_version_info_t compilerVersion,
+                            bool turboSupported) {
     Logger logger("serializeConfig", Logger::global().level());
 
     std::string content = {};
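Taken together, the new entry point boils down to a single call (sketch; the model and compilerVersion variables are placeholders, and the 1 MiB threshold is arbitrary):

    // Constants of at least 1 MiB keep only pointer metadata; smaller ones are
    // still copied into the serialized IR.
    const auto serializedIR = intel_npu::driver_compiler_utils::serializeIR(
        model,
        compilerVersion,
        /*supportedOpsetVersion=*/11,
        /*useBaseModelSerializer=*/false,
        /*weightsSizeThreshold=*/1 << 20);
    // serializedIR.first holds the total byte size, serializedIR.second the
    // buffer that is handed to the driver's compiler interface.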
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/xml_serializer.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/xml_serializer.cpp
new file mode 100644
index 00000000000000..9a3abca741cbe0
--- /dev/null
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/xml_serializer.cpp
@@ -0,0 +1,45 @@
+//
+// Copyright (C) 2025 Intel Corporation.
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "xml_serializer.hpp"
+
+#include "intel_npu/weights_pointer_attribute.hpp"
+#include "openvino/op/util/op_types.hpp"
+
+namespace intel_npu {
+
+ov::util::ConstantWriter& XmlSerializer::get_constant_write_handler() {
+    if (m_use_weightless_writer) {
+        return *m_weightless_constant_writer;
+    } else {
+        return m_base_constant_writer;
+    }
+}
+
+bool XmlSerializer::append_node_attributes(ov::Node& node) {
+    // If the "WeightsPointerAttribute" is found, then we have the metadata required to avoid copying the weights
+    // corresponding to this node.
+    m_use_weightless_writer = node.get_rt_info().count(WeightsPointerAttribute::get_type_info_static()) != 0;
+    auto result = ov::util::XmlSerializer::append_node_attributes(node);
+    m_use_weightless_writer = false;
+    return result;
+}
+
+std::unique_ptr<ov::util::XmlSerializer> XmlSerializer::make_visitor(pugi::xml_node& data,
+                                                                     const std::string& node_type_name,
+                                                                     ov::util::ConstantWriter& constant_write_handler,
+                                                                     int64_t version,
+                                                                     bool,
+                                                                     bool,
+                                                                     ov::element::Type,
+                                                                     bool) const {
+    return std::make_unique<XmlSerializer>(data,
+                                           node_type_name,
+                                           constant_write_handler,
+                                           version,
+                                           m_weightless_constant_writer);
+}
+
+}  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
index 610cf717be4393..3585a257f7ddc1 100644
--- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -322,6 +322,7 @@ void Plugin::init_options() {
     REGISTER_OPTION(SEPARATE_WEIGHTS_VERSION);
     REGISTER_OPTION(WS_COMPILE_CALL_NUMBER);
     REGISTER_OPTION(USE_BASE_MODEL_SERIALIZER);
+    REGISTER_OPTION(SERIALIZATION_WEIGHTS_SIZE_THRESHOLD);
 
     if (_backend) {
         if (_backend->isCommandQueueExtSupported()) {
diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp
index 0432b69af2b363..fe0cbc43608fb5 100644
--- a/src/plugins/intel_npu/src/plugin/src/properties.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp
@@ -384,6 +384,8 @@ void Properties::registerPluginProperties() {
     TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::weightless_blob, WEIGHTLESS_BLOB);
     TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::separate_weights_version, SEPARATE_WEIGHTS_VERSION);
     TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::use_base_model_serializer, USE_BASE_MODEL_SERIALIZER);
+    TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::serialization_weights_size_threshold,
+                                 SERIALIZATION_WEIGHTS_SIZE_THRESHOLD);
 
     TRY_REGISTER_CUSTOMFUNC_PROPERTY(ov::intel_npu::stepping, STEPPING, [&](const Config& config) {
         if (!config.has<STEPPING>()) {
diff --git a/src/plugins/intel_npu/tests/functional/behavior/npu_driver_compiler_adapter/custom_stream.cpp b/src/plugins/intel_npu/tests/functional/behavior/npu_driver_compiler_adapter/custom_stream.cpp
index ec12aff645f38e..f646a6455d6c1d 100644
--- a/src/plugins/intel_npu/tests/functional/behavior/npu_driver_compiler_adapter/custom_stream.cpp
+++ b/src/plugins/intel_npu/tests/functional/behavior/npu_driver_compiler_adapter/custom_stream.cpp
@@ -4,46 +4,30 @@
 
 #include
 #include
 
-#include "shared_test_classes/base/ov_behavior_test_utils.hpp"
 #include "common/functions.hpp"
 #include "common/npu_test_env_cfg.hpp"
 #include "common_test_utils/node_builders/constant.hpp"
 #include "intel_npu/config/options.hpp"
-#include "ir_serializer.hpp"
 #include "openvino/opsets/opset11.hpp"
+#include "shared_test_classes/base/ov_behavior_test_utils.hpp"
+#include "vcl_serializer.hpp"
 
 using CompilationParams = std::tuple;
-using IRSerializer = intel_npu::driver_compiler_utils::IRSerializer;
+using VCLSerializerWithWeightsCopy = intel_npu::driver_compiler_utils::VCLSerializerWithWeightsCopy;
+using VCLSerializerWithoutWeightsCopy = intel_npu::driver_compiler_utils::VCLSerializerWithoutWeightsCopy;
 
 namespace ov::test::behavior {
 class DriverCompilerAdapterCustomStreamTestNPU : public ov::test::behavior::OVPluginTestBase,
                                                  public testing::WithParamInterface<CompilationParams> {
 public:
-    std::string generateRandomFileName() {
-        std::stringstream ss;
-        auto now = std::chrono::high_resolution_clock::now();
-        auto seed = now.time_since_epoch().count();
-        std::mt19937 mt_rand(static_cast(seed));
-        std::uniform_int_distribution dist(0, 15);
-
-        for (unsigned int i = 0; i < 16; ++i) {
-            int random_number = dist(mt_rand);
-            ss << std::hex << random_number;
-        }
-        return ss.str();
-    }
-
     void SetUp() override {
         std::tie(target_device, configuration) = this->GetParam();
         SKIP_IF_CURRENT_TEST_IS_DISABLED()
         OVPluginTestBase::SetUp();
-        std::string fileName = generateRandomFileName();
-        xmlFileName = fileName + ".xml";
-        binFileName = fileName + ".bin";
     }
 
     static std::string getTestCaseName(const testing::TestParamInfo<CompilationParams>& obj) {
@@ -68,43 +52,25 @@ class DriverCompilerAdapterCustomStreamTestNPU : public ov::test::behavior::OVPluginTestBase,
         if (!configuration.empty()) {
             utils::PluginCache::get().reset();
         }
-        if (std::remove(xmlFileName.c_str()) != 0 || std::remove(binFileName.c_str()) != 0) {
-            ADD_FAILURE() << "Failed to remove serialized files, xml: " << xmlFileName << " bin: " << binFileName;
-        }
         APIBaseTest::TearDown();
     }
 
 protected:
     ov::AnyMap configuration;
-    std::string xmlFileName;
-    std::string binFileName;
 };
 
-TEST_P(DriverCompilerAdapterCustomStreamTestNPU, TestLargeModel) {
+TEST_P(DriverCompilerAdapterCustomStreamTestNPU, TestLargeModelWeightsCopy) {
     auto model = createModelWithLargeSize();
-    IRSerializer irSerializer(model, 11);
-    size_t xmlSize = irSerializer.getXmlSize();
-    size_t weightsSize = irSerializer.getWeightsSize();
-
-    std::vector xml(xmlSize);
-    std::vector weights(weightsSize);
-    irSerializer.serializeModelToBuffer(xml.data(), weights.data());
-
-    {
-        std::ofstream xmlFile(xmlFileName, std::ios::binary);
-        if (xmlFile) {
-            xmlFile.write(reinterpret_cast(xml.data()), xmlSize);
-            xmlFile.close();
-        }
+    const ze_graph_compiler_version_info_t dummyCompilerVersion{0, 0};
+    VCLSerializerWithWeightsCopy serializer(model, dummyCompilerVersion, 11);
+    EXPECT_NO_THROW(serializer.serialize());
+}
 
-        std::ofstream binFile(binFileName, std::ios::binary);
-        if (binFile) {
-            binFile.write(reinterpret_cast(weights.data()), weightsSize);
-            binFile.close();
-        }
-    }
-    ov::Core core;
-    EXPECT_NO_THROW(model = core.read_model(xmlFileName));
+TEST_P(DriverCompilerAdapterCustomStreamTestNPU, TestLargeModelNoWeightsCopy) {
+    auto model = createModelWithLargeSize();
+    const ze_graph_compiler_version_info_t dummyCompilerVersion{0, 0};
+    VCLSerializerWithoutWeightsCopy serializer(model, dummyCompilerVersion, 11);
+    EXPECT_NO_THROW(serializer.serialize());
 }
 
 const std::vector configs = {
diff --git a/src/plugins/intel_npu/tests/functional/internal/compiler_adapter/zero_graph.hpp b/src/plugins/intel_npu/tests/functional/internal/compiler_adapter/zero_graph.hpp
index 0c133c32f41c15..9a24a2765dc4e7 100644
--- a/src/plugins/intel_npu/tests/functional/internal/compiler_adapter/zero_graph.hpp
+++ b/src/plugins/intel_npu/tests/functional/internal/compiler_adapter/zero_graph.hpp
@@ -16,8 +16,8 @@
 #include "intel_npu/utils/zero/zero_mem.hpp"
 #include "intel_npu/utils/zero/zero_mem_pool.hpp"
 #include "intel_npu/utils/zero/zero_utils.hpp"
-#include "ir_serializer.hpp"
 #include "openvino/runtime/intel_npu/properties.hpp"
+#include "vcl_serializer.hpp"
 #include "ze_graph_ext_wrappers.hpp"
 #include "zero_init_mock.hpp"
 
@@ -85,7 +85,8 @@ class ZeroGraphTest : public ::testing::TestWithParam
         const auto compilerProperties = zeGraphExt->getCompilerProperties();
         const auto maxOpsetVersion = compilerProperties.maxOVOpsetVersionSupported;
-        irSerializer = std::make_shared<IRSerializer>(IRSerializer(model, maxOpsetVersion));
+        vclSerializer =
+            std::make_shared<VCLSerializerWithWeightsCopy>(model, compilerProperties.compilerVersion, maxOpsetVersion);
     }
 
     void TearDown() override {
@@ -93,16 +94,13 @@ class ZeroGraphTest : public ::testing::TestWithParam
     }
 
     void getSerializedIR() {
-        const auto compilerProperties = zeGraphExt->getCompilerProperties();
-        const ze_graph_compiler_version_info_t& compilerVersion = compilerProperties.compilerVersion;
-        const auto maxOpsetVersion = compilerProperties.maxOVOpsetVersionSupported;
-        serializedIR = irSerializer->serializeIR(model, compilerVersion, maxOpsetVersion);
+        serializedIR = vclSerializer->serialize();
     }
 
     bool bypassUmdCache() {
         if (!configuration.empty()) {
             for (auto& configItem : configuration) {
-                if (configItem.first == ov::cache_dir.name()) {
+                if (configItem.first == ov::cache_dir.name()) {
                     const auto set_cache_dir = configItem.second;
                     if (!set_cache_dir.empty()) {
                         return true;
@@ -127,7 +125,7 @@ class ZeroGraphTest : public ::testing::TestWithParam
 
     std::shared_ptr<ov::Model> model;
-    std::shared_ptr<IRSerializer> irSerializer;
+    std::shared_ptr<VCLSerializerWithWeightsCopy> vclSerializer;
     std::string targetDevice;
     std::string blobPath;