Investigate refactoring opportunities for batch management in Plugin and Compiler - review - WIP

DariaMityagina · DariaMityagina · commit 6cb439f97d88 · 2025-08-25T15:15:33.000Z
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp
@@ -47,7 +47,7 @@ class ICompilerAdapter {
         const Config& config,
         std::optional<std::vector<ov::Tensor>> initBlobs = std::nullopt,
         const std::optional<std::shared_ptr<const ov::Model>>& model = std::nullopt,
-        std::optional<ov::Dimension> batchSize = std::nullopt) const = 0;
+        std::optional<int64_t> batchSize = std::nullopt) const = 0;
 
     virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const = 0;
     virtual uint32_t get_version() const = 0;
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp
@@ -30,7 +30,7 @@ class DriverCompilerAdapter final : public ICompilerAdapter {
         const Config& config,
         std::optional<std::vector<ov::Tensor>> initBlobs = std::nullopt,
         const std::optional<std::shared_ptr<const ov::Model>>& model = std::nullopt,
-        std::optional<ov::Dimension> batchSize = std::nullopt) const override;
+        std::optional<int64_t> batchSize = std::nullopt) const override;
 
     ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;
 
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp
@@ -28,7 +28,7 @@ class PluginCompilerAdapter final : public ICompilerAdapter {
         const Config& config,
         std::optional<std::vector<ov::Tensor>> initBlobs = std::nullopt,
         const std::optional<std::shared_ptr<const ov::Model>>& model = std::nullopt,
-        std::optional<ov::Dimension> batchSize = std::nullopt) const override;
+        std::optional<int64_t> batchSize = std::nullopt) const override;
 
     ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;
 
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp
@@ -45,7 +45,7 @@ class ZeGraphExtWrappers {
 
     GraphDescriptor getGraphDescriptor(void* data, size_t size) const;
 
-    NetworkMetadata getNetworkMeta(GraphDescriptor& graphDescriptor, std::optional<ov::Dimension> batchSize = std::nullopt) const;
+    NetworkMetadata getNetworkMeta(GraphDescriptor& graphDescriptor, std::optional<int64_t> batchSize = std::nullopt) const;
 
     void destroyGraph(GraphDescriptor& graphDescriptor);
 
@@ -71,7 +71,7 @@ class ZeGraphExtWrappers {
                      uint32_t index,
                      std::vector<IODescriptor>& inputs,
                      std::vector<IODescriptor>& outputs,
-                     std::optional<ov::Dimension> batchSize) const;
+                     std::optional<int64_t> batchSize) const;
 
     void initializeGraphThroughCommandList(ze_graph_handle_t graphHandle, uint32_t commandQueueGroupOrdinal) const;
 
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
@@ -362,7 +362,7 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::parse(
     const Config& config,
     std::optional<std::vector<ov::Tensor>> initBlobs,
     const std::optional<std::shared_ptr<const ov::Model>>& model,
-    std::optional<ov::Dimension> batchSize) const {
+    std::optional<int64_t> batchSize) const {
     OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "parse");
 
     _logger.debug("parse start");
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
@@ -254,7 +254,7 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::parse(
     const Config& config,
     std::optional<std::vector<ov::Tensor>> initBlobs,
     const std::optional<std::shared_ptr<const ov::Model>>& model,
-    std::optional<ov::Dimension> batchSize) const {
+    std::optional<int64_t> batchSize) const {
     OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "parse");
 
     _logger.debug("parse start");
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp
@@ -435,7 +435,7 @@ GraphDescriptor ZeGraphExtWrappers::getGraphDescriptor(void* blobData, size_t bl
  */
 static IODescriptor getIODescriptor(const ze_graph_argument_properties_3_t& arg,
                                     const std::optional<ze_graph_argument_metadata_t>& metadata,
-                                    std::optional<ov::Dimension> batchSize) {
+                                    std::optional<int64_t> batchSize) {
     auto logger = Logger::global().clone("getIODescriptor");
     ov::element::Type_t precision = zeroUtils::toOVElementType(arg.devicePrecision);
     ov::Shape shapeFromCompiler;
@@ -453,7 +453,7 @@ static IODescriptor getIODescriptor(const ze_graph_argument_properties_3_t& arg,
         shapeFromIRModel.reserve(metadata->shape_size);
         for (uint32_t id = 0; id < metadata->shape_size; id++) {
             if (batchSize.has_value() && id == utils::BATCH_AXIS) {
-                shapeFromIRModel.push_back(ov::Dimension(1, batchSize.value().get_max_length()));
+                shapeFromIRModel.push_back(ov::Dimension(1, batchSize.value()));
             } else if (metadata->shape[id] != dynamicDim) {
                 shapeFromIRModel.push_back(metadata->shape[id]);
             } else {
@@ -520,7 +520,7 @@ void ZeGraphExtWrappers::getMetadata(ze_graph_handle_t graphHandle,
                                      uint32_t index,
                                      std::vector<IODescriptor>& inputs,
                                      std::vector<IODescriptor>& outputs,
-                                     std::optional<ov::Dimension> batchSize) const {
+                                     std::optional<int64_t> batchSize) const {
     if (NotSupportArgumentMetadata(_graphExtVersion)) {
         ze_graph_argument_properties_3_t arg = {};
         _logger.debug("getMetadata - perform pfnGetArgumentProperties3");
@@ -573,7 +573,7 @@ void ZeGraphExtWrappers::getMetadata(ze_graph_handle_t graphHandle,
     }
 }
 
-NetworkMetadata ZeGraphExtWrappers::getNetworkMeta(GraphDescriptor& graphDescriptor, std::optional<ov::Dimension> batchSize) const {
+NetworkMetadata ZeGraphExtWrappers::getNetworkMeta(GraphDescriptor& graphDescriptor, std::optional<int64_t> batchSize) const {
     ze_graph_properties_t graphProperties = {};
     graphProperties.stype = ZE_STRUCTURE_TYPE_GRAPH_PROPERTIES;
 
diff --git a/src/plugins/intel_npu/src/plugin/include/metadata.hpp b/src/plugins/intel_npu/src/plugin/include/metadata.hpp
@@ -46,7 +46,7 @@ class MetadataBase {
     /**
      * @returns Batch size. Populated in case of plugin batching.
      */
-    virtual std::optional<ov::Dimension> get_batch_size() const = 0;
+    virtual std::optional<int64_t> get_batch_size() const = 0;
 
     virtual ~MetadataBase() = default;
 
@@ -216,7 +216,7 @@ class Metadata<METADATA_VERSION_2_0> : public MetadataBase {
 
     std::optional<std::vector<uint64_t>> get_init_sizes() const override;
 
-    std::optional<ov::Dimension> get_batch_size() const override;
+    std::optional<int64_t> get_batch_size() const override;
 
     size_t get_metadata_size() const override;
 
@@ -250,7 +250,7 @@ class Metadata<METADATA_VERSION_2_1> : public Metadata<METADATA_VERSION_2_0> {
 
     std::optional<std::vector<uint64_t>> get_init_sizes() const override;
 
-    std::optional<ov::Dimension> get_batch_size() const override;
+    std::optional<int64_t> get_batch_size() const override;
 
     size_t get_metadata_size() const override;
 
@@ -268,7 +268,7 @@ class Metadata<METADATA_VERSION_2_2> : public Metadata<METADATA_VERSION_2_1> {
     Metadata(uint64_t blobSize,
              std::optional<OpenvinoVersion> ovVersion = std::nullopt,
              const std::optional<std::vector<uint64_t>> initSizes = std::nullopt,
-             const std::optional<ov::Dimension> batchSize = std::nullopt);
+             const std::optional<int64_t> batchSize = std::nullopt);
 
     /**
      * @details The number of init schedules, along with the size of each init binary object are read in addition to the
@@ -284,10 +284,12 @@ class Metadata<METADATA_VERSION_2_2> : public Metadata<METADATA_VERSION_2_1> {
      */
     void write(std::ostream& stream) override;
 
-    std::optional<ov::Dimension> get_batch_size() const override;
+    std::optional<int64_t> get_batch_size() const override;
+
+    size_t get_metadata_size() const override;
 
 private:
-    std::optional<ov::Dimension> _batchSize;
+    std::optional<int64_t> _batchSize;
 };
 
 /**
diff --git a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp
@@ -91,14 +91,15 @@ void CompiledModel::export_model(std::ostream& stream) const {
 
     auto [blobSizesBeforeVersioning, initBlobSizes] = _graph->export_blob(stream);
 
-    std::optional<ov::Dimension> originalBatchSize = std::nullopt;
+    std::optional<int64_t> originalBatchSize = std::nullopt;
     auto metadata = _graph->get_metadata();
     auto inputMeta = metadata.inputs;
     for (auto in : inputMeta) {
         // Plugin batching applied, saving original batch value
-        if (in.shapeFromIRModel.has_value() && in.shapeFromIRModel.value()[intel_npu::utils::BATCH_AXIS].is_dynamic() &&
+        if (in.shapeFromIRModel.has_value() &&
             in.shapeFromCompiler[intel_npu::utils::BATCH_AXIS] == 1) {
-            originalBatchSize = std::optional(in.shapeFromIRModel.value()[intel_npu::utils::BATCH_AXIS]);
+            originalBatchSize = std::optional(in.shapeFromIRModel.value()[intel_npu::utils::BATCH_AXIS].get_max_length());
+            break;
         }
     }
 
diff --git a/src/plugins/intel_npu/src/plugin/src/metadata.cpp b/src/plugins/intel_npu/src/plugin/src/metadata.cpp
@@ -74,7 +74,7 @@ Metadata<METADATA_VERSION_2_1>::Metadata(uint64_t blobSize,
 Metadata<METADATA_VERSION_2_2>::Metadata(uint64_t blobSize,
                                          std::optional<OpenvinoVersion> ovVersion,
                                          const std::optional<std::vector<uint64_t>> initSizes,
-                                         const std::optional<ov::Dimension> batchSize)
+                                         const std::optional<int64_t> batchSize)
     : Metadata<METADATA_VERSION_2_1>{blobSize, ovVersion, initSizes},
       _batchSize{batchSize} {
     _version = METADATA_VERSION_2_2;
@@ -126,19 +126,31 @@ void Metadata<METADATA_VERSION_2_1>::read(const ov::Tensor& tensor) {
 void Metadata<METADATA_VERSION_2_2>::read(std::istream& stream) {
     Metadata<METADATA_VERSION_2_1>::read(stream);
 
-    stream.read(reinterpret_cast<char*>(&_batchSize), sizeof(_batchSize));
+    // Zero-initialized: if the stream read fails the value stays well-defined (0 is treated as "no batch size").
+    int64_t batchSize = 0;
+    stream.read(reinterpret_cast<char*>(&batchSize), sizeof(batchSize));
+    if (batchSize) {
+        _batchSize = std::optional(batchSize);
+    }
 }
 
 void Metadata<METADATA_VERSION_2_2>::read(const ov::Tensor& tensor) {
     Metadata<METADATA_VERSION_2_1>::read(tensor);
 
-    // Calculate the offset where the batch size is stored in the tensor
-    auto offset = sizeof(decltype(std::declval<OpenvinoVersion>().get_major())) +
-                  sizeof(decltype(std::declval<OpenvinoVersion>().get_minor())) +
-                  sizeof(decltype(std::declval<OpenvinoVersion>().get_patch())) +
-                  sizeof(uint64_t) * (get_init_sizes() ? get_init_sizes()->size() : 0);
+    auto roiTensor = ov::Tensor(tensor,
+                                ov::Coordinate{sizeof(decltype(std::declval<OpenvinoVersion>().get_major())) +
+                                               sizeof(decltype(std::declval<OpenvinoVersion>().get_minor())) +
+                                               sizeof(decltype(std::declval<OpenvinoVersion>().get_patch())) +
+                                               sizeof(uint64_t) +  // presumably the init-count field - TODO confirm
+                                               sizeof(uint64_t) * (get_init_sizes() ? get_init_sizes()->size() : 0)},
+                                ov::Coordinate{tensor.get_byte_size()});
+    // NOTE(review): the load below assumes the ROI holds >= sizeof(int64_t) suitably aligned bytes - confirm.
+    int64_t batchSize;
+    batchSize = *reinterpret_cast<const decltype(batchSize)*>(roiTensor.data<const char>());
 
-    _batchSize = *reinterpret_cast<const decltype(_batchSize)*>(tensor.data<const char>() + offset);
+    if (batchSize) {  // a zero value leaves _batchSize untouched
+        _batchSize = std::optional(batchSize);
+    }
 }
 
 void MetadataBase::append_padding_blob_size_and_magic(std::ostream& stream) {
@@ -169,8 +181,6 @@ void Metadata<METADATA_VERSION_2_1>::write(std::ostream& stream) {
             stream.write(reinterpret_cast<const char*>(&initSize), sizeof(initSize));
         }
     }
-
-    append_padding_blob_size_and_magic(stream);
 }
 
 void Metadata<METADATA_VERSION_2_2>::write(std::ostream& stream) {
@@ -195,7 +205,7 @@ std::unique_ptr<MetadataBase> create_metadata(uint32_t version, uint64_t blobSiz
     case METADATA_VERSION_2_1:
         return std::make_unique<Metadata<METADATA_VERSION_2_1>>(blobSize, std::nullopt);
     case METADATA_VERSION_2_2:
-        return std::make_unique<Metadata<METADATA_VERSION_2_1>>(blobSize, std::nullopt);
+        return std::make_unique<Metadata<METADATA_VERSION_2_2>>(blobSize, std::nullopt);
     default:
         OPENVINO_THROW("Metadata version is not supported!");
     }
@@ -338,15 +348,15 @@ std::optional<std::vector<uint64_t>> Metadata<METADATA_VERSION_2_1>::get_init_si
     return _initSizes;
 }
 
-std::optional<ov::Dimension> Metadata<METADATA_VERSION_2_0>::get_batch_size() const {
+std::optional<int64_t> Metadata<METADATA_VERSION_2_0>::get_batch_size() const {  // 2.0: always empty
     return std::nullopt;
 }
 
-std::optional<ov::Dimension> Metadata<METADATA_VERSION_2_1>::get_batch_size() const {
+std::optional<int64_t> Metadata<METADATA_VERSION_2_1>::get_batch_size() const {  // 2.1: always empty
     return std::nullopt;
 }
 
-std::optional<ov::Dimension> Metadata<METADATA_VERSION_2_2>::get_batch_size() const {
+std::optional<int64_t> Metadata<METADATA_VERSION_2_2>::get_batch_size() const {  // 2.2: stored value, may be empty
     return _batchSize;
 }
 
@@ -366,4 +376,10 @@ size_t Metadata<METADATA_VERSION_2_1>::get_metadata_size() const {
     return metadataSize;
 }
 
+size_t Metadata<METADATA_VERSION_2_2>::get_metadata_size() const {
+    // read() consumes the batch size as a raw int64_t, so count that width rather than sizeof(_batchSize), which
+    // measures the whole std::optional wrapper. NOTE(review): confirm write() emits the same int64_t width.
+    return Metadata<METADATA_VERSION_2_1>::get_metadata_size() + sizeof(int64_t);
+}
+
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -589,7 +589,7 @@ bool validateModelBatch(const std::shared_ptr<const ov::Model>& model, Logger lo
         return false;
     }
 
-    auto node_info_printer = [&logger](const auto& ov_node, std::string_view nodeType) {
+    auto node_info_printer = [&logger](const auto& ov_node, std::string nodeType) {
         logger.info("%s: %s has shape value: %s",
                     nodeType,
                     ov_node.get_any_name(),
@@ -650,9 +650,10 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
     auto device = _backend == nullptr ? nullptr : _backend->getDevice(localConfig.get<DEVICE_ID>());
     localConfig.update({{ov::intel_npu::platform.name(), platform}});
 
-    auto updateBatchMode = [&localConfig](ov::intel_npu::BatchMode mode) {
+    auto updateBatchMode = [&](ov::intel_npu::BatchMode mode) {
         std::stringstream strStream;
         strStream << mode;
+        _logger.info("Setting batching mode to %s.", strStream.str());
         localConfig.update({{ov::intel_npu::batch_mode.name(), strStream.str()}});
     };
 
@@ -676,7 +677,6 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
             } catch (const std::exception& ex) {
                 _logger.info("Couldn't reshape the model. Batching will be handed by compiler.", ex.what());
             }
-            _logger.info("Setting batching mode to BatchMode::COMPILER.");
             updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
         } else {
             _logger.info("Unable to manage batching on the plugin side, so the compiler will take care of it.");
@@ -754,8 +754,8 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
     if (modelDeBached) {
         auto metadata = graph->get_metadata();
         for (auto& in : metadata.inputs) {
-            if (in.shapeFromIRModel.has_value() && originalBatch.get_max_length() != 1) {
-                in.shapeFromIRModel.value()[intel_npu::utils::BATCH_AXIS] = originalBatch;
+            if (in.shapeFromIRModel.has_value() && in.shapeFromCompiler[intel_npu::utils::BATCH_AXIS] == 1) {
+                in.shapeFromIRModel.value()[intel_npu::utils::BATCH_AXIS] = originalBatch.get_max_length();
             }
         }
         graph->set_metadata(metadata);
@@ -951,7 +951,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::parse(const ov::Tensor& tensorBig,
 
     uint64_t mainSize = tensorBig.get_byte_size();
     std::optional<std::vector<uint64_t>> initSizes;
-    std::optional<ov::Dimension> batchSize;
+    std::optional<int64_t> batchSize;
 
     if (metadata) {
         size_t accumulator = 0;