Commit 44532be

Investigate refactoring opportunities for batch management in Plugin and Compiler - review
1 parent 6210112 commit 44532be

5 files changed: +66 −5 lines changed

src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp

Lines changed: 3 additions & 1 deletion
@@ -84,7 +84,9 @@ std::optional<size_t> determine_dynamic_batch_size(const IODescriptor& desc,
     }

     auto batchFromModel = ioShape[intel_npu::utils::BATCH_AXIS];
-    if (!batchFromModel.is_dynamic()) {
+    auto batchModelFromIR =
+        desc.shapeFromIRModel.has_value() && desc.shapeFromIRModel.value()[intel_npu::utils::BATCH_AXIS].is_dynamic();
+    if (!batchFromModel.is_dynamic() && !batchModelFromIR) {
         return std::nullopt;
     }
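
In isolation, the new condition treats the batch as dynamic when either the shape reported by the compiler or the shape preserved from the IR model has a dynamic batch dimension. A minimal standalone sketch of that predicate follows; the helper name batch_is_dynamic is invented for illustration and BATCH_AXIS is assumed to be 0 here.

#include <openvino/core/partial_shape.hpp>
#include <optional>

// Hypothetical helper, not part of the commit: mirrors the combined check added
// to determine_dynamic_batch_size().
bool batch_is_dynamic(const ov::PartialShape& shapeFromCompiler,
                      const std::optional<ov::PartialShape>& shapeFromIRModel) {
    constexpr std::ptrdiff_t BATCH_AXIS = 0;  // assumption for this sketch
    const bool compilerBatchIsDynamic = shapeFromCompiler[BATCH_AXIS].is_dynamic();
    const bool irBatchIsDynamic =
        shapeFromIRModel.has_value() && (*shapeFromIRModel)[BATCH_AXIS].is_dynamic();
    // The early return in the patched function now fires only when neither view
    // of the model reports a dynamic batch dimension.
    return compilerBatchIsDynamic || irBatchIsDynamic;
}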

src/plugins/intel_npu/src/compiler_adapter/include/graph.hpp

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ class Graph : public IGraph {

protected:
    bool release_blob(const Config& config);
+    std::optional<size_t> determine_batch_size();

    std::shared_ptr<ZeGraphExtWrappers> _zeGraphExt;

src/plugins/intel_npu/src/compiler_adapter/src/graph.cpp

Lines changed: 56 additions & 0 deletions
@@ -232,6 +232,10 @@ void Graph::initialize(const Config& config) {
     // releasing it here to avoid unnecessary memory usage.
     _blobIsReleased = release_blob(config);

+    if (!_batchSize.has_value()) {
+        _batchSize = determine_batch_size();
+    }
+
     if (_zeroInitStruct->getCommandQueueDdiTable().version() < ZE_MAKE_VERSION(1, 1) &&
         config.get<RUN_INFERENCES_SEQUENTIALLY>()) {
         auto numberOfCommandLists = _batchSize.has_value() ? *_batchSize : 1;
@@ -288,6 +292,58 @@ uint32_t Graph::get_last_submitted_id() const {
     return _lastSubmittedId;
 }

+std::optional<size_t> Graph::determine_batch_size() {
+    if (!_metadata.outputs.at(0).shapeFromIRModel.has_value()) {
+        _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
+        return std::nullopt;
+    }
+
+    const ov::PartialShape& firstShape = *_metadata.outputs.at(0).shapeFromIRModel;
+    if (firstShape.is_dynamic() || firstShape.rank().get_length() == 0) {
+        return std::nullopt;
+    }
+
+    const size_t candidateBatchSize = firstShape[utils::BATCH_AXIS].get_max_length();
+    if (candidateBatchSize == 0 || candidateBatchSize == utils::DEFAULT_BATCH_SIZE) {
+        _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
+        return std::nullopt;
+    }
+
+    auto checkDescriptorsUseCandidateBatchSize = [candidateBatchSize](const std::vector<IODescriptor>& descriptors) {
+        for (const IODescriptor& descriptor : descriptors) {
+            OPENVINO_ASSERT(descriptor.shapeFromIRModel.has_value(),
+                            "Missing value for the \"shapeFromIRModel\" attribute, I/O descriptor");
+
+            const ov::PartialShape& shapeFromCompiler = descriptor.shapeFromCompiler;
+            const ov::PartialShape& shapeFromIRModel = *descriptor.shapeFromIRModel;
+
+            if (shapeFromCompiler.is_dynamic() || shapeFromCompiler.rank().get_length() == 0 ||
+                *shapeFromCompiler.begin() != utils::DEFAULT_BATCH_SIZE) {
+                return false;
+            }
+
+            if (!descriptor.isStateInput && !descriptor.isStateOutput && !descriptor.isShapeTensor) {
+                if (shapeFromIRModel.is_dynamic() || shapeFromIRModel.rank().get_length() == 0 ||
+                    *shapeFromIRModel.begin() != candidateBatchSize) {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    };
+
+    if (!checkDescriptorsUseCandidateBatchSize(_metadata.inputs) ||
+        !checkDescriptorsUseCandidateBatchSize(_metadata.outputs)) {
+        _logger.debug("Batching on the plugin is not used, batching is handled by the compiler");
+        return std::nullopt;
+    }
+
+    _logger.debug("Batching is handled by the plugin");
+
+    return candidateBatchSize;
+}
+
 const std::optional<std::size_t> Graph::get_batch_size() const {
     return _batchSize;
 }
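
For orientation, this is the shape pattern the new Graph::determine_batch_size() accepts, shown with invented example values; none of the concrete shapes below come from the commit.

// Invented example only: the original IR model carries the real batch, while the
// compiler was handed the debatched model.
const ov::PartialShape shapeFromIRModel{4, 3, 224, 224};   // batch dimension = 4
const ov::PartialShape shapeFromCompiler{1, 3, 224, 224};  // DEFAULT_BATCH_SIZE after debatching

// determine_batch_size() takes the candidate from the IR shape's batch axis ...
const size_t candidate = shapeFromIRModel[utils::BATCH_AXIS].get_max_length();  // == 4
// ... and returns it only if every non-state, non-shape-tensor descriptor shows the same
// pattern; any dynamic shape, zero rank, or candidate equal to DEFAULT_BATCH_SIZE yields
// std::nullopt, leaving batching to the compiler. In initialize(), a returned value then
// determines how many command lists are created when inferences run sequentially.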

src/plugins/intel_npu/src/plugin/src/plugin.cpp

Lines changed: 1 addition & 1 deletion
@@ -708,7 +708,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
     }

     std::optional<int64_t> batch = std::nullopt;
-    if (originalBatch.has_value()) {
+    if (originalBatch.has_value() && successfullyDebatched) {
         batch = originalBatch.value().is_static() ? originalBatch.value().get_length() : -1;
         if (batch > 0) {
             // Initial batch setup for static cases
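
Condensed, the guarded flow after this change looks as follows; only the names that appear in the diff are real, and the surrounding function is omitted.

std::optional<int64_t> batch = std::nullopt;
if (originalBatch.has_value() && successfullyDebatched) {
    // A positive value is the static batch size; -1 marks a dynamic original batch.
    batch = originalBatch.value().is_static() ? originalBatch.value().get_length() : -1;
}
// If plugin-side debatching failed, `batch` stays std::nullopt and the initial batch
// setup that follows in compile_model() is skipped, so batching remains with the compiler.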

src/plugins/intel_npu/src/plugin/src/transformations.cpp

Lines changed: 5 additions & 3 deletions
@@ -235,13 +235,15 @@ std::tuple<std::shared_ptr<ov::Model>, bool> handlePluginBatching(
             logger.info("The model has been debatched successfully");
             successfullyDebatched = true;
         }
+        if (batchModeIsAvailable) {
+            // If we have successfully debatched the model on the PLUGIN side, we should
+            // avoid repeating the same in the compiler by resetting the batch mode
+            updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
+        }
     } catch (const std::exception& ex) {
         logger.info("Couldn't validate and reshape the model. Batching will be handled by compiler. Error: %s",
                     ex.what());
     }
-    if (batchModeIsAvailable) {
-        updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
-    }
     return {reshapedModel, successfullyDebatched};
 }
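
The behavioural difference is easiest to see as a condensed control-flow sketch of handlePluginBatching(); this is a hypothetical, heavily abbreviated rendering, not the real function body.

try {
    // ... validate and reshape (debatch) the model; may throw ...
    successfullyDebatched = true;
    if (batchModeIsAvailable) {
        // New location: the compiler is told to skip its own batch handling only
        // when the plugin-side reshape completed without throwing.
        updateBatchMode(ov::intel_npu::BatchMode::COMPILER);
    }
} catch (const std::exception& ex) {
    // Previously the updateBatchMode() reset also ran after this catch block; now a
    // failed reshape leaves the requested batch mode untouched.
}
return {reshapedModel, successfullyDebatched};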

0 commit comments
