feat: preprocessing in python for NPU (#414)

mgumowsk · web-flow · commit da6f46eb3b13 · 2025-10-14T09:09:26.000Z
* fix

* reshape inputs

* handle npu anomaly resize
diff --git a/src/model_api/adapters/onnx_adapter.py b/src/model_api/adapters/onnx_adapter.py
@@ -1,17 +1,16 @@
 #
-# Copyright (C) 2020-2024 Intel Corporation
+# Copyright (C) 2020-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 
 from __future__ import annotations
 
 import sys
-from functools import partial, reduce
 from typing import Any, Callable
 
 import numpy as np
 
-from .utils import INTERPOLATION_TYPES, RESIZE_TYPES, InputTransform
+from .utils import setup_python_preprocessing_pipeline
 
 try:
     import onnx
@@ -145,30 +144,17 @@ def embed_preprocessing(
         """
         Adds external preprocessing steps done before ONNX model execution.
         """
-        preproc_funcs = [np.squeeze]
-        if resize_mode != "crop":
-            if resize_mode == "fit_to_window_letterbox":
-                resize_fn = partial(
-                    RESIZE_TYPES[resize_mode],
-                    size=target_shape,
-                    interpolation=INTERPOLATION_TYPES[interpolation_mode],
-                    pad_value=pad_value,
-                )
-            else:
-                resize_fn = partial(
-                    RESIZE_TYPES[resize_mode],
-                    size=target_shape,
-                    interpolation=INTERPOLATION_TYPES[interpolation_mode],
-                )
-        else:
-            resize_fn = partial(RESIZE_TYPES[resize_mode], size=target_shape)
-        preproc_funcs.append(resize_fn)
-        input_transform = InputTransform(brg2rgb, mean, scale)
-        preproc_funcs.extend((input_transform.__call__, partial(change_layout, layout=layout)))
-
-        self.preprocessor = reduce(
-            lambda f, g: lambda x: f(g(x)),
-            reversed(preproc_funcs),
+        self.preprocessor = setup_python_preprocessing_pipeline(
+            layout=layout,
+            resize_mode=resize_mode,
+            interpolation_mode=interpolation_mode,
+            target_shape=target_shape,
+            pad_value=pad_value,
+            dtype=dtype,
+            brg2rgb=brg2rgb,
+            mean=mean,
+            scale=scale,
+            input_idx=input_idx,
         )
 
     def get_model(self):
@@ -227,18 +213,3 @@ def get_shape_from_onnx(onnx_shape):
         if isinstance(item, str):
             onnx_shape[i] = -1
     return tuple(onnx_shape)
-
-
-def change_layout(image, layout):
-    """Changes the input image layout to fit the layout of the model input layer.
-
-    Args:
-        inputs (ndarray): a single image as 3D array in HWC layout
-
-    Returns:
-        - the image with layout aligned with the model layout
-    """
-    if "CHW" in layout:
-        image = image.transpose((2, 0, 1))  # HWC->CHW
-        image = image.reshape((1, *image.shape))
-    return image
diff --git a/src/model_api/adapters/openvino_adapter.py b/src/model_api/adapters/openvino_adapter.py
@@ -41,6 +41,7 @@
     resize_image,
     resize_image_letterbox,
     resize_image_with_aspect,
+    setup_python_preprocessing_pipeline,
 )
 
 
@@ -143,6 +144,8 @@ def __init__(
         )
         self.is_onnx_file = False
         self.onnx_metadata = {}
+        self.preprocessor = lambda arg: arg
+        self.use_python_preprocessing = False
 
         if isinstance(self.model_path, (str, Path)):
             if Path(self.model_path).suffix == ".onnx" and weights_path:
@@ -175,7 +178,52 @@ def __init__(
         msg = "Model must be bytes or a file"
         raise RuntimeError(msg)
 
+    def reshape_dynamic_inputs(self) -> None:
+        """For NPU devices, set static shape if the model has dynamic shapes.
+
+        Tuple dims (presumably (min, max) bounds) become their midpoint; static dims are kept. A dim of -1
+        becomes 1 on axis 0 or in non-4D inputs, else 224, or 3 on the last axis of a 4D non-NCHW input.
+        """
+        for model_input in self.model.inputs:
+            if model_input.partial_shape.is_dynamic:
+                input_name = model_input.get_any_name()
+                shape = get_input_shape(model_input)
+                static_shape = []
+
+                # Detect likely layout for 4D shapes
+                is_nchw = False
+                if len(shape) == 4 and not isinstance(shape[1], tuple) and shape[1] != -1 and shape[1] <= 4:
+                    is_nchw = True
+
+                for i, dim in enumerate(shape):
+                    if isinstance(dim, tuple):
+                        static_shape.append((dim[0] + dim[1]) // 2)
+                    elif dim == -1:
+                        if i == 0:
+                            static_shape.append(1)
+                        elif len(shape) == 4:
+                            # is_nchw implies shape[1] is already known, so a dynamic channel
+                            # axis can only be the last one of a channels-last input.
+                            if not is_nchw and i == 3:
+                                static_shape.append(3)
+                            else:
+                                static_shape.append(224)
+                        else:
+                            static_shape.append(1)
+                    else:
+                        static_shape.append(dim)
+
+                log.info(
+                    f"NPU: Reshaping input '{input_name}' from dynamic {shape} to static {static_shape}",
+                )
+                self.reshape_model({input_name: static_shape})
+
     def load_model(self) -> None:
+        """Loads the model to the device specified in the constructor"""
+        devices = parse_devices(self.device)
+        if any("NPU" in dev.upper() for dev in devices) and self.model.is_dynamic():
+            self.reshape_dynamic_inputs()
+
         self.compiled_model = self.core.compile_model(
             self.model,
             self.device,
@@ -280,11 +328,17 @@ def copy_raw_result(self, request):
         return {key: request.get_tensor(key).data.copy() for key in self.get_output_layers()}
 
     def infer_sync(self, dict_data: dict[str, ndarray]) -> dict[str, ndarray]:
+        if self.use_python_preprocessing:
+            # Rebind to a new dict so the caller's inputs are not overwritten with preprocessed data
+            dict_data = {key: self.preprocessor(value) for key, value in dict_data.items()}
         self.infer_request = self.async_queue[self.async_queue.get_idle_request_id()]
         self.infer_request.infer(dict_data)
         return self.get_raw_result(self.infer_request)
 
     def infer_async(self, dict_data, callback_data) -> None:
+        if self.use_python_preprocessing:
+            # Rebind to a new dict so the caller's inputs are not overwritten with preprocessed data
+            dict_data = {key: self.preprocessor(value) for key, value in dict_data.items()}
         self.async_queue.start_async(dict_data, callback_data)
 
     def set_callback(self, callback_fn: Callable):
@@ -347,8 +401,32 @@ def embed_preprocessing(
         input_idx: int = 0,
     ) -> None:
         """
-        Embeds OpenVINO PrePostProcessor module into the model.
+        Embeds preprocessing into the model, or sets up Python preprocessing for NPU devices.
         """
+        # Check if we should use Python preprocessing for NPU devices
+        devices = parse_devices(self.device)
+        if any("NPU" in dev.upper() for dev in devices):
+            self.preprocessor = setup_python_preprocessing_pipeline(
+                layout=layout,
+                resize_mode=resize_mode,
+                interpolation_mode=interpolation_mode,
+                target_shape=target_shape,
+                pad_value=pad_value,
+                dtype=dtype,
+                brg2rgb=brg2rgb,
+                mean=mean,
+                scale=scale,
+                input_idx=input_idx,
+            )
+            self.use_python_preprocessing = True
+            input_name = self.model.inputs[input_idx].get_any_name()
+            if layout == "NCHW":
+                static_shape = [1, 3, target_shape[1], target_shape[0]]
+            else:
+                static_shape = [1, target_shape[1], target_shape[0], 3]
+            self.reshape_model({input_name: static_shape})
+            return
+
         ppp = PrePostProcessor(self.model)
 
         # Change the input type to the 8-bit image
diff --git a/src/model_api/adapters/utils.py b/src/model_api/adapters/utils.py
@@ -517,6 +517,81 @@ def crop_resize_ocv(image: np.ndarray, size: tuple[int, int]) -> np.ndarray:
     return cv2.resize(cropped_frame, size)
 
 
+def setup_python_preprocessing_pipeline(
+    layout: str,
+    resize_mode: str,
+    interpolation_mode: str,
+    target_shape: tuple[int, ...],
+    pad_value: int,
+    dtype: type = int,
+    brg2rgb: bool = False,
+    mean: list[Any] | None = None,
+    scale: list[Any] | None = None,
+    input_idx: int = 0,
+) -> Callable:
+    """
+    Sets up a Python preprocessing pipeline for model adapters.
+
+    Args:
+        layout: Target layout for the input (e.g., "NCHW", "NHWC")
+        resize_mode: Type of resizing ("crop", "standard", "fit_to_window", "fit_to_window_letterbox")
+        interpolation_mode: Interpolation method ("LINEAR", "CUBIC", "NEAREST")
+        target_shape: Target shape for resizing
+        pad_value: Padding value for letterbox resizing
+        dtype: Data type for preprocessing (currently not used by this function)
+        brg2rgb: Whether to convert BGR to RGB
+        mean: Mean values for normalization
+        scale: Scale values for normalization
+        input_idx: Input index (unused but kept for compatibility)
+
+    Returns:
+        Callable: A preprocessing function that can be applied to input data
+    """
+    from functools import partial, reduce
+
+    preproc_funcs = [np.squeeze]
+    if resize_mode != "crop":
+        if resize_mode == "fit_to_window_letterbox":
+            resize_fn = partial(
+                RESIZE_TYPES[resize_mode],
+                size=target_shape,
+                interpolation=INTERPOLATION_TYPES[interpolation_mode],
+                pad_value=pad_value,
+            )
+        else:
+            resize_fn = partial(
+                RESIZE_TYPES[resize_mode],
+                size=target_shape,
+                interpolation=INTERPOLATION_TYPES[interpolation_mode],
+            )
+    else:
+        resize_fn = partial(RESIZE_TYPES[resize_mode], size=target_shape)
+    preproc_funcs.append(resize_fn)
+    input_transform = InputTransform(brg2rgb, mean, scale)
+    preproc_funcs.extend((input_transform.__call__, partial(change_layout, layout=layout)))
+    # Compose so the steps run in list order: squeeze -> resize -> input transform -> layout change
+    return reduce(
+        lambda f, g: lambda x: f(g(x)),
+        reversed(preproc_funcs),
+    )
+
+
+def change_layout(image: np.ndarray, layout: str) -> np.ndarray:
+    """Changes the input image layout to fit the layout of the model input layer.
+
+    Args:
+        image (ndarray): a single image as 3D array in HWC layout
+        layout (str): target layout; a conversion happens only when it contains "CHW"
+
+    Returns:
+        ndarray: a 1xCxHxW array when "CHW" is in layout, otherwise the input image unchanged
+    """
+    if "CHW" in layout:
+        image = image.transpose((2, 0, 1))  # HWC->CHW
+        image = image.reshape((1, *image.shape))  # prepend a batch axis of size 1
+    return image
+
+
 RESIZE_TYPES: dict[str, Callable] = {
     "crop": crop_resize_ocv,
     "standard": resize_image_ocv,
diff --git a/src/model_api/models/anomaly.py b/src/model_api/models/anomaly.py
@@ -80,6 +80,14 @@ def preprocess(self, inputs: np.ndarray) -> list[dict]:
         """
         original_shape = inputs.shape
 
+        if (
+            self._is_dynamic
+            and getattr(self.inference_adapter, "device", "") == "NPU"
+            and hasattr(self.inference_adapter, "compiled_model")
+        ):
+            _, self.c, self.h, self.w = self.inference_adapter.compiled_model.inputs[0].get_shape()
+            self._is_dynamic = False
+
         if self._is_dynamic:
             h, w, c = inputs.shape
             resized_shape = (w, h, c)
@@ -98,11 +106,13 @@ def preprocess(self, inputs: np.ndarray) -> list[dict]:
             if self.embedded_processing:
                 processed_image = inputs[None]
             else:
+                # Resize image to expected model input dimensions
+                resized_image = self.resize(inputs, (self.w, self.h))
                 # Convert to float32 and normalize for anomalib
-                if inputs.dtype == np.uint8:
-                    processed_image = inputs.astype(np.float32) / 255.0
+                if resized_image.dtype == np.uint8:
+                    processed_image = resized_image.astype(np.float32) / 255.0
                 else:
-                    processed_image = inputs.astype(np.float32)
+                    processed_image = resized_image.astype(np.float32)
                 processed_image = self._change_layout(processed_image)
 
         return [
diff --git a/src/model_api/tilers/detection.py b/src/model_api/tilers/detection.py
@@ -112,7 +112,7 @@ def _merge_results(self, results: list[dict], shape: tuple[int, int, int]) -> De
 
         merged_vector = np.mean(feature_vectors, axis=0) if feature_vectors else np.ndarray(0)
         saliency_map = self._merge_saliency_maps(saliency_maps, shape, tiles_coords) if saliency_maps else np.ndarray(0)
-        label_names = [self.model.labels[int(label_idx)] for label_idx in detections_array[:, 0]]
+        label_names = [self.model.get_label_name(int(label_idx)) for label_idx in detections_array[:, 0]]
 
         return DetectionResult(
             bboxes=detections_array[:, 2:].astype(np.int32),
diff --git a/src/model_api/tilers/instance_segmentation.py b/src/model_api/tilers/instance_segmentation.py
@@ -123,7 +123,7 @@ def _merge_results(self, results, shape) -> InstanceSegmentationResult:
         labels = labels.astype(np.int32)
         resized_masks, label_names = [], []
         for mask, box, label_idx in zip(masks, bboxes, labels):
-            label_names.append(self.model.labels[int(label_idx.squeeze())])
+            label_names.append(self.model.get_label_name(int(label_idx.squeeze())))
             resized_masks.append(_segm_postprocess(box, mask, *shape[:-1]))
 
         resized_masks = np.stack(resized_masks) if resized_masks else masks