
Commit 4045ba8

Disable prefix caching when serving a VLM model (InternLM#3990)
* Warn that VLM deployment does not support prefix caching
* Remove the restriction on the NumPy version
* Fix according to reviewer comments
1 parent: a96391b

7 files changed (+11, -13 lines)

lmdeploy/api.py

Lines changed: 3 additions & 8 deletions
@@ -68,16 +68,11 @@ def pipeline(model_path: str,
         if backend_config is not None else None
     model_path = get_model(model_path, download_dir, revision)

-    task, pipeline_class = get_task(model_path)
-    if task == 'vlm':
-        if backend_config and backend_config.enable_prefix_caching:
-            backend_config.enable_prefix_caching = False
-            logger.warning('VLM does not support prefix caching.')
-
-    if type(backend_config) is not PytorchEngineConfig:
+    _, pipeline_class = get_task(model_path)
+    if not isinstance(backend_config, PytorchEngineConfig):
         # set auto backend mode
         backend_config = autoget_backend_config(model_path, backend_config)
-    backend = 'pytorch' if type(backend_config) is PytorchEngineConfig else 'turbomind'
+    backend = 'pytorch' if isinstance(backend_config, PytorchEngineConfig) else 'turbomind'
     logger.info(f'Using {backend} engine')

     return pipeline_class(model_path,
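
The switch from an exact type() check to isinstance() matters when backend_config is an instance of a subclass of PytorchEngineConfig. Below is a minimal sketch, not part of the commit, using hypothetical stand-in classes to show the difference:

class PytorchEngineConfig:                              # stand-in for lmdeploy's config class
    pass

class NpuPytorchEngineConfig(PytorchEngineConfig):      # hypothetical subclass
    pass

cfg = NpuPytorchEngineConfig()
print(type(cfg) is PytorchEngineConfig)       # False: the old check would fall through to backend auto-detection
print(isinstance(cfg, PytorchEngineConfig))   # True:  the new check treats the subclass as a PyTorch config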

lmdeploy/serve/vl_async_engine.py

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@ def __init__(self,
                  **kwargs) -> None:
         if backend == 'pytorch':
             try_import_deeplink(backend_config.device_type)
+        if backend_config and backend_config.enable_prefix_caching:
+            backend_config.enable_prefix_caching = False
+            logger.warning('Prefix caching is disabled since LMDeploy hasn\'t supported it on VL models yet')
         self.vl_encoder = ImageEncoder(model_path, backend, vision_config, backend_config=backend_config)
         super().__init__(model_path, backend=backend, backend_config=backend_config, **kwargs)
         if self.model_name == 'base':
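
With the check moved into VLAsyncEngine, a caller no longer has to strip the flag before building a VLM pipeline; the engine resets it and logs a warning. A hedged usage sketch, assuming the public lmdeploy API and an example VLM model id:

from lmdeploy import pipeline, PytorchEngineConfig

# Example only: any VLM supported by LMDeploy would behave the same way.
config = PytorchEngineConfig(enable_prefix_caching=True)
pipe = pipeline('OpenGVLab/InternVL2-8B', backend_config=config)
# VLAsyncEngine emits the warning above and config.enable_prefix_caching is now False.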

requirements/runtime_ascend.txt

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ einops
 fastapi
 fire
 mmengine-lite
-numpy<2.0.0
+numpy
 openai
 outlines<0.1.0
 partial_json_parser

requirements/runtime_camb.txt

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ einops
 fastapi
 fire
 mmengine-lite
-numpy<2.0.0
+numpy
 openai
 outlines<0.1.0
 partial_json_parser

requirements/runtime_cuda.txt

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ einops
 fastapi
 fire
 mmengine-lite
-numpy<2.0.0
+numpy
 openai
 outlines
 partial_json_parser

requirements/runtime_maca.txt

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ einops
 fastapi
 fire
 mmengine-lite
-numpy<2.0.0
+numpy
 openai
 outlines<0.1.0
 partial_json_parser

requirements/runtime_rocm.txt

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ einops
 fastapi
 fire
 mmengine-lite
-numpy<2.0.0
+numpy
 openai
 outlines
 partial_json_parser
