From daa305adeab4a1b7b1332256257c036280bcef37 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 13 Oct 2025 17:39:17 +0800 Subject: [PATCH 01/37] FEAT: add engine ability display --- xinference/model/llm/vllm/core.py | 25 ++- xinference/model/utils.py | 255 ++++++++++++++++++++++++++++-- 2 files changed, 263 insertions(+), 17 deletions(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 4da42ed48b..58b0a523aa 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -851,7 +851,30 @@ def _sanitize_generate_config( @classmethod def check_lib(cls) -> bool: - return importlib.util.find_spec("vllm") is not None + if importlib.util.find_spec("vllm") is None: + return False + + try: + import vllm + + if not getattr(vllm, "__version__", None): + return False + + # Check version + from packaging import version + + if version.parse(vllm.__version__) < version.parse("0.3.0"): + return False + + # Check CUDA + import torch + + if not torch.cuda.is_available(): + return False + + return True + except Exception: + return False @classmethod def match_json( diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ea5dec74d5..0d8e471bb0 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -14,6 +14,7 @@ import asyncio import functools +import importlib.util import json import logging import os @@ -472,44 +473,266 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str -) -> Optional[Dict[str, List[dict]]]: +) -> Optional[Dict[str, Union[List[dict], str]]]: if model_type == "LLM": - from .llm.llm_family import LLM_ENGINES + from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES if model_name not in LLM_ENGINES: return None - # filter llm_class - engine_params = deepcopy(LLM_ENGINES[model_name]) - for engine, params in engine_params.items(): + # Get all supported engines, not just currently available ones + all_supported_engines = list(SUPPORTED_ENGINES.keys()) + engine_params = {} + + # First add currently available engine parameters + available_engines = deepcopy(LLM_ENGINES[model_name]) + for engine, params in available_engines.items(): for param in params: - del param["llm_class"] + # Remove previous available attribute as available engines don't need this flag + if "available" in param: + del param["available"] + engine_params[engine] = params + + # Check unavailable engines + for engine_name in all_supported_engines: + if engine_name not in engine_params: # Engine not in available list + try: + engine_classes = SUPPORTED_ENGINES[engine_name] + error_msg = None + + # Try to find specific error reasons + for engine_class in engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available = engine_class.check_lib() + if not lib_available: + error_msg = ( + f"Engine {engine_name} library is not available" + ) + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name == "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + + importlib.import_module(module_name) + break + except ImportError as e: + error_msg = f"Engine {engine_name} library is 
not installed: {str(e)}" + except Exception as e: + error_msg = ( + f"Engine {engine_name} is not available: {str(e)}" + ) + + if error_msg is None: + error_msg = f"Engine {engine_name} is not compatible with current model or environment" + + # For unavailable engines, directly return error message string + engine_params[engine_name] = error_msg + + except Exception as e: + # If exception occurs during checking, return error message string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) + + # Filter out llm_class field + for engine, params in engine_params.items(): + if isinstance( + params, list + ): # Only process parameter lists of available engines + for param in params: + if "llm_class" in param: + del param["llm_class"] return engine_params elif model_type == "embedding": - from .embedding.embed_family import EMBEDDING_ENGINES + from .embedding.embed_family import ( + EMBEDDING_ENGINES, + ) + from .embedding.embed_family import ( + SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES, + ) if model_name not in EMBEDDING_ENGINES: return None - # filter embedding_class - engine_params = deepcopy(EMBEDDING_ENGINES[model_name]) - for engine, params in engine_params.items(): + # Get all supported engines, not just currently available ones + all_supported_engines = list(EMBEDDING_SUPPORTED_ENGINES.keys()) + engine_params = {} + + # First add currently available engine parameters + available_engines = deepcopy(EMBEDDING_ENGINES[model_name]) + for engine, params in available_engines.items(): for param in params: - del param["embedding_class"] + # Remove previous available attribute as available engines don't need this flag + if "available" in param: + del param["available"] + engine_params[engine] = params + + # Check unavailable engines + for engine_name in all_supported_engines: + if engine_name not in engine_params: # Engine not in available list + try: + engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] + error_msg = None + + # Try to find specific error reasons + for engine_class in engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available = engine_class.check_lib() + if not lib_available: + error_msg = ( + f"Engine {engine_name} library is not available" + ) + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name == "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + elif engine_name == "SentenceTransformers": + module_name = "sentence_transformers" + + importlib.import_module(module_name) + break + except ImportError as e: + error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + error_msg = ( + f"Engine {engine_name} is not available: {str(e)}" + ) + + if error_msg is None: + error_msg = f"Engine {engine_name} is not compatible with current model or environment" + + # For unavailable engines, directly return error message string + engine_params[engine_name] = error_msg + + except Exception as e: + # If exception occurs during checking, return error message string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) + + # Filter out embedding_class field + for engine, params in 
engine_params.items(): + if isinstance( + params, list + ): # Only process parameter lists of available engines + for param in params: + if "embedding_class" in param: + del param["embedding_class"] return engine_params elif model_type == "rerank": - from .rerank.rerank_family import RERANK_ENGINES + from .rerank.rerank_family import ( + RERANK_ENGINES, + ) + from .rerank.rerank_family import SUPPORTED_ENGINES as RERANK_SUPPORTED_ENGINES if model_name not in RERANK_ENGINES: return None - # filter rerank_class - engine_params = deepcopy(RERANK_ENGINES[model_name]) - for engine, params in engine_params.items(): + # Get all supported engines, not just currently available ones + all_supported_engines = list(RERANK_SUPPORTED_ENGINES.keys()) + engine_params = {} + + # First add currently available engine parameters + available_engines = deepcopy(RERANK_ENGINES[model_name]) + for engine, params in available_engines.items(): for param in params: - del param["rerank_class"] + # Remove previous available attribute as available engines don't need this flag + if "available" in param: + del param["available"] + engine_params[engine] = params + + # Check unavailable engines + for engine_name in all_supported_engines: + if engine_name not in engine_params: # Engine not in available list + try: + engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] + error_msg = None + + # Try to find specific error reasons + for engine_class in engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available = engine_class.check_lib() + if not lib_available: + error_msg = ( + f"Engine {engine_name} library is not available" + ) + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name == "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + elif engine_name == "SentenceTransformers": + module_name = "sentence_transformers" + + importlib.import_module(module_name) + break + except ImportError as e: + error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + error_msg = ( + f"Engine {engine_name} is not available: {str(e)}" + ) + + if error_msg is None: + error_msg = f"Engine {engine_name} is not compatible with current model or environment" + + # For unavailable engines, directly return error message string + engine_params[engine_name] = error_msg + + except Exception as e: + # If exception occurs during checking, return error message string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) + + # Filter out rerank_class field + for engine, params in engine_params.items(): + if isinstance( + params, list + ): # Only process parameter lists of available engines + for param in params: + if "rerank_class" in param: + del param["rerank_class"] return engine_params else: From 5347c4be930b4125382555c1328b78b4fd8a1fce Mon Sep 17 00:00:00 2001 From: yiboyasss <3359595624@qq.com> Date: Mon, 13 Oct 2025 18:16:12 +0800 Subject: [PATCH 02/37] feat: frontend supports engine ability display --- .../components/launchModelDrawer.js | 69 ++++++------------- .../launch_model/components/selectField.js | 42 +++++++++++ 2 files changed, 64 insertions(+), 47 deletions(-) create mode 100644 
xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js diff --git a/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js b/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js index 1169f06269..ccff202111 100644 --- a/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js +++ b/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js @@ -13,15 +13,11 @@ import { CircularProgress, Collapse, Drawer, - FormControl, FormControlLabel, - InputLabel, ListItemButton, ListItemText, - MenuItem, Radio, RadioGroup, - Select, Switch, TextField, Tooltip, @@ -39,45 +35,11 @@ import DynamicFieldList from './dynamicFieldList' import getModelFormConfig from './modelFormConfig' import PasteDialog from './pasteDialog' import Progress from './progress' +import SelectField from './selectField' const enginesWithNWorker = ['SGLang', 'vLLM', 'MLX'] const modelEngineType = ['LLM', 'embedding', 'rerank'] -const SelectField = ({ - label, - labelId, - name, - value, - onChange, - options = [], - disabled = false, - required = false, -}) => ( - - {label} - - -) - const LaunchModelDrawer = ({ modelData, modelType, @@ -549,19 +511,32 @@ const LaunchModelDrawer = ({ const engineItems = useMemo(() => { return engineOptions.map((engine) => { - const modelFormats = Array.from( - new Set(enginesObj[engine]?.map((item) => item.model_format)) - ) + const engineData = enginesObj[engine] + let modelFormats = [] + let label = engine + let disabled = false + + if (Array.isArray(engineData)) { + modelFormats = Array.from( + new Set(engineData.map((item) => item.model_format)) + ) - const relevantSpecs = modelData.model_specs.filter((spec) => - modelFormats.includes(spec.model_format) - ) + const relevantSpecs = modelData.model_specs.filter((spec) => + modelFormats.includes(spec.model_format) + ) + + const cached = relevantSpecs.some((spec) => isCached(spec)) - const cached = relevantSpecs.some((spec) => isCached(spec)) + label = cached ? `${engine} ${t('launchModel.cached')}` : engine + } else if (typeof engineData === 'string') { + label = `${engine} (${engineData})` + disabled = true + } return { value: engine, - label: cached ? 
`${engine} ${t('launchModel.cached')}` : engine, + label, + disabled, } }) }, [engineOptions, enginesObj, modelData]) diff --git a/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js b/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js new file mode 100644 index 0000000000..7e9a4af8ce --- /dev/null +++ b/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js @@ -0,0 +1,42 @@ +import { FormControl, InputLabel, MenuItem, Select } from '@mui/material' + +const SelectField = ({ + label, + labelId, + name, + value, + onChange, + options = [], + disabled = false, + required = false, +}) => ( + + {label} + + +) + +export default SelectField From 2466777ddf2a3431f35b7770b9003a78242cdbe3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 09:52:09 +0800 Subject: [PATCH 03/37] FEAT: add engine ability display --- xinference/model/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 0d8e471bb0..ea1c18eec8 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,6 +474,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[dict], str]]]: + engine_params: Optional[Dict[str, Union[List[dict], str]]] = None + if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES From 8e1fa20df8db50443bd75271424a0f2fba834a41 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 10:01:29 +0800 Subject: [PATCH 04/37] FEAT: add engine ability display --- xinference/model/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ea1c18eec8..7763b6fba5 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -506,7 +506,7 @@ def get_engine_params_by_name( for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -587,14 +587,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] + engine_classes: Any = EMBEDDING_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -675,14 +675,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] + engine_classes: Any = RERANK_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" From da58bf468322393589b63b788e7c5b78c32a6568 Mon Sep 17 00:00:00 2001 From: OliverBryant 
<2713999266@qq.com> Date: Tue, 14 Oct 2025 10:48:18 +0800 Subject: [PATCH 05/37] FEAT: add engine ability display --- xinference/model/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 7763b6fba5..d1bd6f072f 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -473,8 +473,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str -) -> Optional[Dict[str, Union[List[dict], str]]]: - engine_params: Optional[Dict[str, Union[List[dict], str]]] = None +) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: + engine_params: Optional[Dict[str, Union[List[Dict[str, Any]], str]]] = None if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES @@ -506,7 +506,7 @@ def get_engine_params_by_name( for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() + lib_available = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -587,14 +587,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes: Any = EMBEDDING_SUPPORTED_ENGINES[engine_name] + engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() + lib_available = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -675,14 +675,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes: Any = RERANK_SUPPORTED_ENGINES[engine_name] + engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() + lib_available = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" From 38aad40977460da0b3548005d545b2eb03d50bf6 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 10:52:46 +0800 Subject: [PATCH 06/37] FEAT: add engine ability display --- xinference/model/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index d1bd6f072f..42f1e5913d 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -499,11 +499,11 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = SUPPORTED_ENGINES[engine_name] + llm_engine_classes = SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons - for engine_class in engine_classes: + for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available = engine_class.check_lib() @@ -587,11 +587,11 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = 
EMBEDDING_SUPPORTED_ENGINES[engine_name] + embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons - for engine_class in engine_classes: + for engine_class in embedding_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available = engine_class.check_lib() @@ -675,11 +675,11 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] + rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons - for engine_class in engine_classes: + for engine_class in rerank_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available = engine_class.check_lib() From a679c3b2be223097099b400f36775a4fd109ac68 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:02:01 +0800 Subject: [PATCH 07/37] FEAT: add engine ability display --- xinference/model/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 42f1e5913d..373a7d24d9 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: - engine_params: Optional[Dict[str, Union[List[Dict[str, Any]], str]]] = None + engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {} if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES @@ -484,7 +484,6 @@ def get_engine_params_by_name( # Get all supported engines, not just currently available ones all_supported_engines = list(SUPPORTED_ENGINES.keys()) - engine_params = {} # First add currently available engine parameters available_engines = deepcopy(LLM_ENGINES[model_name]) @@ -572,7 +571,6 @@ def get_engine_params_by_name( # Get all supported engines, not just currently available ones all_supported_engines = list(EMBEDDING_SUPPORTED_ENGINES.keys()) - engine_params = {} # First add currently available engine parameters available_engines = deepcopy(EMBEDDING_ENGINES[model_name]) @@ -660,7 +658,6 @@ def get_engine_params_by_name( # Get all supported engines, not just currently available ones all_supported_engines = list(RERANK_SUPPORTED_ENGINES.keys()) - engine_params = {} # First add currently available engine parameters available_engines = deepcopy(RERANK_ENGINES[model_name]) From 340ff708d41410062c0386e14ffeb505b2a6fbe9 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:11:30 +0800 Subject: [PATCH 08/37] FEAT: add engine ability display --- xinference/model/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 373a7d24d9..5f2d437219 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -589,10 +589,10 @@ def get_engine_params_by_name( error_msg = None # Try to find specific error reasons - for engine_class in embedding_engine_classes: + for embedding_engine_class in embedding_engine_classes: try: - if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + if hasattr(embedding_engine_class, "check_lib"): + lib_available = embedding_engine_class.check_lib() if not lib_available: error_msg = ( 
f"Engine {engine_name} library is not available" @@ -676,10 +676,10 @@ def get_engine_params_by_name( error_msg = None # Try to find specific error reasons - for engine_class in rerank_engine_classes: + for rerank_engine_class in rerank_engine_classes: try: - if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + if hasattr(rerank_engine_class, "check_lib"): + lib_available = rerank_engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" From 19e1e2a1fdea15472a18be13073784a11901c70e Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:26:09 +0800 Subject: [PATCH 09/37] FEAT: add engine ability display --- xinference/model/utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 5f2d437219..b073cc879b 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -505,7 +505,7 @@ def get_engine_params_by_name( for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() # type: ignore[assignment] if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -540,11 +540,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = error_msg # type: ignore[arg-type] except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( + engine_params[engine_name] = ( # type: ignore[arg-type] f"Error checking engine {engine_name}: {str(e)}" ) @@ -592,8 +592,8 @@ def get_engine_params_by_name( for embedding_engine_class in embedding_engine_classes: try: if hasattr(embedding_engine_class, "check_lib"): - lib_available = embedding_engine_class.check_lib() - if not lib_available: + embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment] + if not embedding_lib_available: error_msg = ( f"Engine {engine_name} library is not available" ) @@ -629,11 +629,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = error_msg # type: ignore[arg-type] except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( + engine_params[engine_name] = ( # type: ignore[arg-type] f"Error checking engine {engine_name}: {str(e)}" ) @@ -679,8 +679,8 @@ def get_engine_params_by_name( for rerank_engine_class in rerank_engine_classes: try: if hasattr(rerank_engine_class, "check_lib"): - lib_available = rerank_engine_class.check_lib() - if not lib_available: + rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment] + if not rerank_lib_available: error_msg = ( f"Engine {engine_name} library is not available" ) @@ -716,11 +716,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = 
error_msg # type: ignore[arg-type] except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( + engine_params[engine_name] = ( # type: ignore[arg-type] f"Error checking engine {engine_name}: {str(e)}" ) From cc84a84bc2817c28268f965d9d161def9a458f2c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:48:54 +0800 Subject: [PATCH 10/37] FEAT: add engine ability display --- xinference/model/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index b073cc879b..783ceba2e4 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -540,11 +540,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg # type: ignore[arg-type] + engine_params[engine_name] = error_msg except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( # type: ignore[arg-type] + engine_params[engine_name] = ( f"Error checking engine {engine_name}: {str(e)}" ) @@ -629,11 +629,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg # type: ignore[arg-type] + engine_params[engine_name] = error_msg except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( # type: ignore[arg-type] + engine_params[engine_name] = ( f"Error checking engine {engine_name}: {str(e)}" ) @@ -716,11 +716,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg # type: ignore[arg-type] + engine_params[engine_name] = error_msg except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( # type: ignore[arg-type] + engine_params[engine_name] = ( f"Error checking engine {engine_name}: {str(e)}" ) From d9b3a434c09a4f2b552aedec487258a6b432ca3c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:57:11 +0800 Subject: [PATCH 11/37] FEAT: add engine ability display --- xinference/model/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 783ceba2e4..18de3c26e4 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: - engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {} + engine_params: Dict[str, Any] = {} if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES From d9d313699613323e94b83b4ad0ff141986d2f209 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 11:22:30 +0800 Subject: [PATCH 12/37] modify accomplishment measure --- xinference/model/embedding/core.py | 40 ++ xinference/model/embedding/llama_cpp/core.py | 62 ++- .../embedding/sentence_transformers/core.py | 
77 ++- xinference/model/llm/core.py | 38 ++ xinference/model/llm/llama_cpp/core.py | 59 ++- xinference/model/llm/lmdeploy/core.py | 64 ++- xinference/model/llm/mlx/core.py | 158 +++++- xinference/model/llm/sglang/core.py | 229 +++++++-- xinference/model/llm/transformers/core.py | 70 ++- xinference/model/llm/vllm/core.py | 461 +++++++++++++++--- xinference/model/rerank/core.py | 40 ++ .../rerank/sentence_transformers/core.py | 75 ++- xinference/model/rerank/vllm/core.py | 73 ++- xinference/model/utils.py | 140 ++++-- 14 files changed, 1390 insertions(+), 196 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index fffbc7633c..299ec4c5d1 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -171,6 +171,46 @@ def match_json( ) -> bool: pass + @classmethod + def match_json_with_reason( + cls, + model_family: EmbeddingModelFamilyV2, + model_spec: EmbeddingSpecV1, + quantization: str, + ) -> "MatchResult": + """ + Check if the engine can handle the given embedding model with detailed error information. + + This method provides detailed failure reasons and suggestions when an engine + cannot handle a specific model configuration. The default implementation + falls back to the boolean match_json method for backward compatibility. + + Args: + model_family: The embedding model family information + model_spec: The model specification + quantization: The quantization method + + Returns: + MatchResult: Detailed match result with reasons and suggestions + """ + from .match_result import ErrorType, MatchResult + + # Default implementation for backward compatibility + if cls.match_json(model_family, model_spec, quantization): + return MatchResult.success() + else: + # Get basic reason based on common failure patterns + if not cls.check_lib(): + return MatchResult.failure( + reason=f"Required library for {cls.__name__} is not available", + error_type=ErrorType.DEPENDENCY_MISSING, + ) + else: + return MatchResult.failure( + reason=f"Embedding model configuration is not compatible with {cls.__name__}", + error_type=ErrorType.MODEL_COMPATIBILITY, + ) + @classmethod def match( cls, diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index fb8c4e45ca..6e2908ffdd 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -235,6 +235,64 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: EmbeddingModelFamilyV2, + model_spec: EmbeddingSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="llama.cpp library (xllamacpp) is not installed for embedding", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="xllamacpp package not found in Python environment", + ) + + # Check model format compatibility if model_spec.model_format not in ["ggufv2"]: - return False - return True + return MatchResult.failure( + reason=f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, required: ggufv2", + ) + + # Check 
embedding-specific requirements + if not hasattr(model_spec, "model_file_name_template"): + return MatchResult.failure( + reason="GGUF embedding model requires proper file configuration", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing model_file_name_template for GGUF embedding", + ) + + # Check model dimensions for llama.cpp compatibility + model_dimensions = model_family.dimensions + if model_dimensions > 4096: # llama.cpp may have limitations + return MatchResult.failure( + reason=f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large embedding dimensions: {model_dimensions}", + ) + + # Check platform-specific considerations + import platform + + current_platform = platform.system() + + # llama.cpp works across platforms but may have performance differences + if current_platform == "Windows": + return MatchResult.failure( + reason="llama.cpp embedding may have limited performance on Windows", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Windows platform: {current_platform}", + ) + + return MatchResult.success() diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 05f7753e8e..843d68ea37 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -434,5 +434,78 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: - # As default embedding engine, sentence-transformer support all models - return model_spec.model_format in ["pytorch"] + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: EmbeddingModelFamilyV2, + model_spec: EmbeddingSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="Sentence Transformers library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="sentence_transformers package not found in Python environment", + ) + + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return MatchResult.failure( + reason=f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", + ) + + # Check model dimensions compatibility + model_dimensions = model_family.dimensions + if model_dimensions > 1536: # Very large embedding models + return MatchResult.failure( + reason=f"Large embedding model detected ({model_dimensions} dimensions)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large embedding dimensions: {model_dimensions}", + ) + + # Check token limits + max_tokens = model_family.max_tokens + if max_tokens > 8192: # Very high token limits + return MatchResult.failure( + reason=f"High token limit model detected (max_tokens: {max_tokens})", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"High max_tokens: {max_tokens}", + ) + + # Check for special model requirements + model_name = model_family.model_name.lower() + + # Check Qwen2 GTE models + if "gte" in model_name and "qwen2" in model_name: + # 
These models have specific requirements + if not hasattr(cls, "_check_qwen_gte_requirements"): + return MatchResult.failure( + reason="Qwen2 GTE models require special handling", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details="Qwen2 GTE model special requirements", + ) + + # Check Qwen3 models + if "qwen3" in model_name: + # Qwen3 has flash attention requirements + try: + # This would be checked during actual loading + pass + except Exception: + return MatchResult.failure( + reason="Qwen3 embedding model may have compatibility issues", + error_type=ErrorType.VERSION_REQUIREMENT, + technical_details="Qwen3 model compatibility check", + ) + + return MatchResult.success() diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index 8abc8f04a6..ee446d024a 100644 --- a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: from .llm_family import LLMFamilyV2, LLMSpecV1 + from .match_result import ErrorType, MatchResult logger = logging.getLogger(__name__) @@ -159,6 +160,43 @@ def match_json( ) -> bool: raise NotImplementedError + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + """ + Check if the engine can handle the given model with detailed error information. + + This method provides detailed failure reasons and suggestions when an engine + cannot handle a specific model configuration. The default implementation + falls back to the boolean match_json method for backward compatibility. + + Args: + llm_family: The model family information + llm_spec: The model specification + quantization: The quantization method + + Returns: + MatchResult: Detailed match result with reasons and suggestions + """ + from .match_result import ErrorType, MatchResult + + # Default implementation for backward compatibility + if cls.match_json(llm_family, llm_spec, quantization): + return MatchResult.success() + else: + # Get basic reason based on common failure patterns + if not cls.check_lib(): + return MatchResult.failure( + reason=f"Required library for {cls.__name__} is not available", + error_type=ErrorType.DEPENDENCY_MISSING, + ) + else: + return MatchResult.failure( + reason=f"Model configuration is not compatible with {cls.__name__}", + error_type=ErrorType.MODEL_COMPATIBILITY, + ) + def prepare_parse_reasoning_content( self, reasoning_content: bool, enable_thinking: bool = True ): diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index d009378dbe..f35fae9f6e 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -86,14 +86,67 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="llama.cpp library (xllamacpp) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="xllamacpp package not found in Python environment", + ) + + # Check model format compatibility if llm_spec.model_format not in ["ggufv2"]: - return False + return 
MatchResult.failure( + reason=f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}, required: ggufv2", + ) + + # Check model abilities - llama.cpp supports both chat and generation if ( "chat" not in llm_family.model_ability and "generate" not in llm_family.model_ability ): - return False - return True + return MatchResult.failure( + reason=f"llama.cpp requires 'chat' or 'generate' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Check platform-specific issues + import platform + + current_platform = platform.system() + + # Check for ARM64 specific issues + if current_platform == "Darwin" and platform.machine() == "arm64": + # Apple Silicon specific checks could go here + pass + elif current_platform == "Windows": + # Windows specific checks could go here + pass + + # Check memory requirements (basic heuristic) + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Very large models + return MatchResult.failure( + reason=f"llama.cpp may struggle with very large models ({model_size}B parameters)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large model size: {model_size}B parameters", + ) + + return MatchResult.success() def load(self): try: diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 0144a6f734..cd0aa892cf 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -121,7 +121,22 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + return MatchResult.failure( + reason="LMDeploy base model does not support direct inference", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details="LMDeploy base model class is not intended for direct use", + ) def generate( self, @@ -174,13 +189,52 @@ def load(self): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first + if not LMDEPLOY_INSTALLED: + return MatchResult.failure( + reason="LMDeploy library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="lmdeploy package not found in Python environment", + ) + + # Check model format compatibility and quantization if llm_spec.model_format == "awq": - # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits. 
+ # LMDeploy has specific AWQ quantization requirements if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"AWQ + {quantization} not supported by LMDeploy", + ) + + # Check model compatibility if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS: - return False - return LMDEPLOY_INSTALLED + return MatchResult.failure( + reason=f"Chat model not supported by LMDeploy: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported chat model: {llm_family.model_name}", + ) + + # Check model abilities - LMDeploy primarily supports chat models + if "chat" not in llm_family.model_ability: + return MatchResult.failure( + reason=f"LMDeploy Chat requires 'chat' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + return MatchResult.success() async def async_chat( self, diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 80b9c4be2f..cf24d31fdf 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -411,17 +411,67 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["mlx"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX library (mlx_lm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_lm package not found in Python environment", + ) + + # Check platform compatibility - MLX only works on Apple Silicon if sys.platform != "darwin" or platform.processor() != "arm": - # only work for Mac M chips - return False + return MatchResult.failure( + reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", + ) + + # Check model format compatibility + if llm_spec.model_format not in ["mlx"]: + return MatchResult.failure( + reason=f"MLX engine only supports MLX format, got: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", + ) + + # Check model abilities - MLX supports generation but not chat/vision in this base class if "generate" not in llm_family.model_ability: - return False + return MatchResult.failure( + reason=f"MLX engine requires 'generate' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # MLX base model doesn't support chat or vision if "chat" in llm_family.model_ability or "vision" in llm_family.model_ability: - # do not process chat or vision - return False - return True + return MatchResult.failure( + reason="MLX base model does not support chat or vision abilities", + 
error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Unsupported abilities for base MLX: {[a for a in llm_family.model_ability if a in ['chat', 'vision']]}", + ) + + # Check memory constraints for Apple Silicon + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Large models may be problematic + return MatchResult.failure( + reason=f"MLX may have memory limitations with very large models ({model_size}B parameters)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large model size: {model_size}B on Apple Silicon", + ) + + return MatchResult.success() def _get_prompt_cache( self, prompt, lora_name: Optional[str] = None, model: Any = None @@ -722,17 +772,39 @@ def _sanitize_generate_config( def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["mlx"]: - return False - if sys.platform != "darwin" or platform.processor() != "arm": - # only work for Mac M chips - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Check chat ability if "chat" not in llm_family.model_ability: - return False + return MatchResult.failure( + reason=f"MLX Chat requires 'chat' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # MLX Chat doesn't support vision if "vision" in llm_family.model_ability: - # do not process vision - return False - return True + return MatchResult.failure( + reason="MLX Chat model does not support vision abilities", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Vision ability not supported in MLXChatModel", + ) + + return MatchResult.success() def chat( self, @@ -786,14 +858,54 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["mlx"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first - MLX Vision uses mlx_vlm + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX Vision library (mlx_vlm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_vlm package not found in Python environment", + ) + + # Check platform compatibility if sys.platform != "darwin" or platform.processor() != "arm": - # only work for Mac M chips - return False + return MatchResult.failure( + reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", + ) + + # Check model format compatibility + if 
llm_spec.model_format not in ["mlx"]: + return MatchResult.failure( + reason=f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", + ) + + # Check vision ability if "vision" not in llm_family.model_ability: - return False - return True + return MatchResult.failure( + reason=f"MLX Vision requires 'vision' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Check for distributed inference limitations + # MLX Vision models don't support distributed inference + # This could be checked here if needed + + return MatchResult.success() def _load_model(self, **kwargs): try: diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index d3bbfc1570..f3658b5ed7 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -15,6 +15,7 @@ import json import logging import multiprocessing +import platform import sys import threading import time @@ -341,24 +342,104 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first + if not SGLANG_INSTALLED: + return MatchResult.failure( + reason="SGLang library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="sglang package not found in Python environment", + ) + + # Check hardware requirements - SGLang requires CUDA if not cls._has_cuda_device(): - return False + return MatchResult.failure( + reason="SGLang requires CUDA GPU support", + error_type=ErrorType.HARDWARE_REQUIREMENT, + technical_details="No CUDA devices detected", + ) + + # Check OS requirements if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + return MatchResult.failure( + reason="SGLang only supports Linux operating system", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current OS: {platform.system()}, required: Linux", + ) + + # Check model format compatibility + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"SGLang does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}", + ) + + # Check quantization compatibility with format if llm_spec.model_format == "pytorch": - if quantization != "none" and not (quantization is None): - return False + if quantization != "none" and quantization is not None: + return MatchResult.failure( + reason=f"SGLang pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"pytorch + {quantization} combination not supported", + ) + + # Check model compatibility if isinstance(llm_family, CustomLLMFamilyV2): if llm_family.model_family not in SGLANG_SUPPORTED_MODELS: - return False + return MatchResult.failure( + 
reason=f"Custom model family not supported by SGLang: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom family: {llm_family.model_family}", + ) else: if llm_family.model_name not in SGLANG_SUPPORTED_MODELS: - return False - if "generate" not in llm_family.model_ability: - return False - return SGLANG_INSTALLED + return MatchResult.failure( + reason=f"Model not supported by SGLang: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported model: {llm_family.model_name}", + ) + + # Check model abilities with flexible logic + # SGLang can handle models with various text generation capabilities + has_text_capability = ( + "generate" in llm_family.model_ability + or "chat" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + or "tools" in llm_family.model_ability + ) + + if not has_text_capability: + return MatchResult.failure( + reason=f"SGLang requires text generation capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # SGLang is primarily designed for text models, not specialized models + specialized_abilities = ["embedding", "rerank", "audio", "vision"] + has_specialized = any( + ability in llm_family.model_ability for ability in specialized_abilities + ) + if has_specialized: + return MatchResult.failure( + reason=f"SGLang is designed for text models, this model has specialized abilities: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Specialized abilities: {[a for a in llm_family.model_ability if a in specialized_abilities]}", + ) + + return MatchResult.success() @staticmethod def _convert_state_to_completion_chunk( @@ -647,20 +728,65 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Check model format compatibility (same as base) + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"SGLang Chat does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Chat model unsupported format: {llm_spec.model_format}", + ) + + # Check quantization compatibility with format if llm_spec.model_format == "pytorch": - if quantization != "none" and not (quantization is None): - return False + if quantization != "none" and quantization is not None: + return MatchResult.failure( + reason=f"SGLang Chat pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Chat pytorch + {quantization} not supported", + ) + + # Check chat model compatibility if isinstance(llm_family, CustomLLMFamilyV2): if 
llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS: - return False + return MatchResult.failure( + reason=f"Custom chat model not supported by SGLang: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom chat family: {llm_family.model_family}", + ) else: if llm_family.model_name not in SGLANG_SUPPORTED_CHAT_MODELS: - return False + return MatchResult.failure( + reason=f"Chat model not supported by SGLang: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported chat model: {llm_family.model_name}", + ) + + # Check chat ability if "chat" not in llm_family.model_ability: - return False - return SGLANG_INSTALLED + return MatchResult.failure( + reason=f"SGLang Chat requires 'chat' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + return MatchResult.success() def _sanitize_chat_config( self, @@ -734,24 +860,65 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if not cls._has_cuda_device(): - return False - if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Vision models have the same format restrictions as base SGLANG + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"SGLang Vision does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Vision model unsupported format: {llm_spec.model_format}", + ) + + # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": - if quantization != "none" and not (quantization is None): - return False + if quantization != "none" and quantization is not None: + return MatchResult.failure( + reason=f"SGLang Vision pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Vision pytorch + {quantization} not supported", + ) + + # Check vision model compatibility if isinstance(llm_family, CustomLLMFamilyV2): if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return False + return MatchResult.failure( + reason=f"Custom vision model not supported by SGLang: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom vision family: {llm_family.model_family}", + ) else: if llm_family.model_name not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return False + return MatchResult.failure( + reason=f"Vision model not supported by SGLang: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported vision model: {llm_family.model_name}", + ) + + # Check vision ability if "vision" not in 
llm_family.model_ability: - return False - return SGLANG_INSTALLED + return MatchResult.failure( + reason=f"SGLang Vision requires 'vision' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + return MatchResult.success() def _sanitize_chat_config( self, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 6ad98c38e8..89a966136d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -500,14 +500,72 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="Transformers library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="transformers or torch package not found", + ) + + # Check model format compatibility + supported_formats = ["pytorch", "gptq", "awq", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"Transformers does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Transformers unsupported format: {llm_spec.model_format}", + ) + + # Check for models that shouldn't use Transformers by default model_family = llm_family.model_family or llm_family.model_name if model_family in NON_DEFAULT_MODEL_LIST: - return False - if "generate" not in llm_family.model_ability: - return False - return True + return MatchResult.failure( + reason=f"Model {model_family} is not recommended for Transformers engine", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Model in NON_DEFAULT_MODEL_LIST: {model_family}", + ) + + # Check model abilities with flexible logic + # Transformers can handle models with various text processing capabilities + has_text_capability = ( + "generate" in llm_family.model_ability + or "chat" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + or "tools" in llm_family.model_ability + ) + + if not has_text_capability: + return MatchResult.failure( + reason=f"Transformers engine requires text processing capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Check for highly specialized models that might not work well with generic Transformers engine + specialized_abilities = ["embedding", "rerank", "audio", "vision"] + has_specialized = any( + ability in llm_family.model_ability for ability in specialized_abilities + ) + if has_specialized and not has_text_capability: + return MatchResult.failure( + reason=f"Model requires specialized engine for its abilities: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Specialized abilities detected: {[a for a in llm_family.model_ability if a in specialized_abilities]}", + ) + + return MatchResult.success() def 
build_prefill_attention_mask( self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 58b0a523aa..9d76d5685e 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -19,6 +19,7 @@ import logging import multiprocessing import os +import platform import sys import threading import time @@ -880,35 +881,178 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first + if not VLLM_INSTALLED: + return MatchResult.failure( + reason="vLLM library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="vllm package not found in Python environment", + ) + + # Check hardware requirements if not cls._has_cuda_device() and not cls._has_mlu_device(): - return False + return MatchResult.failure( + reason="vLLM requires CUDA or MLU accelerator support", + error_type=ErrorType.HARDWARE_REQUIREMENT, + technical_details="No CUDA or MLU devices detected", + ) + + # Check OS requirements if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + return MatchResult.failure( + reason="vLLM only supports Linux operating system", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current OS: {platform.system()}, required: Linux", + ) + + # Check model format + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"vLLM does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}", + ) + + # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return False + return MatchResult.failure( + reason=f"vLLM pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"pytorch + {quantization} combination not supported", + ) + if llm_spec.model_format == "awq": - # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits. 
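+            # Substring check: any quantization label containing "4"
+            # (e.g. "Int4", "4-bit") is treated as 4-bit AWQ weights.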
if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"vLLM AWQ format requires 4-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"AWQ + {quantization} not supported, only 4-bit", + ) + if llm_spec.model_format == "gptq": if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): if not any(q in quantization for q in ("3", "4", "8")): - return False + return MatchResult.failure( + reason=f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"GPTQ + {quantization} not supported with vLLM >= 0.3.3", + ) else: if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"Older vLLM version only supports 4-bit GPTQ, got: {quantization}", + error_type=ErrorType.VERSION_REQUIREMENT, + technical_details=f"GPTQ + {quantization} requires vLLM >= 0.3.3", + ) + + # Check model compatibility with more flexible matching + def is_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants (e.g., qwen3 variants) + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common patterns + model_lower = model_name.lower() + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + ] + ): + # Check if there's a corresponding supported model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + ] + ): + return True + + return False + if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in VLLM_SUPPORTED_MODELS: - return False + if not is_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS + ): + return MatchResult.failure( + reason=f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom family: {llm_family.model_family}", + ) else: - if llm_family.model_name not in VLLM_SUPPORTED_MODELS: - return False - if "generate" not in llm_family.model_ability: - return False - return VLLM_INSTALLED + if not is_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in VLLM_SUPPORTED_MODELS], + ): + return MatchResult.failure( + reason=f"Model may not be supported by vLLM: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported model: {llm_family.model_name}", + ) + + # Check model abilities with flexible logic + # vLLM can handle models that have text generation capabilities + # Models with 'chat' ability usually also support 'generate' + has_text_capability = ( + "generate" in llm_family.model_ability + or "chat" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + or "tools" in llm_family.model_ability + ) + + if not has_text_capability: + return MatchResult.failure( + reason=f"vLLM requires text generation capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Additional check: ensure model doesn't 
have conflicting abilities + conflicting_abilities = ["embedding", "rerank"] + has_conflicting = any( + ability in llm_family.model_ability for ability in conflicting_abilities + ) + if has_conflicting: + return MatchResult.failure( + reason=f"Model has conflicting abilities for vLLM: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Conflicting abilities detected: {[a for a in llm_family.model_ability if a in conflicting_abilities]}", + ) + + # All checks passed + return MatchResult.success() @staticmethod def _convert_request_output_to_completion_chunk( @@ -1316,40 +1460,141 @@ class VLLMChatModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in [ - "pytorch", - "gptq", - "awq", - "fp8", - "bnb", - "ggufv2", - ]: - return False - if llm_spec.model_format == "pytorch": - if quantization != "none" and quantization is not None: - return False - if llm_spec.model_format == "awq": - if not any(q in quantization for q in ("4", "8")): - return False - if llm_spec.model_format == "gptq": - if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): - if not any(q in quantization for q in ("3", "4", "8")): - return False - else: - if "4" not in quantization: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Chat-specific format support (includes GGUFv2 for newer vLLM) + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb", "ggufv2"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"vLLM Chat does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Chat model unsupported format: {llm_spec.model_format}", + ) + + # GGUFv2 requires newer vLLM version if llm_spec.model_format == "ggufv2": if not (VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.2")): - return False + return MatchResult.failure( + reason="vLLM GGUF support requires version >= 0.8.2", + error_type=ErrorType.VERSION_REQUIREMENT, + technical_details=f"Current vLLM: {VLLM_VERSION}, required: >=0.8.2", + ) + + # AWQ chat models support more quantization levels + if llm_spec.model_format == "awq": + if not any(q in quantization for q in ("4", "8")): + return MatchResult.failure( + reason=f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Chat AWQ + {quantization} not supported", + ) + + # Check chat model compatibility with flexible matching + def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if chat model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common chat model patterns + model_lower = 
model_name.lower() + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + "glm", + "chatglm", + ] + ): + # Check if there's a corresponding supported chat model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + "glm", + "chatglm", + ] + ): + return True + + return False + if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS: - return False + if not is_chat_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS + ): + return MatchResult.failure( + reason=f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom chat family: {llm_family.model_family}", + ) else: - if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS: - return False - if "chat" not in llm_family.model_ability: - return False - return VLLM_INSTALLED + if not is_chat_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in VLLM_SUPPORTED_CHAT_MODELS], + ): + return MatchResult.failure( + reason=f"Chat model may not be supported by vLLM: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported chat model: {llm_family.model_name}", + ) + + # Check chat ability with flexible logic + # vLLM Chat should work with models that have conversation capabilities + has_chat_capability = ( + "chat" in llm_family.model_ability + or "generate" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + ) + + if not has_chat_capability: + return MatchResult.failure( + reason=f"vLLM Chat requires conversation capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Additional check: ensure model is not purely a tool model without conversation + if set(llm_family.model_ability) == {"tools"}: + return MatchResult.failure( + reason=f"Model only has 'tools' capability without conversation support: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Tool-only model detected", + ) + + return MatchResult.success() def _sanitize_chat_config( self, @@ -1494,38 +1739,110 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if not cls._has_cuda_device() and not cls._has_mlu_device(): - return False - if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Vision models have the same format restrictions as base VLLM + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + 
reason=f"vLLM Vision does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Vision model unsupported format: {llm_spec.model_format}", + ) + + # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return False + return MatchResult.failure( + reason=f"vLLM Vision pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Vision pytorch + {quantization} not supported", + ) + + # AWQ vision models support more quantization levels than base if llm_spec.model_format == "awq": if not any(q in quantization for q in ("4", "8")): - return False - if llm_spec.model_format == "gptq": - if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): - if not any(q in quantization for q in ("3", "4", "8")): - return False - else: - if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Vision AWQ + {quantization} not supported", + ) + + # Check vision model compatibility with flexible matching + def is_vision_model_supported( + model_name: str, supported_list: List[str] + ) -> bool: + """Check if vision model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common vision model patterns + model_lower = model_name.lower() + if any( + family in model_lower + for family in ["llama", "qwen", "internvl", "glm", "phi"] + ): + # Check if there's a corresponding supported vision model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in ["llama", "qwen", "internvl", "glm", "phi"] + ): + return True + + return False + if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in VLLM_SUPPORTED_MULTI_MODEL_LIST: - return False + if not is_vision_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_VISION_MODEL_LIST + ): + return MatchResult.failure( + reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom vision family: {llm_family.model_family}", + ) else: - if llm_family.model_name not in VLLM_SUPPORTED_MULTI_MODEL_LIST: - return False - if ( - "vision" not in llm_family.model_ability - and "audio" not in llm_family.model_ability - and "omni" not in llm_family.model_ability - ): - return False - return VLLM_INSTALLED + if not is_vision_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in VLLM_SUPPORTED_VISION_MODEL_LIST], + ): + return MatchResult.failure( + reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported vision model: {llm_family.model_name}", + ) + + # Check vision ability + if "vision" not in llm_family.model_ability: + return MatchResult.failure( + reason=f"vLLM Vision requires 'vision' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: 
{llm_family.model_ability}", + ) + + return MatchResult.success() def _sanitize_model_config( self, model_config: Optional[VLLMModelConfig] diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index ae27e7e85e..929522f23e 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -131,6 +131,46 @@ def match_json( ) -> bool: pass + @classmethod + def match_json_with_reason( + cls, + model_family: RerankModelFamilyV2, + model_spec: RerankSpecV1, + quantization: str, + ) -> "MatchResult": + """ + Check if the engine can handle the given rerank model with detailed error information. + + This method provides detailed failure reasons and suggestions when an engine + cannot handle a specific model configuration. The default implementation + falls back to the boolean match_json method for backward compatibility. + + Args: + model_family: The rerank model family information + model_spec: The model specification + quantization: The quantization method + + Returns: + MatchResult: Detailed match result with reasons and suggestions + """ + from .match_result import ErrorType, MatchResult + + # Default implementation for backward compatibility + if cls.match_json(model_family, model_spec, quantization): + return MatchResult.success() + else: + # Get basic reason based on common failure patterns + if not cls.check_lib(): + return MatchResult.failure( + reason=f"Required library for {cls.__name__} is not available", + error_type=ErrorType.DEPENDENCY_MISSING, + ) + else: + return MatchResult.failure( + reason=f"Rerank model configuration is not compatible with {cls.__name__}", + error_type=ErrorType.MODEL_COMPATIBILITY, + ) + @classmethod def match( cls, diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index fabbb6e593..ee81a9adac 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -191,7 +191,7 @@ def compute_logits(inputs, **kwargs): from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker else: raise RuntimeError( - f"Unsupported Rank model type: {self.model_family.type}" + f"Unsupported Rerank model type: {self.model_family.type}" ) except ImportError: error_message = "Failed to import module 'FlagEmbedding'" @@ -341,5 +341,74 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - # As default embedding engine, sentence-transformer support all models - return model_spec.model_format in ["pytorch"] + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: RerankModelFamilyV2, + model_spec: RerankSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="Sentence Transformers library is not installed for reranking", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="sentence_transformers package not found in Python environment", + ) + + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return MatchResult.failure( + reason=f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, 
required: pytorch", + ) + + # Check rerank-specific requirements + if not hasattr(model_family, "model_name"): + return MatchResult.failure( + reason="Rerank model family requires model name specification", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing model_name in rerank model family", + ) + + # Check model type compatibility + if model_family.type and model_family.type not in [ + "rerank", + "unknown", + "cross-encoder", + "normal", + "LLM-based", + "LLM-based layerwise", + ]: + return MatchResult.failure( + reason=f"Model type '{model_family.type}' may not be compatible with reranking engines", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Model type: {model_family.type}", + ) + + # Check max tokens limit for reranking performance + max_tokens = model_family.max_tokens + if max_tokens and max_tokens > 8192: # High token limits for reranking + return MatchResult.failure( + reason=f"High max_tokens limit for reranking model: {max_tokens}", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"High max_tokens for reranking: {max_tokens}", + ) + + # Check language compatibility + if not model_family.language or len(model_family.language) == 0: + return MatchResult.failure( + reason="Rerank model language information is missing", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing language information in rerank model", + ) + + return MatchResult.success() diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index eac173b40c..f9763b567a 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -149,8 +149,71 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - if model_spec.model_format in ["pytorch"]: - prefix = model_family.model_name.split("-", 1)[0] - if prefix in SUPPORTED_MODELS_PREFIXES: - return True - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: RerankModelFamilyV2, + model_spec: RerankSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="vLLM library is not installed for reranking", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="vllm package not found in Python environment", + ) + + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return MatchResult.failure( + reason=f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", + ) + + # Check model name prefix matching + if model_spec.model_format == "pytorch": + try: + prefix = model_family.model_name.split("-", 1)[0].lower() + # Support both prefix matching and special cases + if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]: + # Special handling for Qwen3 models + if "qwen3" not in model_family.model_name.lower(): + return MatchResult.failure( + reason=f"Model family prefix not supported by vLLM reranking: {prefix}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported prefix: {prefix}", + ) + except (IndexError, AttributeError): + return MatchResult.failure( + reason="Unable to parse 
model family name for vLLM compatibility check", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"Model name parsing failed: {model_family.model_name}", + ) + + # Check rerank-specific requirements + if not hasattr(model_family, "model_name"): + return MatchResult.failure( + reason="Rerank model family requires model name specification for vLLM", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing model_name in vLLM rerank model family", + ) + + # Check max tokens limit for vLLM reranking performance + max_tokens = model_family.max_tokens + if max_tokens and max_tokens > 4096: # vLLM has stricter limits + return MatchResult.failure( + reason=f"High max_tokens limit for vLLM reranking model: {max_tokens}", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"High max_tokens for vLLM reranking: {max_tokens}", + ) + + return MatchResult.success() diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 18de3c26e4..ad0dabbf35 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -494,59 +494,111 @@ def get_engine_params_by_name( del param["available"] engine_params[engine] = params - # Check unavailable engines + # Check unavailable engines with detailed error information for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: llm_engine_classes = SUPPORTED_ENGINES[engine_name] - error_msg = None - # Try to find specific error reasons - for engine_class in llm_engine_classes: + # Try to get detailed error information from engine's match_json_with_reason + detailed_error = None + + # We need a sample model to test against, use the first available spec + if model_name in LLM_ENGINES and LLM_ENGINES[model_name]: + # Try to get model family for testing try: - if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() # type: ignore[assignment] - if not lib_available: - error_msg = ( - f"Engine {engine_name} library is not available" - ) + from .llm.llm_family import match_llm + + llm_family = match_llm(model_name, None, None, None, None) + if llm_family and llm_family.model_specs: + llm_spec = llm_family.model_specs[0] + quantization = llm_spec.quantization or "none" + + # Test each engine class for detailed error info + for engine_class in llm_engine_classes: + try: + if hasattr( + engine_class, "match_json_with_reason" + ): + from .llm.match_result import MatchResult + + result = ( + engine_class.match_json_with_reason( + llm_family, llm_spec, quantization + ) + ) + if not result.is_match: + detailed_error = { + "error": result.reason, + "error_type": result.error_type, + "technical_details": result.technical_details, + } + break + except Exception: + # Fall back to next engine class + continue + except Exception: + # If we can't get model family, continue with basic checking + pass + + if detailed_error: + engine_params[engine_name] = detailed_error + else: + # Fallback to basic error checking for backward compatibility + error_msg = None + for engine_class in llm_engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available: bool = engine_class.check_lib() # type: ignore[assignment] + if not lib_available: + error_msg = { + "error": f"Engine {engine_name} library is not available", + "error_type": "dependency_missing", + } + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name 
== "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + + importlib.import_module(module_name) break - else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = "transformers" - - importlib.import_module(module_name) - break - except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)}" - except Exception as e: - error_msg = ( - f"Engine {engine_name} is not available: {str(e)}" - ) - - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment" - - # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + except ImportError as e: + error_msg = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + } + except Exception as e: + error_msg = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + } + + if error_msg is None: + error_msg = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + } + + engine_params[engine_name] = error_msg except Exception as e: - # If exception occurs during checking, return error message string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)}" - ) + # If exception occurs during checking, return structured error + engine_params[engine_name] = { + "error": f"Error checking engine {engine_name}: {str(e)}", + "error_type": "configuration_error", + } # Filter out llm_class field for engine, params in engine_params.items(): From 08450ac0c283f29a84ca46ac816dba7d05434eb6 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 11:31:00 +0800 Subject: [PATCH 13/37] modify accomplishment measure --- xinference/model/llm/mlx/core.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index cf24d31fdf..d2d4b25697 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -422,15 +422,7 @@ def match_json_with_reason( ) -> "MatchResult": from ..match_result import ErrorType, MatchResult - # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX library (mlx_lm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_lm package not found in Python environment", - ) - - # Check platform compatibility - MLX only works on Apple Silicon + # Check platform compatibility first - MLX only works on Apple Silicon if sys.platform != "darwin" or platform.processor() != "arm": return MatchResult.failure( reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)", @@ -438,6 +430,14 @@ def match_json_with_reason( technical_details=f"Current platform: {sys.platform}, processor: 
{platform.processor()}, required: darwin + arm", ) + # Check library availability (only if platform is compatible) + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX library (mlx_lm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_lm package not found in Python environment", + ) + # Check model format compatibility if llm_spec.model_format not in ["mlx"]: return MatchResult.failure( @@ -869,15 +869,7 @@ def match_json_with_reason( ) -> "MatchResult": from ..match_result import ErrorType, MatchResult - # Check library availability first - MLX Vision uses mlx_vlm - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX Vision library (mlx_vlm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_vlm package not found in Python environment", - ) - - # Check platform compatibility + # Check platform compatibility first - MLX only works on Apple Silicon if sys.platform != "darwin" or platform.processor() != "arm": return MatchResult.failure( reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)", @@ -885,6 +877,14 @@ def match_json_with_reason( technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", ) + # Check library availability (only if platform is compatible) - MLX Vision uses mlx_vlm + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX Vision library (mlx_vlm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_vlm package not found in Python environment", + ) + # Check model format compatibility if llm_spec.model_format not in ["mlx"]: return MatchResult.failure( From e793cd4d1ed470971b03dd93ba0a47705ace27af Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:23:00 +0800 Subject: [PATCH 14/37] modify accomplishment measure --- xinference/model/embedding/match_result.py | 76 +++++++++++++++++++++ xinference/model/llm/match_result.py | 76 +++++++++++++++++++++ xinference/model/rerank/match_result.py | 77 ++++++++++++++++++++++ 3 files changed, 229 insertions(+) create mode 100644 xinference/model/embedding/match_result.py create mode 100644 xinference/model/llm/match_result.py create mode 100644 xinference/model/rerank/match_result.py diff --git a/xinference/model/embedding/match_result.py b/xinference/model/embedding/match_result.py new file mode 100644 index 0000000000..47775f20f9 --- /dev/null +++ b/xinference/model/embedding/match_result.py @@ -0,0 +1,76 @@ +""" +Error handling result structures for embedding model engine matching. + +This module provides structured error handling for engine matching operations, +allowing engines to provide detailed failure reasons and suggestions. +""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class MatchResult: + """ + Result of engine matching operation with detailed error information. + + This class provides structured information about whether an engine can handle + a specific model configuration, and if not, why and what alternatives exist. 
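+
+    Illustrative usage (the reason shown is a hypothetical example)::
+
+        result = MatchResult.failure(
+            reason="sentence-transformers is not installed",
+            error_type=ErrorType.DEPENDENCY_MISSING,
+        )
+        result.is_match           # False
+        result.to_error_string()  # "sentence-transformers is not installed"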
+ """ + + is_match: bool + reason: Optional[str] = None + error_type: Optional[str] = None + technical_details: Optional[str] = None + + @classmethod + def success(cls) -> "MatchResult": + """Create a successful match result.""" + return cls(is_match=True) + + @classmethod + def failure( + cls, + reason: str, + error_type: Optional[str] = None, + technical_details: Optional[str] = None, + ) -> "MatchResult": + """Create a failed match result with optional details.""" + return cls( + is_match=False, + reason=reason, + error_type=error_type, + technical_details=technical_details, + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for API responses.""" + result = {"is_match": self.is_match} + if not self.is_match: + if self.reason: + result["reason"] = self.reason + if self.error_type: + result["error_type"] = self.error_type + if self.technical_details: + result["technical_details"] = self.technical_details + return result + + def to_error_string(self) -> str: + """Convert to error string for backward compatibility.""" + if self.is_match: + return "Available" + error_msg = self.reason or "Unknown error" + return error_msg + + +# Error type constants for better categorization +class ErrorType: + HARDWARE_REQUIREMENT = "hardware_requirement" + OS_REQUIREMENT = "os_requirement" + MODEL_FORMAT = "model_format" + DEPENDENCY_MISSING = "dependency_missing" + MODEL_COMPATIBILITY = "model_compatibility" + DIMENSION_MISMATCH = "dimension_mismatch" + VERSION_REQUIREMENT = "version_requirement" + CONFIGURATION_ERROR = "configuration_error" + ENGINE_UNAVAILABLE = "engine_unavailable" diff --git a/xinference/model/llm/match_result.py b/xinference/model/llm/match_result.py new file mode 100644 index 0000000000..eeff2461f2 --- /dev/null +++ b/xinference/model/llm/match_result.py @@ -0,0 +1,76 @@ +""" +Error handling result structures for engine matching. + +This module provides structured error handling for engine matching operations, +allowing engines to provide detailed failure reasons and suggestions. +""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class MatchResult: + """ + Result of engine matching operation with detailed error information. + + This class provides structured information about whether an engine can handle + a specific model configuration, and if not, why and what alternatives exist. 
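+
+    Illustrative usage (values are hypothetical)::
+
+        ok = MatchResult.success()
+        ok.to_dict()                 # {"is_match": True}
+
+        bad = MatchResult.failure(
+            reason="vLLM requires CUDA or MLU accelerator support",
+            error_type=ErrorType.HARDWARE_REQUIREMENT,
+            technical_details="No CUDA or MLU devices detected",
+        )
+        bad.to_dict()["error_type"]  # "hardware_requirement"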
+ """ + + is_match: bool + reason: Optional[str] = None + error_type: Optional[str] = None + technical_details: Optional[str] = None + + @classmethod + def success(cls) -> "MatchResult": + """Create a successful match result.""" + return cls(is_match=True) + + @classmethod + def failure( + cls, + reason: str, + error_type: Optional[str] = None, + technical_details: Optional[str] = None, + ) -> "MatchResult": + """Create a failed match result with optional details.""" + return cls( + is_match=False, + reason=reason, + error_type=error_type, + technical_details=technical_details, + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for API responses.""" + result = {"is_match": self.is_match} + if not self.is_match: + if self.reason: + result["reason"] = self.reason + if self.error_type: + result["error_type"] = self.error_type + if self.technical_details: + result["technical_details"] = self.technical_details + return result + + def to_error_string(self) -> str: + """Convert to error string for backward compatibility.""" + if self.is_match: + return "Available" + error_msg = self.reason or "Unknown error" + return error_msg + + +# Error type constants for better categorization +class ErrorType: + HARDWARE_REQUIREMENT = "hardware_requirement" + OS_REQUIREMENT = "os_requirement" + MODEL_FORMAT = "model_format" + QUANTIZATION = "quantization" + DEPENDENCY_MISSING = "dependency_missing" + MODEL_COMPATIBILITY = "model_compatibility" + ABILITY_MISMATCH = "ability_mismatch" + VERSION_REQUIREMENT = "version_requirement" + CONFIGURATION_ERROR = "configuration_error" diff --git a/xinference/model/rerank/match_result.py b/xinference/model/rerank/match_result.py new file mode 100644 index 0000000000..125e791afd --- /dev/null +++ b/xinference/model/rerank/match_result.py @@ -0,0 +1,77 @@ +""" +Error handling result structures for rerank model engine matching. + +This module provides structured error handling for engine matching operations, +allowing engines to provide detailed failure reasons and suggestions. +""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class MatchResult: + """ + Result of engine matching operation with detailed error information. + + This class provides structured information about whether an engine can handle + a specific model configuration, and if not, why and what alternatives exist. 
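+
+    Illustrative usage (engine and format are hypothetical)::
+
+        result = MatchResult.failure(
+            reason="vLLM reranking only supports pytorch format, got: ggufv2",
+            error_type=ErrorType.MODEL_FORMAT,
+        )
+        if not result.is_match:
+            # e.g. surfaced to the caller as the reason this engine is unavailable
+            message = result.to_error_string()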
+ """ + + is_match: bool + reason: Optional[str] = None + error_type: Optional[str] = None + technical_details: Optional[str] = None + + @classmethod + def success(cls) -> "MatchResult": + """Create a successful match result.""" + return cls(is_match=True) + + @classmethod + def failure( + cls, + reason: str, + error_type: Optional[str] = None, + technical_details: Optional[str] = None, + ) -> "MatchResult": + """Create a failed match result with optional details.""" + return cls( + is_match=False, + reason=reason, + error_type=error_type, + technical_details=technical_details, + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for API responses.""" + result = {"is_match": self.is_match} + if not self.is_match: + if self.reason: + result["reason"] = self.reason + if self.error_type: + result["error_type"] = self.error_type + if self.technical_details: + result["technical_details"] = self.technical_details + return result + + def to_error_string(self) -> str: + """Convert to error string for backward compatibility.""" + if self.is_match: + return "Available" + error_msg = self.reason or "Unknown error" + return error_msg + + +# Error type constants for better categorization +class ErrorType: + HARDWARE_REQUIREMENT = "hardware_requirement" + OS_REQUIREMENT = "os_requirement" + MODEL_FORMAT = "model_format" + DEPENDENCY_MISSING = "dependency_missing" + MODEL_COMPATIBILITY = "model_compatibility" + DIMENSION_MISMATCH = "dimension_mismatch" + VERSION_REQUIREMENT = "version_requirement" + CONFIGURATION_ERROR = "configuration_error" + ENGINE_UNAVAILABLE = "engine_unavailable" + RERANK_SPECIFIC = "rerank_specific" From 27ea341e43e2c15e96276f1a770104f9bb346691 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:40:08 +0800 Subject: [PATCH 15/37] modify accomplishment measure --- xinference/model/embedding/core.py | 1 + xinference/model/embedding/llama_cpp/core.py | 2 +- .../model/embedding/sentence_transformers/core.py | 2 +- xinference/model/llm/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 2 +- xinference/model/llm/lmdeploy/core.py | 3 +-- xinference/model/llm/mlx/core.py | 5 +---- xinference/model/llm/sglang/core.py | 4 +--- xinference/model/llm/transformers/core.py | 2 +- xinference/model/llm/transformers/multimodal/core.py | 6 ------ xinference/model/llm/vllm/core.py | 10 +++------- xinference/model/rerank/core.py | 1 + xinference/model/rerank/sentence_transformers/core.py | 3 ++- xinference/model/rerank/vllm/core.py | 2 +- xinference/model/utils.py | 2 +- 15 files changed, 17 insertions(+), 30 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index 299ec4c5d1..c7f5ddb554 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -20,6 +20,7 @@ from collections import defaultdict from typing import Annotated, Dict, List, Literal, Optional, Union +from .match_result import MatchResult from ..._compat import ROOT_KEY, BaseModel, ErrorWrapper, Field, ValidationError from ...device_utils import empty_cache from ..core import VirtualEnvSettings diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index 6e2908ffdd..932df57f16 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -24,6 +24,7 @@ from packaging import version +from ..match_result import MatchResult from ....types import Embedding from ..core import EmbeddingModel, 
EmbeddingModelFamilyV2, EmbeddingSpecV1 @@ -235,7 +236,6 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 843d68ea37..6cb66f7ca2 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -19,6 +19,7 @@ import numpy as np import torch +from ..match_result import MatchResult from ....types import Embedding, EmbeddingData, EmbeddingUsage from ...utils import is_flash_attn_available from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 @@ -434,7 +435,6 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index ee446d024a..2626060579 100644 --- a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from .llm_family import LLMFamilyV2, LLMSpecV1 - from .match_result import ErrorType, MatchResult + from .match_result import MatchResult logger = logging.getLogger(__name__) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index f35fae9f6e..5790c3a3ca 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -21,6 +21,7 @@ from packaging import version +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk from ..core import LLM, chat_context_var @@ -86,7 +87,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index cd0aa892cf..134e668d7a 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -18,6 +18,7 @@ import torch +from ..match_result import MatchResult from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV2, LLMSpecV1 @@ -121,7 +122,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -189,7 +189,6 @@ def load(self): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index d2d4b25697..7f53112ab3 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -39,6 +39,7 @@ import xoscar as xo +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....fields 
import max_tokens_field from ....types import ( @@ -411,7 +412,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -772,7 +772,6 @@ def _sanitize_generate_config( def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -858,8 +857,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index f3658b5ed7..9365f2833b 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -24,6 +24,7 @@ from xoscar.utils import get_next_port +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -342,7 +343,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -728,7 +728,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -860,7 +859,6 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 89a966136d..bc828d65b3 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -20,6 +20,7 @@ import torch +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....device_utils import ( get_device_preferred_dtype, @@ -500,7 +501,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/transformers/multimodal/core.py b/xinference/model/llm/transformers/multimodal/core.py index ae67e102b5..4d6451f42e 100644 --- a/xinference/model/llm/transformers/multimodal/core.py +++ b/xinference/model/llm/transformers/multimodal/core.py @@ -39,21 +39,18 @@ def decide_device(self): """ Update self._device """ - pass @abstractmethod def load_processor(self): """ Load self._processor and self._tokenizer """ - pass @abstractmethod def load_multimodal_model(self): """ Load self._model """ - pass def load(self): self.decide_device() @@ -71,7 +68,6 @@ def build_inputs_from_messages( actual parameters needed for inference, 
e.g. input_ids, attention_masks, etc. """ - pass @abstractmethod def build_generate_kwargs( @@ -82,7 +78,6 @@ def build_generate_kwargs( Hyperparameters needed for generation, e.g. temperature, max_new_tokens, etc. """ - pass @abstractmethod def build_streaming_iter( @@ -95,7 +90,6 @@ def build_streaming_iter( The length of prompt token usually comes from the input_ids. In this interface you need to call the `build_inputs_from_messages` and `build_generate_kwargs`. """ - pass def get_stop_strs(self) -> List[str]: return [] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 9d76d5685e..7e9d6d3865 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -42,6 +42,7 @@ from packaging import version from typing_extensions import NotRequired +from ..match_result import MatchResult, ErrorType from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -881,7 +882,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -1460,7 +1460,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -1739,7 +1738,6 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -1748,7 +1746,6 @@ def match_json( def match_json_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult # Use base class validation first base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) @@ -1816,7 +1813,7 @@ def is_vision_model_supported( if isinstance(llm_family, CustomLLMFamilyV2): if not is_vision_model_supported( - llm_family.model_family.lower(), VLLM_SUPPORTED_VISION_MODEL_LIST + llm_family.model_family.lower() ): return MatchResult.failure( reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", @@ -1825,8 +1822,7 @@ def is_vision_model_supported( ) else: if not is_vision_model_supported( - llm_family.model_name.lower(), - [s.lower() for s in VLLM_SUPPORTED_VISION_MODEL_LIST], + llm_family.model_name.lower() ): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index 929522f23e..d3e3b5702c 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -17,6 +17,7 @@ from collections import defaultdict from typing import Dict, List, Literal, Optional +from .match_result import MatchResult from ..._compat import BaseModel from ...types import Rerank from ..core import VirtualEnvSettings diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index ee81a9adac..87efe31b5b 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ 
b/xinference/model/rerank/sentence_transformers/core.py @@ -22,6 +22,7 @@ import torch import torch.nn as nn +from ..match_result import MatchResult from ....device_utils import empty_cache from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import is_flash_attn_available @@ -341,7 +342,7 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult + pass result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index f9763b567a..114eef5907 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -2,6 +2,7 @@ import uuid from typing import List, Optional +from ..match_result import MatchResult from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 @@ -149,7 +150,6 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ad0dabbf35..383f188382 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -520,7 +520,7 @@ def get_engine_params_by_name( if hasattr( engine_class, "match_json_with_reason" ): - from .llm.match_result import MatchResult + pass result = ( engine_class.match_json_with_reason( From 114ec633ea524f493a3b509fab49b94d1ea444b3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:41:38 +0800 Subject: [PATCH 16/37] modify accomplishment measure --- xinference/model/embedding/core.py | 2 +- xinference/model/embedding/llama_cpp/core.py | 2 +- .../model/embedding/sentence_transformers/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 2 +- xinference/model/llm/lmdeploy/core.py | 2 +- xinference/model/llm/mlx/core.py | 2 +- xinference/model/llm/sglang/core.py | 2 +- xinference/model/llm/transformers/core.py | 2 +- xinference/model/llm/vllm/core.py | 10 +++------- xinference/model/rerank/core.py | 2 +- xinference/model/rerank/sentence_transformers/core.py | 2 +- xinference/model/rerank/vllm/core.py | 2 +- 12 files changed, 14 insertions(+), 18 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index c7f5ddb554..6f934b6e5f 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -20,12 +20,12 @@ from collections import defaultdict from typing import Annotated, Dict, List, Literal, Optional, Union -from .match_result import MatchResult from ..._compat import ROOT_KEY, BaseModel, ErrorWrapper, Field, ValidationError from ...device_utils import empty_cache from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin from .embed_family import match_embedding +from .match_result import MatchResult logger = logging.getLogger(__name__) diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index 932df57f16..4b3d6ed125 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -24,9 +24,9 @@ from packaging import version -from ..match_result import MatchResult from ....types import Embedding from ..core import EmbeddingModel, 
EmbeddingModelFamilyV2, EmbeddingSpecV1 +from ..match_result import MatchResult logger = logging.getLogger(__name__) diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 6cb66f7ca2..29bcb66a33 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -19,10 +19,10 @@ import numpy as np import torch -from ..match_result import MatchResult from ....types import Embedding, EmbeddingData, EmbeddingUsage from ...utils import is_flash_attn_available from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 +from ..match_result import MatchResult logger = logging.getLogger(__name__) SENTENCE_TRANSFORMER_MODEL_LIST: List[str] = [] diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 5790c3a3ca..386f8eb662 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -21,11 +21,11 @@ from packaging import version -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ChatModelMixin logger = logging.getLogger(__name__) diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 134e668d7a..f1c2605a24 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -18,10 +18,10 @@ import torch -from ..match_result import MatchResult from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk logger = logging.getLogger(__name__) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 7f53112ab3..943dddd7c4 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -39,7 +39,6 @@ import xoscar as xo -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....fields import max_tokens_field from ....types import ( @@ -52,6 +51,7 @@ ) from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 9365f2833b..7095289a5d 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -24,7 +24,6 @@ from xoscar.utils import get_next_port -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -38,6 +37,7 @@ from .. 
import LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2 +from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index bc828d65b3..8fae36576d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -20,7 +20,6 @@ import torch -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....device_utils import ( get_device_preferred_dtype, @@ -41,6 +40,7 @@ from ...utils import select_device from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, LLAMA3_TOOL_CALL_FAMILY, diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 7e9d6d3865..7bb0664354 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -42,7 +42,6 @@ from packaging import version from typing_extensions import NotRequired -from ..match_result import MatchResult, ErrorType from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -57,6 +56,7 @@ from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2, cache_model_tokenizer_and_config +from ..match_result import ErrorType, MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -1812,18 +1812,14 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_vision_model_supported( - llm_family.model_family.lower() - ): + if not is_vision_model_supported(llm_family.model_family.lower()): return MatchResult.failure( reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", error_type=ErrorType.MODEL_COMPATIBILITY, technical_details=f"Custom vision family: {llm_family.model_family}", ) else: - if not is_vision_model_supported( - llm_family.model_name.lower() - ): + if not is_vision_model_supported(llm_family.model_name.lower()): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", error_type=ErrorType.MODEL_COMPATIBILITY, diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index d3e3b5702c..c02b230abd 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -17,11 +17,11 @@ from collections import defaultdict from typing import Dict, List, Literal, Optional -from .match_result import MatchResult from ..._compat import BaseModel from ...types import Rerank from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin +from .match_result import MatchResult from .rerank_family import check_engine_by_model_name_and_engine, match_rerank logger = logging.getLogger(__name__) diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index 87efe31b5b..a21d4f106a 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -22,7 +22,6 @@ import torch import torch.nn as nn -from ..match_result import MatchResult from ....device_utils import empty_cache from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from 
...utils import is_flash_attn_available @@ -32,6 +31,7 @@ RerankModelFamilyV2, RerankSpecV1, ) +from ..match_result import MatchResult from ..utils import preprocess_sentence logger = logging.getLogger(__name__) diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index 114eef5907..339106f408 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -2,10 +2,10 @@ import uuid from typing import List, Optional -from ..match_result import MatchResult from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 +from ..match_result import MatchResult SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] From c17b78e521c4b686b74ace48c95a3e7025542a79 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:47:39 +0800 Subject: [PATCH 17/37] mypy test --- xinference/model/embedding/match_result.py | 2 +- xinference/model/llm/match_result.py | 2 +- xinference/model/llm/vllm/core.py | 8 ++++---- xinference/model/rerank/match_result.py | 2 +- xinference/model/utils.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xinference/model/embedding/match_result.py b/xinference/model/embedding/match_result.py index 47775f20f9..3e33c268d4 100644 --- a/xinference/model/embedding/match_result.py +++ b/xinference/model/embedding/match_result.py @@ -45,7 +45,7 @@ def failure( def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for API responses.""" - result = {"is_match": self.is_match} + result: Dict[str, Any] = {"is_match": self.is_match} if not self.is_match: if self.reason: result["reason"] = self.reason diff --git a/xinference/model/llm/match_result.py b/xinference/model/llm/match_result.py index eeff2461f2..3ab90d2c37 100644 --- a/xinference/model/llm/match_result.py +++ b/xinference/model/llm/match_result.py @@ -45,7 +45,7 @@ def failure( def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for API responses.""" - result = {"is_match": self.is_match} + result: Dict[str, Any] = {"is_match": self.is_match} if not self.is_match: if self.reason: result["reason"] = self.reason diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 7bb0664354..4aeccc0f21 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -1003,7 +1003,7 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool: return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_model_supported( + if not llm_family.model_family or not is_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS ): return MatchResult.failure( @@ -1551,7 +1551,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_chat_model_supported( + if not llm_family.model_family or not is_chat_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS ): return MatchResult.failure( @@ -1812,14 +1812,14 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_vision_model_supported(llm_family.model_family.lower()): + if not llm_family.model_family or not is_vision_model_supported(llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): return MatchResult.failure( reason=f"Custom vision model may not be fully 
supported by vLLM: {llm_family.model_family}", error_type=ErrorType.MODEL_COMPATIBILITY, technical_details=f"Custom vision family: {llm_family.model_family}", ) else: - if not is_vision_model_supported(llm_family.model_name.lower()): + if not llm_family.model_name or not is_vision_model_supported(llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", error_type=ErrorType.MODEL_COMPATIBILITY, diff --git a/xinference/model/rerank/match_result.py b/xinference/model/rerank/match_result.py index 125e791afd..1cd278aa5d 100644 --- a/xinference/model/rerank/match_result.py +++ b/xinference/model/rerank/match_result.py @@ -45,7 +45,7 @@ def failure( def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for API responses.""" - result = {"is_match": self.is_match} + result: Dict[str, Any] = {"is_match": self.is_match} if not self.is_match: if self.reason: result["reason"] = self.reason diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 383f188382..158fd316c7 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: - engine_params: Dict[str, Any] = {} + engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {} if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES From b19475109dc12aa0e5266a293db01591ffa69318 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:48:43 +0800 Subject: [PATCH 18/37] mypy test --- xinference/model/llm/vllm/core.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 4aeccc0f21..bf9f07b813 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -1812,14 +1812,18 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not llm_family.model_family or not is_vision_model_supported(llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): + if not llm_family.model_family or not is_vision_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST + ): return MatchResult.failure( reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", error_type=ErrorType.MODEL_COMPATIBILITY, technical_details=f"Custom vision family: {llm_family.model_family}", ) else: - if not llm_family.model_name or not is_vision_model_supported(llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): + if not llm_family.model_name or not is_vision_model_supported( + llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST + ): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", error_type=ErrorType.MODEL_COMPATIBILITY, From 2aa43d7439da5146906c40a767c3ba03a03f10cb Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:55:19 +0800 Subject: [PATCH 19/37] mypy test --- xinference/model/utils.py | 75 ++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 158fd316c7..f6db71ee8a 100644 --- a/xinference/model/utils.py +++ 
b/xinference/model/utils.py @@ -542,19 +542,24 @@ def get_engine_params_by_name( pass if detailed_error: - engine_params[engine_name] = detailed_error + # Convert error dict to string format for consistency + error_parts = [detailed_error.get("error", "Unknown error")] + if detailed_error.get("error_type"): + error_parts.append(f"Type: {detailed_error['error_type']}") + if detailed_error.get("technical_details"): + error_parts.append( + f"Details: {detailed_error['technical_details']}" + ) + engine_params[engine_name] = " | ".join(error_parts) else: # Fallback to basic error checking for backward compatibility - error_msg = None + error_msg: Optional[str] = None for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available: bool = engine_class.check_lib() # type: ignore[assignment] if not lib_available: - error_msg = { - "error": f"Engine {engine_name} library is not available", - "error_type": "dependency_missing", - } + error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)" break else: # If no check_lib method, try import check @@ -575,30 +580,20 @@ def get_engine_params_by_name( importlib.import_module(module_name) break except ImportError as e: - error_msg = { - "error": f"Engine {engine_name} library is not installed: {str(e)}", - "error_type": "dependency_missing", - } + error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" except Exception as e: - error_msg = { - "error": f"Engine {engine_name} is not available: {str(e)}", - "error_type": "configuration_error", - } + error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" if error_msg is None: - error_msg = { - "error": f"Engine {engine_name} is not compatible with current model or environment", - "error_type": "model_compatibility", - } + error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" engine_params[engine_name] = error_msg except Exception as e: - # If exception occurs during checking, return structured error - engine_params[engine_name] = { - "error": f"Error checking engine {engine_name}: {str(e)}", - "error_type": "configuration_error", - } + # If exception occurs during checking, return structured error as string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)} (Type: configuration_error)" + ) # Filter out llm_class field for engine, params in engine_params.items(): @@ -606,7 +601,7 @@ def get_engine_params_by_name( params, list ): # Only process parameter lists of available engines for param in params: - if "llm_class" in param: + if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] return engine_params @@ -638,7 +633,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] - error_msg = None + embedding_error_msg: Optional[str] = None # Try to find specific error reasons for embedding_engine_class in embedding_engine_classes: @@ -646,7 +641,7 @@ def get_engine_params_by_name( if hasattr(embedding_engine_class, "check_lib"): embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment] if not embedding_lib_available: - error_msg = ( + embedding_error_msg = ( f"Engine {engine_name} library is not available" ) break @@ -671,17 +666,17 @@ def get_engine_params_by_name( importlib.import_module(module_name) break 
except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" except Exception as e: - error_msg = ( + embedding_error_msg = ( f"Engine {engine_name} is not available: {str(e)}" ) - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment" + if embedding_error_msg is None: + embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = embedding_error_msg except Exception as e: # If exception occurs during checking, return error message string @@ -695,7 +690,7 @@ def get_engine_params_by_name( params, list ): # Only process parameter lists of available engines for param in params: - if "embedding_class" in param: + if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] return engine_params @@ -725,7 +720,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] - error_msg = None + rerank_error_msg: Optional[str] = None # Try to find specific error reasons for rerank_engine_class in rerank_engine_classes: @@ -733,7 +728,7 @@ def get_engine_params_by_name( if hasattr(rerank_engine_class, "check_lib"): rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment] if not rerank_lib_available: - error_msg = ( + rerank_error_msg = ( f"Engine {engine_name} library is not available" ) break @@ -758,17 +753,17 @@ def get_engine_params_by_name( importlib.import_module(module_name) break except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" except Exception as e: - error_msg = ( + rerank_error_msg = ( f"Engine {engine_name} is not available: {str(e)}" ) - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment" + if rerank_error_msg is None: + rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = rerank_error_msg except Exception as e: # If exception occurs during checking, return error message string @@ -782,7 +777,7 @@ def get_engine_params_by_name( params, list ): # Only process parameter lists of available engines for param in params: - if "rerank_class" in param: + if isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] return engine_params From 173e49410bdd6806a59ef6292e7d9d9b71b0f15d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 14:38:53 +0800 Subject: [PATCH 20/37] mypy test --- xinference/model/utils.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index f6db71ee8a..c34e03ef46 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -35,6 +35,7 @@ Tuple, Type, Union, + cast, ) import huggingface_hub @@ -543,14 +544,16 @@ def get_engine_params_by_name( if detailed_error: # Convert error dict to string format for consistency - error_parts = 
[detailed_error.get("error", "Unknown error")] - if detailed_error.get("error_type"): - error_parts.append(f"Type: {detailed_error['error_type']}") - if detailed_error.get("technical_details"): - error_parts.append( - f"Details: {detailed_error['technical_details']}" - ) - engine_params[engine_name] = " | ".join(error_parts) + error_parts = [detailed_error.get("error") or "Unknown error"] + error_type = detailed_error.get("error_type") + if error_type: + error_parts.append(f"Type: {error_type}") + technical_details = detailed_error.get("technical_details") + if technical_details: + error_parts.append(f"Details: {technical_details}") + # Filter out None values and join + error_parts_filtered = [part for part in error_parts if part is not None] + engine_params[engine_name] = " | ".join(error_parts_filtered) else: # Fallback to basic error checking for backward compatibility error_msg: Optional[str] = None @@ -600,7 +603,8 @@ def get_engine_params_by_name( if isinstance( params, list ): # Only process parameter lists of available engines - for param in params: + assert isinstance(params, list) + for param in params: # type: ignore if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] @@ -689,7 +693,8 @@ def get_engine_params_by_name( if isinstance( params, list ): # Only process parameter lists of available engines - for param in params: + assert isinstance(params, list) + for param in params: # type: ignore if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] @@ -776,7 +781,8 @@ def get_engine_params_by_name( if isinstance( params, list ): # Only process parameter lists of available engines - for param in params: + assert isinstance(params, list) + for param in params: # type: ignore if isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] From bc41700758bf5f10cbf7897a3d5c1c3ca7142dd9 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 14:40:43 +0800 Subject: [PATCH 21/37] mypy test --- xinference/model/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index c34e03ef46..3bd7cdb3c3 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -35,7 +35,6 @@ Tuple, Type, Union, - cast, ) import huggingface_hub From fc9b422eeaa3752c8bf07b0974558c2305986b80 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 14:41:54 +0800 Subject: [PATCH 22/37] mypy test --- xinference/model/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 3bd7cdb3c3..6e4a47dda0 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -551,7 +551,9 @@ def get_engine_params_by_name( if technical_details: error_parts.append(f"Details: {technical_details}") # Filter out None values and join - error_parts_filtered = [part for part in error_parts if part is not None] + error_parts_filtered = [ + part for part in error_parts if part is not None + ] engine_params[engine_name] = " | ".join(error_parts_filtered) else: # Fallback to basic error checking for backward compatibility From 5030b261cc9e57a4debd0ebb93339d7ec6421d29 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 16:44:04 +0800 Subject: [PATCH 23/37] mypy fix --- xinference/model/utils.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/xinference/model/utils.py 
b/xinference/model/utils.py index 6e4a47dda0..780602dec2 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -600,12 +600,9 @@ def get_engine_params_by_name( ) # Filter out llm_class field - for engine, params in engine_params.items(): - if isinstance( - params, list - ): # Only process parameter lists of available engines - assert isinstance(params, list) - for param in params: # type: ignore + for engine in engine_params.keys(): + if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] @@ -690,12 +687,9 @@ def get_engine_params_by_name( ) # Filter out embedding_class field - for engine, params in engine_params.items(): - if isinstance( - params, list - ): # Only process parameter lists of available engines - assert isinstance(params, list) - for param in params: # type: ignore + for engine in engine_params.keys(): + if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] @@ -778,12 +772,9 @@ def get_engine_params_by_name( ) # Filter out rerank_class field - for engine, params in engine_params.items(): - if isinstance( - params, list - ): # Only process parameter lists of available engines - assert isinstance(params, list) - for param in params: # type: ignore + for engine in engine_params.keys(): + if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] From cf517326630651f59e5873e1fa501a3a67dc2908 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 16:47:59 +0800 Subject: [PATCH 24/37] mypy fix --- xinference/model/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 780602dec2..c0c5233128 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -601,7 +601,9 @@ def get_engine_params_by_name( # Filter out llm_class field for engine in engine_params.keys(): - if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + if isinstance( + engine_params[engine], list + ): # Only process parameter lists of available engines for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] @@ -688,7 +690,9 @@ def get_engine_params_by_name( # Filter out embedding_class field for engine in engine_params.keys(): - if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + if isinstance( + engine_params[engine], list + ): # Only process parameter lists of available engines for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] @@ -773,7 +777,9 @@ def get_engine_params_by_name( # Filter out rerank_class field for engine in engine_params.keys(): - if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + if isinstance( + engine_params[engine], list + ): # Only process parameter lists of available engines for param in engine_params[engine]: # type: ignore if 
isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] From 0660aaba3e420a332b7f3934e3a70a321f3452c6 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 17:54:22 +0800 Subject: [PATCH 25/37] mypy fix --- xinference/model/utils.py | 190 +++++++++++++++++++++++++++----------- 1 file changed, 137 insertions(+), 53 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index c0c5233128..96beec9618 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -14,7 +14,6 @@ import asyncio import functools -import importlib.util import json import logging import os @@ -566,22 +565,65 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)" break else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = "transformers" - - importlib.import_module(module_name) + # If no check_lib method, try to use engine's match method for compatibility check + # This provides more detailed and accurate error information + try: + # Create a minimal test spec if we don't have real model specs + from .llm.llm_family import ( + LLMFamilyV2, + PytorchLLMSpecV2, + ) + + # Create a minimal test case + test_family = LLMFamilyV2( + model_name="test", + model_family="test", + model_specs=[ + PytorchLLMSpecV2( + model_format="pytorch", + quantization="none", + ) + ], + ) + test_spec = test_family.model_specs[0] + + # Use the engine's match method if available + if hasattr( + engine_class, "match_json_with_reason" + ): + result = ( + engine_class.match_json_with_reason( + test_family, test_spec, "none" + ) + ) + if result.is_match: + break # Engine is available + else: + error_msg = f"Engine {engine_name}: {result.reason}" + if result.error_type: + error_msg += ( + f" (Type: {result.error_type})" + ) + break + elif hasattr(engine_class, "match_json"): + # Fallback to simple match method - use test data + if engine_class.match_json( + test_family, test_spec, "none" + ): + break + else: + error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" + break + else: + # Final fallback: generic import check + raise ImportError( + "No compatibility check method available" + ) + + except ImportError as e: + error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" + except Exception as e: + error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" break except ImportError as e: error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" @@ -650,24 +692,45 @@ def get_engine_params_by_name( ) break else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = 
"transformers" - elif engine_name == "SentenceTransformers": - module_name = "sentence_transformers" - - importlib.import_module(module_name) + # If no check_lib method, try to use engine's match method for compatibility check + try: + from .embedding.core import ( + EmbeddingModelFamilyV2, + TransformersEmbeddingSpecV1, + ) + + # Use the engine's match method if available + if hasattr(embedding_engine_class, "match"): + # Create a minimal test case + test_family = EmbeddingModelFamilyV2( + model_name="test", + model_specs=[ + TransformersEmbeddingSpecV1( + model_format="pytorch", + quantization="none", + ) + ], + ) + test_spec = test_family.model_specs[0] + + # Use the engine's match method to check compatibility + if embedding_engine_class.match( + test_family, test_spec, "none" + ): + break # Engine is available + else: + embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" + break + else: + # Final fallback: generic import check + raise ImportError( + "No compatibility check method available" + ) + + except ImportError as e: + embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + embedding_error_msg = f"Engine {engine_name} is not available: {str(e)}" break except ImportError as e: embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" @@ -737,24 +800,45 @@ def get_engine_params_by_name( ) break else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = "transformers" - elif engine_name == "SentenceTransformers": - module_name = "sentence_transformers" - - importlib.import_module(module_name) + # If no check_lib method, try to use engine's match method for compatibility check + try: + from .rerank.core import ( + RerankModelFamilyV2, + RerankSpecV1, + ) + + # Use the engine's match method if available + if hasattr(rerank_engine_class, "match"): + # Create a minimal test case + test_family = RerankModelFamilyV2( + model_name="test", + model_specs=[ + RerankSpecV1( + model_format="pytorch", + quantization="none", + ) + ], + ) + test_spec = test_family.model_specs[0] + + # Use the engine's match method to check compatibility + if rerank_engine_class.match( + test_family, test_spec, "none" + ): + break # Engine is available + else: + rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment" + break + else: + # Final fallback: generic import check + raise ImportError( + "No compatibility check method available" + ) + + except ImportError as e: + rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + rerank_error_msg = f"Engine {engine_name} is not available: {str(e)}" break except ImportError as e: rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" From 996f3cdc0040312c6f8d6587dffeaa74c925f656 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 18:17:59 +0800 Subject: [PATCH 26/37] mypy fix --- xinference/model/utils.py | 225 +++++++++++++++++++++++++------------- 1 file changed, 150 insertions(+), 75 deletions(-) diff --git 
a/xinference/model/utils.py b/xinference/model/utils.py index 96beec9618..0ed516085d 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -541,28 +541,19 @@ def get_engine_params_by_name( pass if detailed_error: - # Convert error dict to string format for consistency - error_parts = [detailed_error.get("error") or "Unknown error"] - error_type = detailed_error.get("error_type") - if error_type: - error_parts.append(f"Type: {error_type}") - technical_details = detailed_error.get("technical_details") - if technical_details: - error_parts.append(f"Details: {technical_details}") - # Filter out None values and join - error_parts_filtered = [ - part for part in error_parts if part is not None + # Convert error dict to array format with error, type, details fields + engine_params[engine_name] = [ + f"error: {detailed_error.get('error') or 'Unknown error'}", + f"type: {detailed_error.get('error_type') or 'unknown'}", + f"details: {detailed_error.get('technical_details') or 'No additional details available'}", ] - engine_params[engine_name] = " | ".join(error_parts_filtered) else: # Fallback to basic error checking for backward compatibility - error_msg: Optional[str] = None for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available: bool = engine_class.check_lib() # type: ignore[assignment] if not lib_available: - error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)" break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -599,11 +590,12 @@ def get_engine_params_by_name( if result.is_match: break # Engine is available else: - error_msg = f"Engine {engine_name}: {result.reason}" - if result.error_type: - error_msg += ( - f" (Type: {result.error_type})" - ) + # Create array format for match method errors + engine_params[engine_name] = [ + f"error: Engine {engine_name}: {result.reason}", + f"type: {result.error_type or 'model_compatibility'}", + f"details: Engine {engine_name} compatibility check failed: {result.reason}", + ] break elif hasattr(engine_class, "match_json"): # Fallback to simple match method - use test data @@ -612,7 +604,6 @@ def get_engine_params_by_name( ): break else: - error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" break else: # Final fallback: generic import check @@ -621,25 +612,49 @@ def get_engine_params_by_name( ) except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" + engine_params[engine_name] = [ + f"error: Engine {engine_name} library is not installed: {str(e)}", + f"type: dependency_missing", + f"details: Missing required dependency for {engine_name} engine: {str(e)}", + ] + break except Exception as e: - error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" - break + engine_params[engine_name] = [ + f"error: Engine {engine_name} is not available: {str(e)}", + f"type: configuration_error", + f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + ] + break except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" + engine_params[engine_name] = [ + f"error: Engine {engine_name} library is not installed: {str(e)}", + f"type: dependency_missing", + f"details: Missing required dependency for {engine_name} engine: {str(e)}", + ] + break except Exception 
as e: - error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" - - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" + engine_params[engine_name] = [ + f"error: Engine {engine_name} is not available: {str(e)}", + f"type: configuration_error", + f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + ] + break - engine_params[engine_name] = error_msg + # Only set default error if not already set by one of the exception handlers + if engine_name not in engine_params: + engine_params[engine_name] = [ + f"error: Engine {engine_name} is not compatible with current model or environment", + f"type: model_compatibility", + f"details: The {engine_name} engine cannot handle the current model configuration", + ] except Exception as e: - # If exception occurs during checking, return structured error as string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)} (Type: configuration_error)" - ) + # If exception occurs during checking, return structured error as array + engine_params[engine_name] = [ + f"error: Error checking engine {engine_name}: {str(e)}", + f"type: configuration_error", + f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", + ] # Filter out llm_class field for engine in engine_params.keys(): @@ -679,7 +694,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] - embedding_error_msg: Optional[str] = None + embedding_error_details: Optional[Dict[str, str]] = None # Try to find specific error reasons for embedding_engine_class in embedding_engine_classes: @@ -687,9 +702,11 @@ def get_engine_params_by_name( if hasattr(embedding_engine_class, "check_lib"): embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment] if not embedding_lib_available: - embedding_error_msg = ( - f"Engine {engine_name} library is not available" - ) + embedding_error_details = { + "error": f"Engine {engine_name} library is not available", + "error_type": "dependency_missing", + "technical_details": f"The required library for {engine_name} engine is not installed or not accessible", + } break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -719,7 +736,11 @@ def get_engine_params_by_name( ): break # Engine is available else: - embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" + embedding_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", + } break else: # Final fallback: generic import check @@ -728,28 +749,52 @@ def get_engine_params_by_name( ) except ImportError as e: - embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + embedding_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - embedding_error_msg = f"Engine {engine_name} is not available: {str(e)}" + embedding_error_details = { + "error": f"Engine 
{engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } break except ImportError as e: - embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + embedding_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - embedding_error_msg = ( - f"Engine {engine_name} is not available: {str(e)}" - ) - - if embedding_error_msg is None: - embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" - - # For unavailable engines, directly return error message string - engine_params[engine_name] = embedding_error_msg + embedding_error_details = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } + + if embedding_error_details is None: + embedding_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", + } + + # For unavailable engines, format error message as array like LLM + engine_params[engine_name] = [ + f"error: {embedding_error_details.get('error') or 'Unknown error'}", + f"type: {embedding_error_details.get('error_type') or 'unknown'}", + f"details: {embedding_error_details.get('technical_details') or 'No additional details available'}", + ] except Exception as e: - # If exception occurs during checking, return error message string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)}" - ) + # If exception occurs during checking, return structured error as array like LLM + engine_params[engine_name] = [ + f"error: Error checking engine {engine_name}: {str(e)}", + f"type: configuration_error", + f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", + ] # Filter out embedding_class field for engine in engine_params.keys(): @@ -787,7 +832,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] - rerank_error_msg: Optional[str] = None + rerank_error_details: Optional[Dict[str, str]] = None # Try to find specific error reasons for rerank_engine_class in rerank_engine_classes: @@ -795,9 +840,11 @@ def get_engine_params_by_name( if hasattr(rerank_engine_class, "check_lib"): rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment] if not rerank_lib_available: - rerank_error_msg = ( - f"Engine {engine_name} library is not available" - ) + rerank_error_details = { + "error": f"Engine {engine_name} library is not available", + "error_type": "dependency_missing", + "technical_details": f"The required library for {engine_name} engine is not installed or not accessible", + } break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -827,7 +874,11 @@ def get_engine_params_by_name( ): break # Engine is available else: - rerank_error_msg = f"Engine {engine_name} is not compatible with current model 
or environment" + rerank_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", + } break else: # Final fallback: generic import check @@ -836,28 +887,52 @@ def get_engine_params_by_name( ) except ImportError as e: - rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + rerank_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - rerank_error_msg = f"Engine {engine_name} is not available: {str(e)}" + rerank_error_details = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } break except ImportError as e: - rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + rerank_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - rerank_error_msg = ( - f"Engine {engine_name} is not available: {str(e)}" - ) - - if rerank_error_msg is None: - rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment" - - # For unavailable engines, directly return error message string - engine_params[engine_name] = rerank_error_msg + rerank_error_details = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } + + if rerank_error_details is None: + rerank_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", + } + + # For unavailable engines, format error message as array like LLM + engine_params[engine_name] = [ + f"error: {rerank_error_details.get('error') or 'Unknown error'}", + f"type: {rerank_error_details.get('error_type') or 'unknown'}", + f"details: {rerank_error_details.get('technical_details') or 'No additional details available'}", + ] except Exception as e: - # If exception occurs during checking, return error message string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)}" - ) + # If exception occurs during checking, return structured error as array like LLM + engine_params[engine_name] = [ + f"error: Error checking engine {engine_name}: {str(e)}", + f"type: configuration_error", + f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", + ] # Filter out rerank_class field for engine in engine_params.keys(): From 41b0735eec9c293dabea4d8c6965c8c736b51e09 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 10:02:34 +0800 Subject: [PATCH 27/37] mypy fix --- xinference/model/utils.py | 111 +++++++++++++++----------------------- 1 file changed, 44 insertions(+), 67 deletions(-) diff --git a/xinference/model/utils.py 
b/xinference/model/utils.py index 0ed516085d..146f145513 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -541,12 +541,10 @@ def get_engine_params_by_name( pass if detailed_error: - # Convert error dict to array format with error, type, details fields - engine_params[engine_name] = [ - f"error: {detailed_error.get('error') or 'Unknown error'}", - f"type: {detailed_error.get('error_type') or 'unknown'}", - f"details: {detailed_error.get('technical_details') or 'No additional details available'}", - ] + # Return only the error message without engine_name prefix (key already contains engine name) + engine_params[engine_name] = ( + detailed_error.get("error") or "Unknown error" + ) else: # Fallback to basic error checking for backward compatibility for engine_class in llm_engine_classes: @@ -590,12 +588,11 @@ def get_engine_params_by_name( if result.is_match: break # Engine is available else: - # Create array format for match method errors - engine_params[engine_name] = [ - f"error: Engine {engine_name}: {result.reason}", - f"type: {result.error_type or 'model_compatibility'}", - f"details: Engine {engine_name} compatibility check failed: {result.reason}", - ] + # Return only the error message without engine_name prefix (key already contains engine name) + engine_params[engine_name] = ( + result.reason + or "Unknown compatibility error" + ) break elif hasattr(engine_class, "match_json"): # Fallback to simple match method - use test data @@ -612,49 +609,37 @@ def get_engine_params_by_name( ) except ImportError as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} library is not installed: {str(e)}", - f"type: dependency_missing", - f"details: Missing required dependency for {engine_name} engine: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} library is not installed: {str(e)}" + ) break except Exception as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} is not available: {str(e)}", - f"type: configuration_error", - f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} is not available: {str(e)}" + ) break except ImportError as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} library is not installed: {str(e)}", - f"type: dependency_missing", - f"details: Missing required dependency for {engine_name} engine: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} library is not installed: {str(e)}" + ) break except Exception as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} is not available: {str(e)}", - f"type: configuration_error", - f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} is not available: {str(e)}" + ) break # Only set default error if not already set by one of the exception handlers if engine_name not in engine_params: - engine_params[engine_name] = [ - f"error: Engine {engine_name} is not compatible with current model or environment", - f"type: model_compatibility", - f"details: The {engine_name} engine cannot handle the current model configuration", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} is not compatible with current model or environment" + ) except Exception as e: - # If exception occurs during checking, return structured error as array - engine_params[engine_name] = [ - f"error: Error 
checking engine {engine_name}: {str(e)}", - f"type: configuration_error", - f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", - ] + # If exception occurs during checking, return simple string format + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) # Filter out llm_class field for engine in engine_params.keys(): @@ -781,20 +766,16 @@ def get_engine_params_by_name( "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", } - # For unavailable engines, format error message as array like LLM - engine_params[engine_name] = [ - f"error: {embedding_error_details.get('error') or 'Unknown error'}", - f"type: {embedding_error_details.get('error_type') or 'unknown'}", - f"details: {embedding_error_details.get('technical_details') or 'No additional details available'}", - ] + # For unavailable engines, return simple string format + engine_params[engine_name] = ( + embedding_error_details.get("error") or "Unknown error" + ) except Exception as e: - # If exception occurs during checking, return structured error as array like LLM - engine_params[engine_name] = [ - f"error: Error checking engine {engine_name}: {str(e)}", - f"type: configuration_error", - f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", - ] + # If exception occurs during checking, return simple string format + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) # Filter out embedding_class field for engine in engine_params.keys(): @@ -919,20 +900,16 @@ def get_engine_params_by_name( "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", } - # For unavailable engines, format error message as array like LLM - engine_params[engine_name] = [ - f"error: {rerank_error_details.get('error') or 'Unknown error'}", - f"type: {rerank_error_details.get('error_type') or 'unknown'}", - f"details: {rerank_error_details.get('technical_details') or 'No additional details available'}", - ] + # For unavailable engines, return simple string format + engine_params[engine_name] = ( + rerank_error_details.get("error") or "Unknown error" + ) except Exception as e: - # If exception occurs during checking, return structured error as array like LLM - engine_params[engine_name] = [ - f"error: Error checking engine {engine_name}: {str(e)}", - f"type: configuration_error", - f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", - ] + # If exception occurs during checking, return simple string format + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) # Filter out rerank_class field for engine in engine_params.keys(): From c760a589e971d5db5dbd97582010a5736e633c55 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 16:09:06 +0800 Subject: [PATCH 28/37] Modify class name --- xinference/model/embedding/core.py | 2 +- xinference/model/embedding/llama_cpp/core.py | 4 ++-- .../embedding/sentence_transformers/core.py | 4 ++-- xinference/model/llm/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 4 ++-- xinference/model/llm/lmdeploy/core.py | 8 ++++---- xinference/model/llm/mlx/core.py | 14 +++++++------- xinference/model/llm/sglang/core.py | 16 ++++++++-------- xinference/model/llm/transformers/core.py | 4 ++-- xinference/model/llm/vllm/core.py | 16 ++++++++-------- 
xinference/model/rerank/core.py | 2 +- .../model/rerank/sentence_transformers/core.py | 4 ++-- xinference/model/rerank/vllm/core.py | 4 ++-- xinference/model/utils.py | 10 +++++----- 14 files changed, 47 insertions(+), 47 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index 6f934b6e5f..b68e5236ca 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -173,7 +173,7 @@ def match_json( pass @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index 4b3d6ed125..d84434384f 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -237,11 +237,11 @@ def match_json( quantization: str, ) -> bool: - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 29bcb66a33..c1789f9912 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -436,11 +436,11 @@ def match_json( quantization: str, ) -> bool: - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index 2626060579..3020483219 100644 --- a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -161,7 +161,7 @@ def match_json( raise NotImplementedError @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": """ diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 386f8eb662..e8ff96f83b 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -88,11 +88,11 @@ def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index f1c2605a24..90115dec06 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -123,11 +123,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def 
match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -190,11 +190,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 943dddd7c4..ff6b2e51ea 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -413,11 +413,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -773,17 +773,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result @@ -857,11 +857,11 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 7095289a5d..d22a157777 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -344,11 +344,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -729,17 +729,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, 
quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result @@ -860,17 +860,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 8fae36576d..5a4a9f557d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -502,11 +502,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index bf9f07b813..bc0eede4c0 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -883,11 +883,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -1461,17 +1461,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result @@ -1739,16 +1739,16 @@ def match_json( cls, llm_family: "LLMFamilyV2", 
llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index c02b230abd..2d3edde1c2 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -133,7 +133,7 @@ def match_json( pass @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index a21d4f106a..42332bc477 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -344,11 +344,11 @@ def match_json( ) -> bool: pass - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index 339106f408..c2ee75cfef 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -151,11 +151,11 @@ def match_json( quantization: str, ) -> bool: - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 146f145513..e27c93d851 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -499,7 +499,7 @@ def get_engine_params_by_name( try: llm_engine_classes = SUPPORTED_ENGINES[engine_name] - # Try to get detailed error information from engine's match_json_with_reason + # Try to get detailed error information from engine's match_with_reason detailed_error = None # We need a sample model to test against, use the first available spec @@ -517,12 +517,12 @@ def get_engine_params_by_name( for engine_class in llm_engine_classes: try: if hasattr( - engine_class, "match_json_with_reason" + engine_class, "match_with_reason" ): pass result = ( - engine_class.match_json_with_reason( + engine_class.match_with_reason( llm_family, llm_spec, quantization ) ) @@ -578,10 +578,10 @@ def get_engine_params_by_name( # Use the engine's match method if available if hasattr( - engine_class, "match_json_with_reason" + engine_class, "match_with_reason" ): result = ( - engine_class.match_json_with_reason( + engine_class.match_with_reason( test_family, test_spec, "none" ) ) From 6615014c8faae1821c90b0862339672ee215ca9a Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 16:12:15 +0800 Subject: [PATCH 29/37] Modify class name 
--- xinference/model/utils.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index e27c93d851..377259af77 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -516,15 +516,11 @@ def get_engine_params_by_name( # Test each engine class for detailed error info for engine_class in llm_engine_classes: try: - if hasattr( - engine_class, "match_with_reason" - ): + if hasattr(engine_class, "match_with_reason"): pass - result = ( - engine_class.match_with_reason( - llm_family, llm_spec, quantization - ) + result = engine_class.match_with_reason( + llm_family, llm_spec, quantization ) if not result.is_match: detailed_error = { @@ -577,13 +573,9 @@ def get_engine_params_by_name( test_spec = test_family.model_specs[0] # Use the engine's match method if available - if hasattr( - engine_class, "match_with_reason" - ): - result = ( - engine_class.match_with_reason( - test_family, test_spec, "none" - ) + if hasattr(engine_class, "match_with_reason"): + result = engine_class.match_with_reason( + test_family, test_spec, "none" ) if result.is_match: break # Engine is available From 2105c83392399ae5eb800bb6f00a19422e81d25d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 16:44:35 +0800 Subject: [PATCH 30/37] commit --- xinference/model/utils.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 377259af77..ea7adb309e 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -529,12 +529,21 @@ def get_engine_params_by_name( "technical_details": result.technical_details, } break - except Exception: - # Fall back to next engine class + except Exception as e: + # Fall back to next engine class with clear error logging + logger.warning( + f"Engine class {engine_class.__name__} match_with_reason failed: {e}" + ) + # Continue to try next engine class, but this is expected behavior for fallback continue - except Exception: - # If we can't get model family, continue with basic checking - pass + except Exception as e: + # If we can't get model family, fail with clear error + logger.error( + f"Failed to get model family for {model_name} (LLM): {e}" + ) + raise RuntimeError( + f"Unable to process LLM model {model_name}: {e}" + ) if detailed_error: # Return only the error message without engine_name prefix (key already contains engine name) From eb1bb43dc6358228ff7462f002ff1d62348eda56 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 14:21:15 +0800 Subject: [PATCH 31/37] new engine ability display --- xinference/model/embedding/core.py | 53 +-- xinference/model/embedding/flag/core.py | 17 +- xinference/model/embedding/llama_cpp/core.py | 59 +-- .../embedding/sentence_transformers/core.py | 70 +--- xinference/model/embedding/vllm/core.py | 17 +- xinference/model/llm/core.py | 48 +-- xinference/model/llm/llama_cpp/core.py | 68 +-- xinference/model/llm/lmdeploy/core.py | 70 +--- xinference/model/llm/mlx/core.py | 165 ++------ xinference/model/llm/sglang/core.py | 387 ++++++++++-------- xinference/model/llm/transformers/core.py | 74 +--- xinference/model/llm/vllm/core.py | 274 +++---------- xinference/model/rerank/core.py | 55 +-- .../rerank/sentence_transformers/core.py | 68 +-- xinference/model/rerank/vllm/core.py | 67 +-- xinference/model/utils.py | 165 ++++++-- 16 files changed, 591 insertions(+), 1066 deletions(-) diff 
--git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index b68e5236ca..42f39049f6 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -25,7 +25,6 @@ from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin from .embed_family import match_embedding -from .match_result import MatchResult logger = logging.getLogger(__name__) @@ -159,7 +158,7 @@ def __init__( @classmethod @abstractmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: pass @classmethod @@ -169,62 +168,24 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: pass - @classmethod - def match_with_reason( - cls, - model_family: EmbeddingModelFamilyV2, - model_spec: EmbeddingSpecV1, - quantization: str, - ) -> "MatchResult": - """ - Check if the engine can handle the given embedding model with detailed error information. - - This method provides detailed failure reasons and suggestions when an engine - cannot handle a specific model configuration. The default implementation - falls back to the boolean match_json method for backward compatibility. - - Args: - model_family: The embedding model family information - model_spec: The model specification - quantization: The quantization method - - Returns: - MatchResult: Detailed match result with reasons and suggestions - """ - from .match_result import ErrorType, MatchResult - - # Default implementation for backward compatibility - if cls.match_json(model_family, model_spec, quantization): - return MatchResult.success() - else: - # Get basic reason based on common failure patterns - if not cls.check_lib(): - return MatchResult.failure( - reason=f"Required library for {cls.__name__} is not available", - error_type=ErrorType.DEPENDENCY_MISSING, - ) - else: - return MatchResult.failure( - reason=f"Embedding model configuration is not compatible with {cls.__name__}", - error_type=ErrorType.MODEL_COMPATIBILITY, - ) - @classmethod def match( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ): + ) -> bool: """ Return if the model_spec can be matched. 
""" - if not cls.check_lib(): + lib_result = cls.check_lib() + if lib_result != True: return False - return cls.match_json(model_family, model_spec, quantization) + match_result = cls.match_json(model_family, model_spec, quantization) + return match_result == True @abstractmethod def load(self): diff --git a/xinference/model/embedding/flag/core.py b/xinference/model/embedding/flag/core.py index a53036449e..174a860d91 100644 --- a/xinference/model/embedding/flag/core.py +++ b/xinference/model/embedding/flag/core.py @@ -285,8 +285,12 @@ def encode( return result @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("FlagEmbedding") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("FlagEmbedding") is not None + else "FlagEmbedding library is not installed" + ) @classmethod def match_json( @@ -294,10 +298,15 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result + if ( model_spec.model_format in ["pytorch"] and model_family.model_name in FLAG_EMBEDDER_MODEL_LIST ): return True - return False + return f"FlagEmbedding engine only supports pytorch format and models in FLAG_EMBEDDER_MODEL_LIST, got format: {model_spec.model_format}, model: {model_family.model_name}" diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index d84434384f..a8e68f450b 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -26,7 +26,6 @@ from ....types import Embedding from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 -from ..match_result import MatchResult logger = logging.getLogger(__name__) @@ -226,8 +225,12 @@ def _handle_embedding(): return Embedding(**r) # type: ignore @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("xllamacpp") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("xllamacpp") is not None + else "xllamacpp library is not installed" + ) @classmethod def match_json( @@ -235,52 +238,24 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: EmbeddingModelFamilyV2, - model_spec: EmbeddingSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="llama.cpp library (xllamacpp) is not installed for embedding", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="xllamacpp package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["ggufv2"]: - return MatchResult.failure( - reason=f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {model_spec.model_format}, required: ggufv2", - ) + return f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}" # Check 
embedding-specific requirements if not hasattr(model_spec, "model_file_name_template"): - return MatchResult.failure( - reason="GGUF embedding model requires proper file configuration", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing model_file_name_template for GGUF embedding", - ) + return "GGUF embedding model requires proper file configuration (missing model_file_name_template)" # Check model dimensions for llama.cpp compatibility model_dimensions = model_family.dimensions if model_dimensions > 4096: # llama.cpp may have limitations - return MatchResult.failure( - reason=f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large embedding dimensions: {model_dimensions}", - ) + return f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)" # Check platform-specific considerations import platform @@ -289,10 +264,6 @@ def match_with_reason( # llama.cpp works across platforms but may have performance differences if current_platform == "Windows": - return MatchResult.failure( - reason="llama.cpp embedding may have limited performance on Windows", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Windows platform: {current_platform}", - ) + return "llama.cpp embedding may have limited performance on Windows" - return MatchResult.success() + return True diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index c1789f9912..4e1c7b8b73 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -22,7 +22,6 @@ from ....types import Embedding, EmbeddingData, EmbeddingUsage from ...utils import is_flash_attn_available from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 -from ..match_result import MatchResult logger = logging.getLogger(__name__) SENTENCE_TRANSFORMER_MODEL_LIST: List[str] = [] @@ -425,8 +424,12 @@ def base64_to_image(base64_str: str) -> Image.Image: return result @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("sentence_transformers") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("sentence_transformers") is not None + else "sentence_transformers library is not installed" + ) @classmethod def match_json( @@ -434,53 +437,25 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: EmbeddingModelFamilyV2, - model_spec: EmbeddingSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="Sentence Transformers library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="sentence_transformers package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["pytorch"]: - return MatchResult.failure( - reason=f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}", - 
error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", - ) + return f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}" # Check model dimensions compatibility model_dimensions = model_family.dimensions if model_dimensions > 1536: # Very large embedding models - return MatchResult.failure( - reason=f"Large embedding model detected ({model_dimensions} dimensions)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large embedding dimensions: {model_dimensions}", - ) + return f"Large embedding model detected ({model_dimensions} dimensions), may have performance issues" # Check token limits max_tokens = model_family.max_tokens if max_tokens > 8192: # Very high token limits - return MatchResult.failure( - reason=f"High token limit model detected (max_tokens: {max_tokens})", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"High max_tokens: {max_tokens}", - ) + return f"High token limit model detected (max_tokens: {max_tokens}), may cause memory issues" # Check for special model requirements model_name = model_family.model_name.lower() @@ -489,23 +464,16 @@ def match_with_reason( if "gte" in model_name and "qwen2" in model_name: # These models have specific requirements if not hasattr(cls, "_check_qwen_gte_requirements"): - return MatchResult.failure( - reason="Qwen2 GTE models require special handling", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details="Qwen2 GTE model special requirements", - ) + return "Qwen2 GTE models require special handling" # Check Qwen3 models if "qwen3" in model_name: - # Qwen3 has flash attention requirements + # Qwen3 has flash attention requirements - basic check try: - # This would be checked during actual loading pass + + # This would be checked during actual loading except Exception: - return MatchResult.failure( - reason="Qwen3 embedding model may have compatibility issues", - error_type=ErrorType.VERSION_REQUIREMENT, - technical_details="Qwen3 model compatibility check", - ) + return "Qwen3 embedding model may have compatibility issues" - return MatchResult.success() + return True diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py index 8905d36297..8fc32ebac8 100644 --- a/xinference/model/embedding/vllm/core.py +++ b/xinference/model/embedding/vllm/core.py @@ -149,8 +149,12 @@ def create_embedding( return result @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("vllm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("vllm") is not None + else "vllm library is not installed" + ) @classmethod def match_json( @@ -158,12 +162,17 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result + if model_spec.model_format in ["pytorch"]: prefix = model_family.model_name.split("-", 1)[0] if prefix in SUPPORTED_MODELS_PREFIXES: return True - return False + return f"VLLM Embedding engine only supports pytorch format models with supported prefixes, got format: {model_spec.model_format}, model: {model_family.model_name}" def wait_for_load(self): # set context length after engine inited diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index 3020483219..5942a42879 100644 --- 
a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -31,7 +31,6 @@ if TYPE_CHECKING: from .llm_family import LLMFamilyV2, LLMSpecV1 - from .match_result import MatchResult logger = logging.getLogger(__name__) @@ -71,7 +70,7 @@ def __init__( @classmethod @abstractmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: raise NotImplementedError @staticmethod @@ -149,54 +148,19 @@ def load(self): def match( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if not cls.check_lib(): + lib_result = cls.check_lib() + if lib_result != True: return False - return cls.match_json(llm_family, llm_spec, quantization) + match_result = cls.match_json(llm_family, llm_spec, quantization) + return match_result == True @classmethod @abstractmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: + ) -> Union[bool, str]: raise NotImplementedError - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - """ - Check if the engine can handle the given model with detailed error information. - - This method provides detailed failure reasons and suggestions when an engine - cannot handle a specific model configuration. The default implementation - falls back to the boolean match_json method for backward compatibility. - - Args: - llm_family: The model family information - llm_spec: The model specification - quantization: The quantization method - - Returns: - MatchResult: Detailed match result with reasons and suggestions - """ - from .match_result import ErrorType, MatchResult - - # Default implementation for backward compatibility - if cls.match_json(llm_family, llm_spec, quantization): - return MatchResult.success() - else: - # Get basic reason based on common failure patterns - if not cls.check_lib(): - return MatchResult.failure( - reason=f"Required library for {cls.__name__} is not available", - error_type=ErrorType.DEPENDENCY_MISSING, - ) - else: - return MatchResult.failure( - reason=f"Model configuration is not compatible with {cls.__name__}", - error_type=ErrorType.MODEL_COMPATIBILITY, - ) - def prepare_parse_reasoning_content( self, reasoning_content: bool, enable_thinking: bool = True ): diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index e8ff96f83b..5d379e642d 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -25,7 +25,6 @@ from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ChatModelMixin logger = logging.getLogger(__name__) @@ -80,73 +79,34 @@ def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict: return llamacpp_model_config @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("xllamacpp") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("xllamacpp") is not None + else "xllamacpp library is not installed" + ) @classmethod def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, 
quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="llama.cpp library (xllamacpp) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="xllamacpp package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if llm_spec.model_format not in ["ggufv2"]: - return MatchResult.failure( - reason=f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}, required: ggufv2", + return ( + f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}" ) - # Check model abilities - llama.cpp supports both chat and generation - if ( - "chat" not in llm_family.model_ability - and "generate" not in llm_family.model_ability - ): - return MatchResult.failure( - reason=f"llama.cpp requires 'chat' or 'generate' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Check platform-specific issues - import platform - - current_platform = platform.system() - - # Check for ARM64 specific issues - if current_platform == "Darwin" and platform.machine() == "arm64": - # Apple Silicon specific checks could go here - pass - elif current_platform == "Windows": - # Windows specific checks could go here - pass - # Check memory requirements (basic heuristic) model_size = float(str(llm_spec.model_size_in_billions)) if model_size > 70: # Very large models - return MatchResult.failure( - reason=f"llama.cpp may struggle with very large models ({model_size}B parameters)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large model size: {model_size}B parameters", - ) + return f"llama.cpp may struggle with very large models ({model_size}B parameters)" - return MatchResult.success() + return True def load(self): try: diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 90115dec06..9689c3ddce 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -21,7 +21,6 @@ from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk logger = logging.getLogger(__name__) @@ -115,28 +114,18 @@ def load(self): raise ValueError("LMDEPLOY engine has not supported generate yet.") @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("lmdeploy") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("lmdeploy") is not None + else "lmdeploy library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - return MatchResult.failure( - reason="LMDeploy base model does not support direct 
inference", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details="LMDeploy base model class is not intended for direct use", - ) + ) -> Union[bool, str]: + return "LMDeploy base model does not support direct inference, use specific LMDeploy model classes" def generate( self, @@ -188,52 +177,23 @@ def load(self): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability first - if not LMDEPLOY_INSTALLED: - return MatchResult.failure( - reason="LMDeploy library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="lmdeploy package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility and quantization if llm_spec.model_format == "awq": # LMDeploy has specific AWQ quantization requirements if "4" not in quantization: - return MatchResult.failure( - reason=f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"AWQ + {quantization} not supported by LMDeploy", - ) + return f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}" # Check model compatibility if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS: - return MatchResult.failure( - reason=f"Chat model not supported by LMDeploy: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported chat model: {llm_family.model_name}", - ) - - # Check model abilities - LMDeploy primarily supports chat models - if "chat" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"LMDeploy Chat requires 'chat' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) + return f"Chat model not supported by LMDeploy: {llm_family.model_name}" - return MatchResult.success() + return True async def async_chat( self, diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index ff6b2e51ea..ab8f1608db 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -18,7 +18,6 @@ import importlib.util import logging import pathlib -import platform import sys import threading import time @@ -51,7 +50,6 @@ ) from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -405,73 +403,32 @@ def wait_for_load(self): self._context_length = get_context_length(config) @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("mlx_lm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("mlx_lm") is not None + else "mlx_lm library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: 
"LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Check platform compatibility first - MLX only works on Apple Silicon - if sys.platform != "darwin" or platform.processor() != "arm": - return MatchResult.failure( - reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", - ) - - # Check library availability (only if platform is compatible) - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX library (mlx_lm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_lm package not found in Python environment", - ) + ) -> Union[bool, str]: + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if llm_spec.model_format not in ["mlx"]: - return MatchResult.failure( - reason=f"MLX engine only supports MLX format, got: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", - ) - - # Check model abilities - MLX supports generation but not chat/vision in this base class - if "generate" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"MLX engine requires 'generate' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # MLX base model doesn't support chat or vision - if "chat" in llm_family.model_ability or "vision" in llm_family.model_ability: - return MatchResult.failure( - reason="MLX base model does not support chat or vision abilities", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Unsupported abilities for base MLX: {[a for a in llm_family.model_ability if a in ['chat', 'vision']]}", - ) + return f"MLX engine only supports MLX format, got: {llm_spec.model_format}" # Check memory constraints for Apple Silicon model_size = float(str(llm_spec.model_size_in_billions)) if model_size > 70: # Large models may be problematic - return MatchResult.failure( - reason=f"MLX may have memory limitations with very large models ({model_size}B parameters)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large model size: {model_size}B on Apple Silicon", - ) + return f"MLX may have memory limitations with very large models ({model_size}B parameters)" - return MatchResult.success() + return True def _get_prompt_cache( self, prompt, lora_name: Optional[str] = None, model: Any = None @@ -771,39 +728,13 @@ def _sanitize_generate_config( @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result - # Check chat 
ability - if "chat" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"MLX Chat requires 'chat' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # MLX Chat doesn't support vision - if "vision" in llm_family.model_ability: - return MatchResult.failure( - reason="MLX Chat model does not support vision abilities", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Vision ability not supported in MLXChatModel", - ) - - return MatchResult.success() + return True def chat( self, @@ -850,59 +781,27 @@ def chat( class MLXVisionModel(MLXModel, ChatModelMixin): @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("mlx_vlm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("mlx_vlm") is not None + else "mlx_vlm library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Check platform compatibility first - MLX only works on Apple Silicon - if sys.platform != "darwin" or platform.processor() != "arm": - return MatchResult.failure( - reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", - ) - - # Check library availability (only if platform is compatible) - MLX Vision uses mlx_vlm - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX Vision library (mlx_vlm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_vlm package not found in Python environment", - ) + ) -> Union[bool, str]: + # Check library availability first - MLX Vision uses mlx_vlm + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if llm_spec.model_format not in ["mlx"]: - return MatchResult.failure( - reason=f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", - ) - - # Check vision ability - if "vision" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"MLX Vision requires 'vision' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Check for distributed inference limitations - # MLX Vision models don't support distributed inference - # This could be checked here if needed + return f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}" - return MatchResult.success() + return True def _load_model(self, **kwargs): try: diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index d22a157777..ccb44c00bd 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -15,7 +15,6 @@ import json import logging import multiprocessing -import platform import sys import threading import time @@ -37,7 
+36,6 @@ from .. import LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2 -from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -336,110 +334,130 @@ def _sanitize_generate_config( return generate_config @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("sglang") is not None + def check_lib(cls) -> Union[bool, str]: + # Check CUDA first - this is the most important requirement + try: + import torch - @classmethod - def match_json( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: + if not torch.cuda.is_available(): + return "SGLang requires CUDA support but no CUDA devices detected" + except ImportError: + return "SGLang requires PyTorch with CUDA support" + + if importlib.util.find_spec("sglang") is None: + return "sglang library is not installed" + + try: + if not getattr(sglang, "__version__", None): + return "SGLang version information is not available" + + # Check version - SGLang requires recent version + from packaging import version - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match + if version.parse(sglang.__version__) < version.parse("0.1.0"): + return f"SGLang version {sglang.__version__} is too old, minimum required is 0.1.0" + + return True + except Exception as e: + return f"Error checking SGLang library: {str(e)}" @classmethod - def match_with_reason( + def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability first - if not SGLANG_INSTALLED: - return MatchResult.failure( - reason="SGLang library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="sglang package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result - # Check hardware requirements - SGLang requires CUDA - if not cls._has_cuda_device(): - return MatchResult.failure( - reason="SGLang requires CUDA GPU support", - error_type=ErrorType.HARDWARE_REQUIREMENT, - technical_details="No CUDA devices detected", - ) + # Check GPU requirements + try: + import torch - # Check OS requirements - if not cls._is_linux(): - return MatchResult.failure( - reason="SGLang only supports Linux operating system", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current OS: {platform.system()}, required: Linux", - ) + if torch.cuda.device_count() == 0: + return "SGLang requires CUDA support but no CUDA devices detected" + except ImportError: + return "SGLang requires PyTorch with CUDA support" # Check model format compatibility supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"SGLang does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}", - ) + return f"SGLang does not support model format: {llm_spec.model_format}, supported formats: {', '.join(supported_formats)}" # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"SGLang pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, 
- technical_details=f"pytorch + {quantization} combination not supported", - ) + return f"SGLang pytorch format does not support quantization: {quantization}" + + # Check model compatibility with more flexible matching + def is_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants (e.g., qwen3 variants) + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common patterns + model_lower = model_name.lower() + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + ] + ): + # Check if there's a corresponding supported model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + ] + ): + return True + + return False - # Check model compatibility if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in SGLANG_SUPPORTED_MODELS: - return MatchResult.failure( - reason=f"Custom model family not supported by SGLang: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom family: {llm_family.model_family}", + if not llm_family.model_family or not is_model_supported( + llm_family.model_family.lower(), SGLANG_SUPPORTED_MODELS + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Custom model family may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if llm_family.model_name not in SGLANG_SUPPORTED_MODELS: - return MatchResult.failure( - reason=f"Model not supported by SGLang: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported model: {llm_family.model_name}", + if not is_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in SGLANG_SUPPORTED_MODELS], + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Model may not be fully supported by SGLang: {llm_family.model_name}" ) - # Check model abilities with flexible logic - # SGLang can handle models with various text generation capabilities - has_text_capability = ( - "generate" in llm_family.model_ability - or "chat" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - or "tools" in llm_family.model_ability - ) - - if not has_text_capability: - return MatchResult.failure( - reason=f"SGLang requires text generation capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # SGLang is primarily designed for text models, not specialized models - specialized_abilities = ["embedding", "rerank", "audio", "vision"] - has_specialized = any( - ability in llm_family.model_ability for ability in specialized_abilities - ) - if has_specialized: - return MatchResult.failure( - reason=f"SGLang is designed for text models, this model has specialized abilities: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Specialized abilities: {[a for a in llm_family.model_ability if a in 
specialized_abilities]}", - ) - - return MatchResult.success() + return True @staticmethod def _convert_state_to_completion_chunk( @@ -727,65 +745,76 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Check model format compatibility (same as base) supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"SGLang Chat does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Chat model unsupported format: {llm_spec.model_format}", - ) + return f"SGLang Chat does not support model format: {llm_spec.model_format}" # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"SGLang Chat pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Chat pytorch + {quantization} not supported", - ) + return f"SGLang Chat pytorch format does not support quantization: {quantization}" + + # Check chat model compatibility with more flexible matching + def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if chat model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common chat patterns + model_lower = model_name.lower() + if any(suffix in model_lower for suffix in ["chat", "instruct", "coder"]): + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + ] + ): + return True + + return False - # Check chat model compatibility if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS: - return MatchResult.failure( - reason=f"Custom chat model not supported by SGLang: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom chat family: {llm_family.model_family}", + if not is_chat_model_supported( + llm_family.model_family.lower(), SGLANG_SUPPORTED_CHAT_MODELS + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Custom chat model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if llm_family.model_name not in SGLANG_SUPPORTED_CHAT_MODELS: - return MatchResult.failure( - reason=f"Chat model not supported by SGLang: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - 
technical_details=f"Unsupported chat model: {llm_family.model_name}", + if not is_chat_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in SGLANG_SUPPORTED_CHAT_MODELS], + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Chat model may not be fully supported by SGLang: {llm_family.model_name}" ) - # Check chat ability - if "chat" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"SGLang Chat requires 'chat' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - return MatchResult.success() + return True def _sanitize_chat_config( self, @@ -858,65 +887,81 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Vision models have the same format restrictions as base SGLANG supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"SGLang Vision does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Vision model unsupported format: {llm_spec.model_format}", + return ( + f"SGLang Vision does not support model format: {llm_spec.model_format}" ) # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"SGLang Vision pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Vision pytorch + {quantization} not supported", - ) + return f"SGLang Vision pytorch format does not support quantization: {quantization}" + + # Check vision model compatibility with more flexible matching + def is_vision_model_supported( + model_name: str, supported_list: List[str] + ) -> bool: + """Check if vision model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common vision patterns + model_lower = model_name.lower() + if any(suffix in model_lower for suffix in ["vision", "vl", "multi", "mm"]): + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + "internvl", + ] + ): + return True + + return False - # Check vision model compatibility if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return 
MatchResult.failure( - reason=f"Custom vision model not supported by SGLang: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom vision family: {llm_family.model_family}", + if not is_vision_model_supported( + llm_family.model_family.lower(), SGLANG_SUPPORTED_VISION_MODEL_LIST + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Custom vision model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if llm_family.model_name not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return MatchResult.failure( - reason=f"Vision model not supported by SGLang: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported vision model: {llm_family.model_name}", + if not is_vision_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in SGLANG_SUPPORTED_VISION_MODEL_LIST], + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Vision model may not be fully supported by SGLang: {llm_family.model_name}" ) - # Check vision ability - if "vision" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"SGLang Vision requires 'vision' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - return MatchResult.success() + return True def _sanitize_chat_config( self, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 5a4a9f557d..39e963164b 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -40,7 +40,6 @@ from ...utils import select_device from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, LLAMA3_TOOL_CALL_FAMILY, @@ -494,78 +493,33 @@ def stop(self): del self._tokenizer @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("transformers") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("transformers") is not None + else "transformers library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="Transformers library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="transformers or torch package not found", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility supported_formats = ["pytorch", "gptq", "awq", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"Transformers does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Transformers unsupported format: {llm_spec.model_format}", - ) + return f"Transformers does not support model format: {llm_spec.model_format}, supported formats: {', 
'.join(supported_formats)}" # Check for models that shouldn't use Transformers by default model_family = llm_family.model_family or llm_family.model_name if model_family in NON_DEFAULT_MODEL_LIST: - return MatchResult.failure( - reason=f"Model {model_family} is not recommended for Transformers engine", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Model in NON_DEFAULT_MODEL_LIST: {model_family}", - ) + return f"Model {model_family} is not recommended for Transformers engine, has specialized engine preference" - # Check model abilities with flexible logic - # Transformers can handle models with various text processing capabilities - has_text_capability = ( - "generate" in llm_family.model_ability - or "chat" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - or "tools" in llm_family.model_ability - ) - - if not has_text_capability: - return MatchResult.failure( - reason=f"Transformers engine requires text processing capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Check for highly specialized models that might not work well with generic Transformers engine - specialized_abilities = ["embedding", "rerank", "audio", "vision"] - has_specialized = any( - ability in llm_family.model_ability for ability in specialized_abilities - ) - if has_specialized and not has_text_capability: - return MatchResult.failure( - reason=f"Model requires specialized engine for its abilities: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Specialized abilities detected: {[a for a in llm_family.model_ability if a in specialized_abilities]}", - ) - - return MatchResult.success() + return True def build_prefill_attention_mask( self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] @@ -1023,8 +977,6 @@ def match_json( model_family = llm_family.model_family or llm_family.model_name if model_family in NON_DEFAULT_MODEL_LIST: return False - if "chat" not in llm_family.model_ability: - return False return True async def chat( diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index bc0eede4c0..7262053a50 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -19,7 +19,6 @@ import logging import multiprocessing import os -import platform import sys import threading import time @@ -56,7 +55,6 @@ from .. 
import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2, cache_model_tokenizer_and_config -from ..match_result import ErrorType, MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -852,111 +850,77 @@ def _sanitize_generate_config( return sanitized @classmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: + # Check CUDA first - this is the most important requirement + try: + import torch + + if not torch.cuda.is_available(): + return "vLLM requires CUDA support but no CUDA devices detected" + except ImportError: + return "vLLM requires PyTorch with CUDA support" + if importlib.util.find_spec("vllm") is None: - return False + return "vLLM library is not installed" try: import vllm if not getattr(vllm, "__version__", None): - return False + return "vLLM version information is not available" # Check version from packaging import version if version.parse(vllm.__version__) < version.parse("0.3.0"): - return False - - # Check CUDA - import torch - - if not torch.cuda.is_available(): - return False + return f"vLLM version {vllm.__version__} is too old, minimum required is 0.3.0" return True - except Exception: - return False + except Exception as e: + return f"Error checking vLLM library: {str(e)}" @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability first if not VLLM_INSTALLED: - return MatchResult.failure( - reason="vLLM library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="vllm package not found in Python environment", - ) + return "vLLM library is not installed" - # Check hardware requirements - if not cls._has_cuda_device() and not cls._has_mlu_device(): - return MatchResult.failure( - reason="vLLM requires CUDA or MLU accelerator support", - error_type=ErrorType.HARDWARE_REQUIREMENT, - technical_details="No CUDA or MLU devices detected", - ) + # Check GPU device count + try: + import torch - # Check OS requirements - if not cls._is_linux(): - return MatchResult.failure( - reason="vLLM only supports Linux operating system", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current OS: {platform.system()}, required: Linux", - ) + if torch.cuda.device_count() == 0: + return "vLLM requires CUDA support but no CUDA devices detected" + except ImportError: + return "vLLM requires PyTorch with CUDA support" # Check model format supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"vLLM does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}", - ) + return f"vLLM does not support model format: {llm_spec.model_format}, supported formats: {', '.join(supported_formats)}" # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"vLLM pytorch format does not support quantization: {quantization}", - 
error_type=ErrorType.QUANTIZATION, - technical_details=f"pytorch + {quantization} combination not supported", + return ( + f"vLLM pytorch format does not support quantization: {quantization}" ) if llm_spec.model_format == "awq": if "4" not in quantization: - return MatchResult.failure( - reason=f"vLLM AWQ format requires 4-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"AWQ + {quantization} not supported, only 4-bit", + return ( + f"vLLM AWQ format requires 4-bit quantization, got: {quantization}" ) if llm_spec.model_format == "gptq": if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): if not any(q in quantization for q in ("3", "4", "8")): - return MatchResult.failure( - reason=f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"GPTQ + {quantization} not supported with vLLM >= 0.3.3", - ) + return f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}" else: if "4" not in quantization: - return MatchResult.failure( - reason=f"Older vLLM version only supports 4-bit GPTQ, got: {quantization}", - error_type=ErrorType.VERSION_REQUIREMENT, - technical_details=f"GPTQ + {quantization} requires vLLM >= 0.3.3", - ) + return f"Older vLLM version only supports 4-bit GPTQ, got: {quantization} (requires vLLM >= 0.3.3 for 3/8-bit)" # Check model compatibility with more flexible matching def is_model_supported(model_name: str, supported_list: List[str]) -> bool: @@ -1006,53 +970,19 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool: if not llm_family.model_family or not is_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS ): - return MatchResult.failure( - reason=f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom family: {llm_family.model_family}", - ) + return f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}" else: if not is_model_supported( llm_family.model_name.lower(), [s.lower() for s in VLLM_SUPPORTED_MODELS], ): - return MatchResult.failure( - reason=f"Model may not be supported by vLLM: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported model: {llm_family.model_name}", + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Model may not be fully supported by vLLM: {llm_family.model_name}" ) - # Check model abilities with flexible logic - # vLLM can handle models that have text generation capabilities - # Models with 'chat' ability usually also support 'generate' - has_text_capability = ( - "generate" in llm_family.model_ability - or "chat" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - or "tools" in llm_family.model_ability - ) - - if not has_text_capability: - return MatchResult.failure( - reason=f"vLLM requires text generation capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Additional check: ensure model doesn't have conflicting abilities - conflicting_abilities = ["embedding", "rerank"] - has_conflicting = any( - ability in llm_family.model_ability for ability in conflicting_abilities - ) - if has_conflicting: - return MatchResult.failure( - reason=f"Model has conflicting abilities for vLLM: 
{llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Conflicting abilities detected: {[a for a in llm_family.model_ability if a in conflicting_abilities]}", - ) - # All checks passed - return MatchResult.success() + return True @staticmethod def _convert_request_output_to_completion_chunk( @@ -1459,48 +1389,26 @@ class VLLMChatModel(VLLMModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Chat-specific format support (includes GGUFv2 for newer vLLM) supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb", "ggufv2"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"vLLM Chat does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Chat model unsupported format: {llm_spec.model_format}", - ) + return f"vLLM Chat does not support model format: {llm_spec.model_format}" # GGUFv2 requires newer vLLM version if llm_spec.model_format == "ggufv2": if not (VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.2")): - return MatchResult.failure( - reason="vLLM GGUF support requires version >= 0.8.2", - error_type=ErrorType.VERSION_REQUIREMENT, - technical_details=f"Current vLLM: {VLLM_VERSION}, required: >=0.8.2", - ) + return f"vLLM GGUF support requires version >= 0.8.2, current: {VLLM_VERSION}" # AWQ chat models support more quantization levels if llm_spec.model_format == "awq": if not any(q in quantization for q in ("4", "8")): - return MatchResult.failure( - reason=f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Chat AWQ + {quantization} not supported", - ) + return f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}" # Check chat model compatibility with flexible matching def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: @@ -1554,46 +1462,18 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: if not llm_family.model_family or not is_chat_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS ): - return MatchResult.failure( - reason=f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom chat family: {llm_family.model_family}", - ) + return f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}" else: if not is_chat_model_supported( llm_family.model_name.lower(), [s.lower() for s in VLLM_SUPPORTED_CHAT_MODELS], ): - return MatchResult.failure( - reason=f"Chat model may not be supported by vLLM: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported chat model: {llm_family.model_name}", + # Instead of hard rejection, 
give a warning but allow usage + logger.warning( + f"Chat model may not be fully supported by vLLM: {llm_family.model_name}" ) - # Check chat ability with flexible logic - # vLLM Chat should work with models that have conversation capabilities - has_chat_capability = ( - "chat" in llm_family.model_ability - or "generate" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - ) - - if not has_chat_capability: - return MatchResult.failure( - reason=f"vLLM Chat requires conversation capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Additional check: ensure model is not purely a tool model without conversation - if set(llm_family.model_ability) == {"tools"}: - return MatchResult.failure( - reason=f"Model only has 'tools' capability without conversation support: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Tool-only model detected", - ) - - return MatchResult.success() + return True def _sanitize_chat_config( self, @@ -1737,47 +1617,26 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Vision models have the same format restrictions as base VLLM supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"vLLM Vision does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Vision model unsupported format: {llm_spec.model_format}", - ) + return f"vLLM Vision does not support model format: {llm_spec.model_format}" # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"vLLM Vision pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Vision pytorch + {quantization} not supported", - ) + return f"vLLM Vision pytorch format does not support quantization: {quantization}" # AWQ vision models support more quantization levels than base if llm_spec.model_format == "awq": if not any(q in quantization for q in ("4", "8")): - return MatchResult.failure( - reason=f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Vision AWQ + {quantization} not supported", - ) + return f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}" # Check vision model compatibility with flexible matching def is_vision_model_supported( @@ -1815,30 +1674,17 @@ def is_vision_model_supported( if not llm_family.model_family or not is_vision_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST ): - return 
MatchResult.failure( - reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom vision family: {llm_family.model_family}", - ) + return f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}" else: if not llm_family.model_name or not is_vision_model_supported( llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST ): - return MatchResult.failure( - reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported vision model: {llm_family.model_name}", + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Vision model may not be fully supported by vLLM: {llm_family.model_name}" ) - # Check vision ability - if "vision" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"vLLM Vision requires 'vision' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - return MatchResult.success() + return True def _sanitize_model_config( self, model_config: Optional[VLLMModelConfig] diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index 2d3edde1c2..f844825d6c 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -15,13 +15,12 @@ import os from abc import abstractmethod from collections import defaultdict -from typing import Dict, List, Literal, Optional +from typing import Dict, List, Literal, Optional, Union from ..._compat import BaseModel from ...types import Rerank from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin -from .match_result import MatchResult from .rerank_family import check_engine_by_model_name_and_engine, match_rerank logger = logging.getLogger(__name__) @@ -119,7 +118,7 @@ def __init__( @classmethod @abstractmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: pass @classmethod @@ -129,62 +128,24 @@ def match_json( model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: pass - @classmethod - def match_with_reason( - cls, - model_family: RerankModelFamilyV2, - model_spec: RerankSpecV1, - quantization: str, - ) -> "MatchResult": - """ - Check if the engine can handle the given rerank model with detailed error information. - - This method provides detailed failure reasons and suggestions when an engine - cannot handle a specific model configuration. The default implementation - falls back to the boolean match_json method for backward compatibility. 
- - Args: - model_family: The rerank model family information - model_spec: The model specification - quantization: The quantization method - - Returns: - MatchResult: Detailed match result with reasons and suggestions - """ - from .match_result import ErrorType, MatchResult - - # Default implementation for backward compatibility - if cls.match_json(model_family, model_spec, quantization): - return MatchResult.success() - else: - # Get basic reason based on common failure patterns - if not cls.check_lib(): - return MatchResult.failure( - reason=f"Required library for {cls.__name__} is not available", - error_type=ErrorType.DEPENDENCY_MISSING, - ) - else: - return MatchResult.failure( - reason=f"Rerank model configuration is not compatible with {cls.__name__}", - error_type=ErrorType.MODEL_COMPATIBILITY, - ) - @classmethod def match( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ): + ) -> bool: """ Return if the model_spec can be matched. """ - if not cls.check_lib(): + lib_result = cls.check_lib() + if lib_result != True: return False - return cls.match_json(model_family, model_spec, quantization) + match_result = cls.match_json(model_family, model_spec, quantization) + return match_result == True @staticmethod def _get_tokenizer(model_path): diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index 42332bc477..eddc58ac06 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -16,7 +16,7 @@ import logging import threading import uuid -from typing import List, Optional, Sequence +from typing import List, Optional, Sequence, Union import numpy as np import torch @@ -31,7 +31,6 @@ RerankModelFamilyV2, RerankSpecV1, ) -from ..match_result import MatchResult from ..utils import preprocess_sentence logger = logging.getLogger(__name__) @@ -332,8 +331,12 @@ def format_instruction(instruction, query, doc): return Rerank(id=str(uuid.uuid1()), results=docs, meta=metadata) @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("sentence_transformers") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("sentence_transformers") is not None + else "sentence_transformers library is not installed" + ) @classmethod def match_json( @@ -341,44 +344,19 @@ def match_json( model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ) -> bool: - pass - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: RerankModelFamilyV2, - model_spec: RerankSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="Sentence Transformers library is not installed for reranking", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="sentence_transformers package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["pytorch"]: - return MatchResult.failure( - reason=f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: 
{model_spec.model_format}, required: pytorch", - ) + return f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}" # Check rerank-specific requirements if not hasattr(model_family, "model_name"): - return MatchResult.failure( - reason="Rerank model family requires model name specification", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing model_name in rerank model family", - ) + return "Rerank model family requires model name specification" # Check model type compatibility if model_family.type and model_family.type not in [ @@ -389,27 +367,15 @@ def match_with_reason( "LLM-based", "LLM-based layerwise", ]: - return MatchResult.failure( - reason=f"Model type '{model_family.type}' may not be compatible with reranking engines", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Model type: {model_family.type}", - ) + return f"Model type '{model_family.type}' may not be compatible with reranking engines" # Check max tokens limit for reranking performance max_tokens = model_family.max_tokens if max_tokens and max_tokens > 8192: # High token limits for reranking - return MatchResult.failure( - reason=f"High max_tokens limit for reranking model: {max_tokens}", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"High max_tokens for reranking: {max_tokens}", - ) + return f"High max_tokens limit for reranking model: {max_tokens}, may cause performance issues" # Check language compatibility if not model_family.language or len(model_family.language) == 0: - return MatchResult.failure( - reason="Rerank model language information is missing", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing language information in rerank model", - ) + return "Rerank model language information is missing" - return MatchResult.success() + return True diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index c2ee75cfef..4f63c0136c 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -1,11 +1,10 @@ import importlib.util import uuid -from typing import List, Optional +from typing import List, Optional, Union from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 -from ..match_result import MatchResult SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] @@ -140,8 +139,12 @@ def rerank( return Rerank(id=str(uuid.uuid4()), results=reranked_docs, meta=metadata) @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("vllm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("vllm") is not None + else "vllm library is not installed" + ) @classmethod def match_json( @@ -149,35 +152,15 @@ def match_json( model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ) -> bool: - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: RerankModelFamilyV2, - model_spec: RerankSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="vLLM library is not installed for reranking", - error_type=ErrorType.DEPENDENCY_MISSING, - 
technical_details="vllm package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["pytorch"]: - return MatchResult.failure( - reason=f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", - ) + return f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}" # Check model name prefix matching if model_spec.model_format == "pytorch": @@ -187,33 +170,17 @@ def match_with_reason( if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]: # Special handling for Qwen3 models if "qwen3" not in model_family.model_name.lower(): - return MatchResult.failure( - reason=f"Model family prefix not supported by vLLM reranking: {prefix}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported prefix: {prefix}", - ) + return f"Model family prefix not supported by vLLM reranking: {prefix}" except (IndexError, AttributeError): - return MatchResult.failure( - reason="Unable to parse model family name for vLLM compatibility check", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"Model name parsing failed: {model_family.model_name}", - ) + return f"Unable to parse model family name for vLLM compatibility check: {model_family.model_name}" # Check rerank-specific requirements if not hasattr(model_family, "model_name"): - return MatchResult.failure( - reason="Rerank model family requires model name specification for vLLM", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing model_name in vLLM rerank model family", - ) + return "Rerank model family requires model name specification for vLLM" # Check max tokens limit for vLLM reranking performance max_tokens = model_family.max_tokens if max_tokens and max_tokens > 4096: # vLLM has stricter limits - return MatchResult.failure( - reason=f"High max_tokens limit for vLLM reranking model: {max_tokens}", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"High max_tokens for vLLM reranking: {max_tokens}", - ) + return f"High max_tokens limit for vLLM reranking model: {max_tokens}, may cause performance issues" - return MatchResult.success() + return True diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ea7adb309e..3442d38ea1 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -506,33 +506,59 @@ def get_engine_params_by_name( if model_name in LLM_ENGINES and LLM_ENGINES[model_name]: # Try to get model family for testing try: - from .llm.llm_family import match_llm + pass + + # Get the full model family instead of a single spec + from .llm.llm_family import BUILTIN_LLM_FAMILIES + + llm_family = None + for family in BUILTIN_LLM_FAMILIES: + if model_name == family.model_name: + llm_family = family + break - llm_family = match_llm(model_name, None, None, None, None) if llm_family and llm_family.model_specs: - llm_spec = llm_family.model_specs[0] - quantization = llm_spec.quantization or "none" # Test each engine class for detailed error info for engine_class in llm_engine_classes: try: - if hasattr(engine_class, "match_with_reason"): - pass + engine_compatible = False + error_details = None - result = engine_class.match_with_reason( - llm_family, llm_spec, quantization + # Try each model spec to find one compatible with this engine + for 
llm_spec in llm_family.model_specs: + quantization = ( + llm_spec.quantization or "none" ) - if not result.is_match: - detailed_error = { - "error": result.reason, - "error_type": result.error_type, - "technical_details": result.technical_details, - } - break + + if hasattr(engine_class, "match_json"): + match_result = engine_class.match_json( + llm_family, llm_spec, quantization + ) + if match_result == True: + engine_compatible = True + break # Found compatible spec + else: + # Save error details, but continue trying other specs + error_details = { + "error": ( + match_result + if isinstance( + match_result, str + ) + else "Engine is not compatible" + ), + "error_type": "model_compatibility", + "technical_details": f"The {engine_class.__name__} engine cannot handle the current model configuration: {llm_spec.model_format} format", + } + + if not engine_compatible and error_details: + detailed_error = error_details + break except Exception as e: # Fall back to next engine class with clear error logging logger.warning( - f"Engine class {engine_class.__name__} match_with_reason failed: {e}" + f"Engine class {engine_class.__name__} match_json failed: {e}" ) # Continue to try next engine class, but this is expected behavior for fallback continue @@ -555,8 +581,15 @@ def get_engine_params_by_name( for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() # type: ignore[assignment] - if not lib_available: + lib_result = engine_class.check_lib() + if lib_result != True: + # If check_lib returns a string, it's an error message + error_msg = ( + lib_result + if isinstance(lib_result, str) + else f"Engine {engine_name} library check failed" + ) + engine_params[engine_name] = error_msg break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -564,17 +597,49 @@ def get_engine_params_by_name( try: # Create a minimal test spec if we don't have real model specs from .llm.llm_family import ( + AwqLLMSpecV2, + GgmlLLMSpecV2, + GptqLLMSpecV2, LLMFamilyV2, + MLXLLMSpecV2, PytorchLLMSpecV2, ) - # Create a minimal test case + # Create appropriate test spec based on engine class + engine_name_lower = ( + engine_class.__name__.lower() + ) + if "mlx" in engine_name_lower: + # MLX engines need MLX format + test_spec_class = MLXLLMSpecV2 + model_format = "mlx" + elif ( + "ggml" in engine_name_lower + or "llamacpp" in engine_name_lower + ): + # GGML/llama.cpp engines need GGML format + test_spec_class = GgmlLLMSpecV2 + model_format = "ggmlv3" + elif "gptq" in engine_name_lower: + # GPTQ engines need GPTQ format + test_spec_class = GptqLLMSpecV2 + model_format = "gptq" + elif "awq" in engine_name_lower: + # AWQ engines need AWQ format + test_spec_class = AwqLLMSpecV2 + model_format = "awq" + else: + # Default to PyTorch format + test_spec_class = PytorchLLMSpecV2 + model_format = "pytorch" + + # Create a minimal test case with appropriate format test_family = LLMFamilyV2( model_name="test", model_family="test", model_specs=[ - PytorchLLMSpecV2( - model_format="pytorch", + test_spec_class( + model_format=model_format, quantization="none", ) ], @@ -597,11 +662,21 @@ def get_engine_params_by_name( break elif hasattr(engine_class, "match_json"): # Fallback to simple match method - use test data - if engine_class.match_json( + match_result = engine_class.match_json( test_family, test_spec, "none" - ): - break + ) + if match_result == True: + break # Engine is available else: + # Get detailed error 
information + error_message = ( + match_result + if isinstance(match_result, str) + else f"Engine {engine_name} is not compatible with current model or environment" + ) + engine_params[engine_name] = ( + error_message + ) break else: # Final fallback: generic import check @@ -653,9 +728,7 @@ def get_engine_params_by_name( return engine_params elif model_type == "embedding": - from .embedding.embed_family import ( - EMBEDDING_ENGINES, - ) + from .embedding.embed_family import EMBEDDING_ENGINES from .embedding.embed_family import ( SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES, ) @@ -716,14 +789,23 @@ def get_engine_params_by_name( ) test_spec = test_family.model_specs[0] - # Use the engine's match method to check compatibility - if embedding_engine_class.match( - test_family, test_spec, "none" - ): + # Use the engine's match_json method to check compatibility and get detailed error + match_result = ( + embedding_engine_class.match_json( + test_family, test_spec, "none" + ) + ) + if match_result == True: break # Engine is available else: + # Get detailed error information + error_message = ( + match_result + if isinstance(match_result, str) + else f"Engine {engine_name} is not compatible with current model or environment" + ) embedding_error_details = { - "error": f"Engine {engine_name} is not compatible with current model or environment", + "error": error_message, "error_type": "model_compatibility", "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", } @@ -789,9 +871,7 @@ def get_engine_params_by_name( return engine_params elif model_type == "rerank": - from .rerank.rerank_family import ( - RERANK_ENGINES, - ) + from .rerank.rerank_family import RERANK_ENGINES from .rerank.rerank_family import SUPPORTED_ENGINES as RERANK_SUPPORTED_ENGINES if model_name not in RERANK_ENGINES: @@ -850,14 +930,21 @@ def get_engine_params_by_name( ) test_spec = test_family.model_specs[0] - # Use the engine's match method to check compatibility - if rerank_engine_class.match( + # Use the engine's match_json method to check compatibility and get detailed error + match_result = rerank_engine_class.match_json( test_family, test_spec, "none" - ): + ) + if match_result == True: break # Engine is available else: + # Get detailed error information + error_message = ( + match_result + if isinstance(match_result, str) + else f"Engine {engine_name} is not compatible with current model or environment" + ) rerank_error_details = { - "error": f"Engine {engine_name} is not compatible with current model or environment", + "error": error_message, "error_type": "model_compatibility", "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", } From 26ca06f9645f0691cded28dbd2243f27a70912c1 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 14:27:58 +0800 Subject: [PATCH 32/37] pre-commit --- xinference/model/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 3442d38ea1..12be38ec71 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -728,7 +728,9 @@ def get_engine_params_by_name( return engine_params elif model_type == "embedding": - from .embedding.embed_family import EMBEDDING_ENGINES + from .embedding.embed_family import ( + EMBEDDING_ENGINES, + ) from .embedding.embed_family import ( SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES, ) From 48a272d2bed187982f95bbff0d5f7cc9ce517b19 Mon 
Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 14:47:14 +0800 Subject: [PATCH 33/37] mypy-error --- xinference/model/llm/sglang/core.py | 10 +++++----- xinference/model/utils.py | 18 ++++-------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index ccb44c00bd..7d5d13d229 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -448,7 +448,7 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool: f"Custom model family may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if not is_model_supported( + if not llm_family.model_name or not is_model_supported( llm_family.model_name.lower(), [s.lower() for s in SGLANG_SUPPORTED_MODELS], ): @@ -797,7 +797,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_chat_model_supported( + if not llm_family.model_family or not is_chat_model_supported( llm_family.model_family.lower(), SGLANG_SUPPORTED_CHAT_MODELS ): # Instead of hard rejection, give a warning but allow usage @@ -805,7 +805,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: f"Custom chat model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if not is_chat_model_supported( + if not llm_family.model_name or not is_chat_model_supported( llm_family.model_name.lower(), [s.lower() for s in SGLANG_SUPPORTED_CHAT_MODELS], ): @@ -944,7 +944,7 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_vision_model_supported( + if not llm_family.model_family or not is_vision_model_supported( llm_family.model_family.lower(), SGLANG_SUPPORTED_VISION_MODEL_LIST ): # Instead of hard rejection, give a warning but allow usage @@ -952,7 +952,7 @@ def is_vision_model_supported( f"Custom vision model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if not is_vision_model_supported( + if not llm_family.model_name or not is_vision_model_supported( llm_family.model_name.lower(), [s.lower() for s in SGLANG_SUPPORTED_VISION_MODEL_LIST], ): diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 12be38ec71..35f5b21fdc 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -597,9 +597,7 @@ def get_engine_params_by_name( try: # Create a minimal test spec if we don't have real model specs from .llm.llm_family import ( - AwqLLMSpecV2, - GgmlLLMSpecV2, - GptqLLMSpecV2, + LlamaCppLLMSpecV2, LLMFamilyV2, MLXLLMSpecV2, PytorchLLMSpecV2, @@ -618,18 +616,10 @@ def get_engine_params_by_name( or "llamacpp" in engine_name_lower ): # GGML/llama.cpp engines need GGML format - test_spec_class = GgmlLLMSpecV2 - model_format = "ggmlv3" - elif "gptq" in engine_name_lower: - # GPTQ engines need GPTQ format - test_spec_class = GptqLLMSpecV2 - model_format = "gptq" - elif "awq" in engine_name_lower: - # AWQ engines need AWQ format - test_spec_class = AwqLLMSpecV2 - model_format = "awq" + test_spec_class = LlamaCppLLMSpecV2 + model_format = "ggufv2" else: - # Default to PyTorch format + # Default to PyTorch format (supports gptq, awq, fp8, bnb) test_spec_class = PytorchLLMSpecV2 model_format = "pytorch" From 0acb4711751c2d295cbeb037763407b0735aa229 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 17:54:11 +0800 Subject: 
[PATCH 34/37] fix mlx CI bug --- xinference/model/llm/mlx/core.py | 44 +++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index ab8f1608db..b391ac97b8 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -423,6 +423,14 @@ def match_json( if llm_spec.model_format not in ["mlx"]: return f"MLX engine only supports MLX format, got: {llm_spec.model_format}" + # Base MLX model should not handle chat or vision models + # Those should be handled by MLXChatModel and MLXVisionModel respectively + model_abilities = getattr(llm_family, "model_ability", []) + if "chat" in model_abilities: + return False # Let MLXChatModel handle this + if "vision" in model_abilities: + return False # Let MLXVisionModel handle this + # Check memory constraints for Apple Silicon model_size = float(str(llm_spec.model_size_in_billions)) if model_size > 70: # Large models may be problematic @@ -729,10 +737,28 @@ def _sanitize_generate_config( def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> Union[bool, str]: - # First run base class checks - base_result = super().match_json(llm_family, llm_spec, quantization) - if base_result != True: - return base_result + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result + + # Check model format compatibility + if llm_spec.model_format not in ["mlx"]: + return f"MLX Chat engine only supports MLX format, got: {llm_spec.model_format}" + + # Check that this model has chat ability + model_abilities = getattr(llm_family, "model_ability", []) + if "chat" not in model_abilities: + return False # Not a chat model + + # MLX Chat doesn't support vision + if "vision" in model_abilities: + return False # Let MLXVisionModel handle this + + # Check memory constraints for Apple Silicon + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Large models may be problematic + return f"MLX Chat may have memory limitations with very large models ({model_size}B parameters)" return True @@ -801,6 +827,16 @@ def match_json( if llm_spec.model_format not in ["mlx"]: return f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}" + # Check that this model has vision ability + model_abilities = getattr(llm_family, "model_ability", []) + if "vision" not in model_abilities: + return False # Not a vision model + + # Check memory constraints for Apple Silicon + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Large models may be problematic + return f"MLX Vision may have memory limitations with very large models ({model_size}B parameters)" + return True def _load_model(self, **kwargs): From 1b973b41f50de563b97f256318ce47ca839abe3c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 30 Oct 2025 16:19:19 +0800 Subject: [PATCH 35/37] fix CI bug --- xinference/model/embedding/vllm/core.py | 72 +++++++++++++++++-- xinference/model/rerank/vllm/core.py | 62 ++++++++++++++-- .../model/rerank/vllm/tests/test_vllm.py | 1 + 3 files changed, 124 insertions(+), 11 deletions(-) diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py index 8fc32ebac8..674eeaa21e 100644 --- a/xinference/model/embedding/vllm/core.py +++ b/xinference/model/embedding/vllm/core.py @@ -22,7 +22,7 @@ from ..core import EmbeddingModel, EmbeddingModelFamilyV2, 
EmbeddingSpecV1 logger = logging.getLogger(__name__) -SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] +SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"] class VLLMEmbeddingModel(EmbeddingModel): @@ -32,16 +32,44 @@ def __init__(self, *args, **kwargs): def load(self): try: + # Handle vLLM-transformers config conflict by setting environment variable + import os + + os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache_vllm" + from vllm import LLM - except ImportError: + except ImportError as e: error_message = "Failed to import module 'vllm'" installation_guide = [ "Please make sure 'vllm' is installed. ", "You can install it by `pip install vllm`\n", ] + # Check if it's a config conflict error + if "aimv2" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try upgrading transformers or using a different vLLM version.\n", + ] + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + except Exception as e: + # Handle config registration conflicts + if "aimv2" in str(e) and "already used by a Transformers config" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try: pip install --upgrade transformers vllm\n", + ] + raise RuntimeError(f"{error_message}\n\n{''.join(installation_guide)}") + raise + if self.model_family.model_name in { "Qwen3-Embedding-0.6B", "Qwen3-Embedding-4B", @@ -168,11 +196,41 @@ def match_json( if lib_result != True: return lib_result - if model_spec.model_format in ["pytorch"]: - prefix = model_family.model_name.split("-", 1)[0] - if prefix in SUPPORTED_MODELS_PREFIXES: - return True - return f"VLLM Embedding engine only supports pytorch format models with supported prefixes, got format: {model_spec.model_format}, model: {model_family.model_name}" + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return f"VLLM Embedding engine only supports pytorch format models, got format: {model_spec.model_format}" + + # Check model name prefix matching + prefix = model_family.model_name.split("-", 1)[0] + if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]: + return f"VLLM Embedding engine only supports models with prefixes {SUPPORTED_MODELS_PREFIXES}, got model: {model_family.model_name}" + + # Additional runtime compatibility checks for vLLM version + try: + import vllm + from packaging.version import Version + + vllm_version = Version(vllm.__version__) + + # Check for vLLM version compatibility issues + if vllm_version >= Version("0.10.0") and vllm_version < Version("0.11.0"): + # vLLM 0.10.x has V1 engine issues on CPU + import platform + + if platform.system() == "Darwin" and platform.machine() in [ + "arm64", + "arm", + ]: + # Check if this is likely to run on CPU (most common for testing) + return f"vLLM {vllm_version} has compatibility issues with embedding models on Apple Silicon CPUs. Consider using a different platform or vLLM version." 
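+ # Only the Darwin + arm64/arm case inside this 0.10.x branch rejects the
+ # engine (the V1-engine-on-CPU issue noted above); on other platforms the
+ # 0.10.x check falls through and the remaining checks below still run.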
+ elif vllm_version >= Version("0.11.0"): + # vLLM 0.11+ should have fixed the config conflict issue + pass + except Exception: + # If version check fails, continue with basic validation + pass + + return True def wait_for_load(self): # set context length after engine inited diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index 4f63c0136c..2c6d9dbeed 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -6,22 +6,49 @@ from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 -SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] +SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"] class VLLMRerankModel(RerankModel): def load(self): try: + # Handle vLLM-transformers config conflict by setting environment variable + import os + + os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache_vllm" + from vllm import LLM - except ImportError: + except ImportError as e: error_message = "Failed to import module 'vllm'" installation_guide = [ "Please make sure 'vllm' is installed. ", "You can install it by `pip install vllm`\n", ] + # Check if it's a config conflict error + if "aimv2" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try upgrading transformers or using a different vLLM version.\n", + ] + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + except Exception as e: + # Handle config registration conflicts + if "aimv2" in str(e) and "already used by a Transformers config" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try: pip install --upgrade transformers vllm\n", + ] + raise RuntimeError(f"{error_message}\n\n{''.join(installation_guide)}") + raise if self.model_family.model_name in { "Qwen3-Reranker-0.6B", @@ -180,7 +207,34 @@ def match_json( # Check max tokens limit for vLLM reranking performance max_tokens = model_family.max_tokens - if max_tokens and max_tokens > 4096: # vLLM has stricter limits - return f"High max_tokens limit for vLLM reranking model: {max_tokens}, may cause performance issues" + if ( + max_tokens and max_tokens > 32768 + ): # vLLM has stricter limits, but Qwen3 can handle up to 32k + return f"Max tokens limit too high for vLLM reranking model: {max_tokens}, exceeds safe limit" + + # Additional runtime compatibility checks for vLLM version + try: + import vllm + from packaging.version import Version + + vllm_version = Version(vllm.__version__) + + # Check for vLLM version compatibility issues + if vllm_version >= Version("0.10.0") and vllm_version < Version("0.11.0"): + # vLLM 0.10.x has V1 engine issues on CPU + import platform + + if platform.system() == "Darwin" and platform.machine() in [ + "arm64", + "arm", + ]: + # Check if this is likely to run on CPU (most common for testing) + return f"vLLM {vllm_version} has compatibility issues with reranking models on Apple Silicon CPUs. Consider using a different platform or vLLM version." 
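+ # The reranker mirrors the embedding engine's Apple Silicon guard; since
+ # the whole probe sits in a try/except, a failed version lookup is simply
+ # ignored and the earlier format/prefix/max_tokens checks decide the result.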
+ elif vllm_version >= Version("0.11.0"): + # vLLM 0.11+ should have fixed the config conflict issue + pass + except Exception: + # If version check fails, continue with basic validation + pass return True diff --git a/xinference/model/rerank/vllm/tests/test_vllm.py b/xinference/model/rerank/vllm/tests/test_vllm.py index 37b948ac42..578b62bdd4 100644 --- a/xinference/model/rerank/vllm/tests/test_vllm.py +++ b/xinference/model/rerank/vllm/tests/test_vllm.py @@ -61,6 +61,7 @@ def test_qwen3_vllm(setup): model_name="Qwen3-Reranker-0.6B", model_type="rerank", model_engine="vllm", + max_num_batched_tokens=81920, # Allow larger batch size for Qwen3 ) model = client.get_model(model_uid) From f52824a70484083cd68ef82341d4f4e9b87d8863 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 10 Nov 2025 16:44:45 +0800 Subject: [PATCH 36/37] modify embedding sentence_transformers --- xinference/model/embedding/sentence_transformers/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 4e1c7b8b73..6521358a3f 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -449,13 +449,13 @@ def match_json( # Check model dimensions compatibility model_dimensions = model_family.dimensions - if model_dimensions > 1536: # Very large embedding models - return f"Large embedding model detected ({model_dimensions} dimensions), may have performance issues" + if model_dimensions > 8192: # Extremely large embedding models + return f"Extremely large embedding model detected ({model_dimensions} dimensions), may have performance issues" # Check token limits max_tokens = model_family.max_tokens - if max_tokens > 8192: # Very high token limits - return f"High token limit model detected (max_tokens: {max_tokens}), may cause memory issues" + if max_tokens > 131072: # Extremely high token limits (128K) + return f"Extremely high token limit model detected (max_tokens: {max_tokens}), may cause memory issues" # Check for special model requirements model_name = model_family.model_name.lower() From dd2f141d06d5716b274e41c95ac5dee7bcc64575 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 12 Nov 2025 10:44:08 +0800 Subject: [PATCH 37/37] modify embedding sentence_transformers --- xinference/model/embedding/vllm/core.py | 47 +++++++++++++++++++++++-- xinference/model/rerank/vllm/core.py | 40 +++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py index 674eeaa21e..c037ce2b53 100644 --- a/xinference/model/embedding/vllm/core.py +++ b/xinference/model/embedding/vllm/core.py @@ -89,6 +89,34 @@ def load(self): is_matryoshka=True, ) + # Set appropriate VLLM configuration parameters based on model capabilities + model_max_tokens = getattr(self.model_family, "max_tokens", 512) + + # Set max_model_len based on model family capabilities with reasonable limits + max_model_len = min(model_max_tokens, 8192) + if "max_model_len" not in self._kwargs: + self._kwargs["max_model_len"] = max_model_len + + # Ensure max_num_batched_tokens is sufficient for large models + if "max_num_batched_tokens" not in self._kwargs: + # max_num_batched_tokens should be at least max_model_len + # Set to a reasonable minimum that satisfies the constraint + self._kwargs["max_num_batched_tokens"] 
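Patch 36 only widens the two sanity thresholds in the sentence_transformers match_json so that large, long-context embedding models are no longer rejected outright. The sketch below restates that check under illustrative figures (the 4096-dimension, 32K-token model is an assumption, not taken from the patch); check_limits is a hypothetical helper following the same True-or-reason-string convention.

def check_limits(dimensions: int, max_tokens: int):
    # True when acceptable, a reason string otherwise.
    if dimensions > 8192:
        return f"Extremely large embedding model detected ({dimensions} dimensions)"
    if max_tokens > 131072:
        return f"Extremely high token limit model detected (max_tokens: {max_tokens})"
    return True

# A 4096-dimension, 32K-context embedding model would have tripped the old
# 1536/8192 thresholds but passes the relaxed ones.
assert check_limits(4096, 32768) is True
assert check_limits(16384, 32768) is not True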
From dd2f141d06d5716b274e41c95ac5dee7bcc64575 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 12 Nov 2025 10:44:08 +0800
Subject: [PATCH 37/37] modify embedding sentence_transformers

---
 xinference/model/embedding/vllm/core.py | 47 +++++++++++++++++++++++--
 xinference/model/rerank/vllm/core.py    | 40 +++++++++++++++++++++
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py
index 674eeaa21e..c037ce2b53 100644
--- a/xinference/model/embedding/vllm/core.py
+++ b/xinference/model/embedding/vllm/core.py
@@ -89,6 +89,34 @@ def load(self):
                 is_matryoshka=True,
             )

+        # Set appropriate VLLM configuration parameters based on model capabilities
+        model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+        # Set max_model_len based on model family capabilities with reasonable limits
+        max_model_len = min(model_max_tokens, 8192)
+        if "max_model_len" not in self._kwargs:
+            self._kwargs["max_model_len"] = max_model_len
+
+        # Ensure max_num_batched_tokens is sufficient for large models
+        if "max_num_batched_tokens" not in self._kwargs:
+            # max_num_batched_tokens should be at least max_model_len
+            # Set to a reasonable minimum that satisfies the constraint
+            self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+        # Configure other reasonable defaults for embedding models
+        if "gpu_memory_utilization" not in self._kwargs:
+            self._kwargs["gpu_memory_utilization"] = 0.7
+
+        # Use a smaller block size for better compatibility
+        if "block_size" not in self._kwargs:
+            self._kwargs["block_size"] = 16
+
+        logger.debug(
+            f"VLLM configuration for {self.model_family.model_name}: "
+            f"max_model_len={self._kwargs.get('max_model_len')}, "
+            f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+        )
+
         self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()

@@ -246,6 +274,21 @@ def _set_context_length(self):
                 self._model.llm_engine.vllm_config.model_config.max_model_len
             )
         else:
-            # v1
-            logger.warning("vLLM v1 is not supported, ignore context length setting")
+            # v1 - Get max_model_len from the v1 engine configuration
+            try:
+                # For v1, access the config differently
+                if hasattr(self._model.llm_engine, "vllm_config"):
+                    self._context_length = (
+                        self._model.llm_engine.vllm_config.model_config.max_model_len
+                    )
+                elif hasattr(self._model.llm_engine, "model_config"):
+                    self._context_length = (
+                        self._model.llm_engine.model_config.max_model_len
+                    )
+                else:
+                    # Fallback to the configured value
+                    self._context_length = self._kwargs.get("max_model_len", 512)
+            except Exception as e:
+                logger.warning(f"Failed to get context length from vLLM v1 engine: {e}")
+                self._context_length = self._kwargs.get("max_model_len", 512)
         logger.debug("Model context length: %s", self._context_length)

diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index 2c6d9dbeed..9729a2ccc7 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -1,4 +1,6 @@
 import importlib.util
+import json
+import logging
 import uuid
 from typing import List, Optional, Union

@@ -6,6 +8,8 @@
 from ...utils import cache_clean
 from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1

+logger = logging.getLogger(__name__)
+
 SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"]


@@ -67,6 +71,42 @@ def load(self):
                 classifier_from_token=["no", "yes"],
                 is_original_qwen3_reranker=True,
             )
+        elif isinstance(self._kwargs["hf_overrides"], str):
+            self._kwargs["hf_overrides"] = json.loads(self._kwargs["hf_overrides"])
+            self._kwargs["hf_overrides"].update(
+                architectures=["Qwen3ForSequenceClassification"],
+                classifier_from_token=["no", "yes"],
+                is_original_qwen3_reranker=True,
+            )
+
+        # Set appropriate VLLM configuration parameters based on model capabilities
+        model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+        # Set max_model_len based on model family capabilities with reasonable limits
+        max_model_len = min(model_max_tokens, 8192)
+        if "max_model_len" not in self._kwargs:
+            self._kwargs["max_model_len"] = max_model_len
+
+        # Ensure max_num_batched_tokens is sufficient for large models
+        if "max_num_batched_tokens" not in self._kwargs:
+            # max_num_batched_tokens should be at least max_model_len
+            # Set to a reasonable minimum that satisfies the constraint
+            self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+        # Configure other reasonable defaults for reranking models
+        if "gpu_memory_utilization" not in self._kwargs:
+            self._kwargs["gpu_memory_utilization"] = 0.7
+
+        # Use a smaller block size for better compatibility
+        if "block_size" not in self._kwargs:
+            self._kwargs["block_size"] = 16
+
+        logger.debug(
+            f"VLLM configuration for rerank model {self.model_family.model_name}: "
+            f"max_model_len={self._kwargs.get('max_model_len')}, "
+            f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+        )
+
         self._model = LLM(model=self._model_path, task="score", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
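
The load-time defaults added in patch 37 are the same for the embedding and rerank wrappers: clamp max_model_len to the model's context size, keep max_num_batched_tokens at or above it (the patch comments note it should be at least max_model_len), and only fill values the caller did not pass. A minimal sketch of that pattern, with apply_vllm_defaults as a hypothetical helper and family_max_tokens standing in for model_family.max_tokens:

def apply_vllm_defaults(kwargs: dict, family_max_tokens: int = 512) -> dict:
    # Clamp max_model_len to the model's context size, capped at 8192.
    max_model_len = min(family_max_tokens, 8192)
    kwargs.setdefault("max_model_len", max_model_len)
    # Keep max_num_batched_tokens >= max_model_len, with a floor of 4096.
    kwargs.setdefault("max_num_batched_tokens", max(4096, max_model_len))
    kwargs.setdefault("gpu_memory_utilization", 0.7)
    kwargs.setdefault("block_size", 16)
    return kwargs

# Caller-supplied values win, matching the "not in self._kwargs" guards above:
print(apply_vllm_defaults({"max_num_batched_tokens": 81920}, family_max_tokens=32768))
# {'max_num_batched_tokens': 81920, 'max_model_len': 8192,
#  'gpu_memory_utilization': 0.7, 'block_size': 16}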