From daa305adeab4a1b7b1332256257c036280bcef37 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 13 Oct 2025 17:39:17 +0800
Subject: [PATCH 01/37] FEAT: add engine ability display
---
xinference/model/llm/vllm/core.py | 25 ++-
xinference/model/utils.py | 255 ++++++++++++++++++++++++++++--
2 files changed, 263 insertions(+), 17 deletions(-)
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 4da42ed48b..58b0a523aa 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -851,7 +851,30 @@ def _sanitize_generate_config(
@classmethod
def check_lib(cls) -> bool:
- return importlib.util.find_spec("vllm") is not None
+ if importlib.util.find_spec("vllm") is None:
+ return False
+
+ try:
+ import vllm
+
+ if not getattr(vllm, "__version__", None):
+ return False
+
+ # Check version
+ from packaging import version
+
+ if version.parse(vllm.__version__) < version.parse("0.3.0"):
+ return False
+
+ # Check CUDA
+ import torch
+
+ if not torch.cuda.is_available():
+ return False
+
+ return True
+ except Exception:
+ return False
@classmethod
def match_json(
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index ea5dec74d5..0d8e471bb0 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -14,6 +14,7 @@
import asyncio
import functools
+import importlib.util
import json
import logging
import os
@@ -472,44 +473,266 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def get_engine_params_by_name(
model_type: Optional[str], model_name: str
-) -> Optional[Dict[str, List[dict]]]:
+) -> Optional[Dict[str, Union[List[dict], str]]]:
if model_type == "LLM":
- from .llm.llm_family import LLM_ENGINES
+ from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES
if model_name not in LLM_ENGINES:
return None
- # filter llm_class
- engine_params = deepcopy(LLM_ENGINES[model_name])
- for engine, params in engine_params.items():
+ # Get all supported engines, not just currently available ones
+ all_supported_engines = list(SUPPORTED_ENGINES.keys())
+ engine_params = {}
+
+ # First add currently available engine parameters
+ available_engines = deepcopy(LLM_ENGINES[model_name])
+ for engine, params in available_engines.items():
for param in params:
- del param["llm_class"]
+ # Drop the "available" flag; engines listed here are available by definition
+ if "available" in param:
+ del param["available"]
+ engine_params[engine] = params
+
+ # Check unavailable engines
+ for engine_name in all_supported_engines:
+ if engine_name not in engine_params: # Engine not in available list
+ try:
+ engine_classes = SUPPORTED_ENGINES[engine_name]
+ error_msg = None
+
+ # Try to find specific error reasons
+ for engine_class in engine_classes:
+ try:
+ if hasattr(engine_class, "check_lib"):
+ lib_available = engine_class.check_lib()
+ if not lib_available:
+ error_msg = (
+ f"Engine {engine_name} library is not available"
+ )
+ break
+ else:
+ # If no check_lib method, try import check
+ module_name = engine_name.lower().replace(".", "")
+ if engine_name == "vLLM":
+ module_name = "vllm"
+ elif engine_name == "SGLang":
+ module_name = "sglang"
+ elif engine_name == "llama.cpp":
+ module_name = "llama_cpp"
+ elif engine_name == "MLX":
+ module_name = "mlx"
+ elif engine_name == "LMDEPLOY":
+ module_name = "lmdeploy"
+ elif engine_name == "Transformers":
+ module_name = "transformers"
+
+ importlib.import_module(module_name)
+ break
+ except ImportError as e:
+ error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ except Exception as e:
+ error_msg = (
+ f"Engine {engine_name} is not available: {str(e)}"
+ )
+
+ if error_msg is None:
+ error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+
+ # For unavailable engines, directly return error message string
+ engine_params[engine_name] = error_msg
+
+ except Exception as e:
+ # If exception occurs during checking, return error message string
+ engine_params[engine_name] = (
+ f"Error checking engine {engine_name}: {str(e)}"
+ )
+
+ # Filter out llm_class field
+ for engine, params in engine_params.items():
+ if isinstance(
+ params, list
+ ): # Only process parameter lists of available engines
+ for param in params:
+ if "llm_class" in param:
+ del param["llm_class"]
return engine_params
elif model_type == "embedding":
- from .embedding.embed_family import EMBEDDING_ENGINES
+ from .embedding.embed_family import (
+ EMBEDDING_ENGINES,
+ )
+ from .embedding.embed_family import (
+ SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES,
+ )
if model_name not in EMBEDDING_ENGINES:
return None
- # filter embedding_class
- engine_params = deepcopy(EMBEDDING_ENGINES[model_name])
- for engine, params in engine_params.items():
+ # Get all supported engines, not just currently available ones
+ all_supported_engines = list(EMBEDDING_SUPPORTED_ENGINES.keys())
+ engine_params = {}
+
+ # First add currently available engine parameters
+ available_engines = deepcopy(EMBEDDING_ENGINES[model_name])
+ for engine, params in available_engines.items():
for param in params:
- del param["embedding_class"]
+ # Drop the "available" flag; engines listed here are available by definition
+ if "available" in param:
+ del param["available"]
+ engine_params[engine] = params
+
+ # Check unavailable engines
+ for engine_name in all_supported_engines:
+ if engine_name not in engine_params: # Engine not in available list
+ try:
+ engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name]
+ error_msg = None
+
+ # Try to find specific error reasons
+ for engine_class in engine_classes:
+ try:
+ if hasattr(engine_class, "check_lib"):
+ lib_available = engine_class.check_lib()
+ if not lib_available:
+ error_msg = (
+ f"Engine {engine_name} library is not available"
+ )
+ break
+ else:
+ # If no check_lib method, try import check
+ module_name = engine_name.lower().replace(".", "")
+ if engine_name == "vLLM":
+ module_name = "vllm"
+ elif engine_name == "SGLang":
+ module_name = "sglang"
+ elif engine_name == "llama.cpp":
+ module_name = "llama_cpp"
+ elif engine_name == "MLX":
+ module_name = "mlx"
+ elif engine_name == "LMDEPLOY":
+ module_name = "lmdeploy"
+ elif engine_name == "Transformers":
+ module_name = "transformers"
+ elif engine_name == "SentenceTransformers":
+ module_name = "sentence_transformers"
+
+ importlib.import_module(module_name)
+ break
+ except ImportError as e:
+ error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ except Exception as e:
+ error_msg = (
+ f"Engine {engine_name} is not available: {str(e)}"
+ )
+
+ if error_msg is None:
+ error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+
+ # For unavailable engines, directly return error message string
+ engine_params[engine_name] = error_msg
+
+ except Exception as e:
+ # If exception occurs during checking, return error message string
+ engine_params[engine_name] = (
+ f"Error checking engine {engine_name}: {str(e)}"
+ )
+
+ # Filter out embedding_class field
+ for engine, params in engine_params.items():
+ if isinstance(
+ params, list
+ ): # Only process parameter lists of available engines
+ for param in params:
+ if "embedding_class" in param:
+ del param["embedding_class"]
return engine_params
elif model_type == "rerank":
- from .rerank.rerank_family import RERANK_ENGINES
+ from .rerank.rerank_family import (
+ RERANK_ENGINES,
+ )
+ from .rerank.rerank_family import SUPPORTED_ENGINES as RERANK_SUPPORTED_ENGINES
if model_name not in RERANK_ENGINES:
return None
- # filter rerank_class
- engine_params = deepcopy(RERANK_ENGINES[model_name])
- for engine, params in engine_params.items():
+ # Get all supported engines, not just currently available ones
+ all_supported_engines = list(RERANK_SUPPORTED_ENGINES.keys())
+ engine_params = {}
+
+ # First add currently available engine parameters
+ available_engines = deepcopy(RERANK_ENGINES[model_name])
+ for engine, params in available_engines.items():
for param in params:
- del param["rerank_class"]
+ # Drop the "available" flag; engines listed here are available by definition
+ if "available" in param:
+ del param["available"]
+ engine_params[engine] = params
+
+ # Check unavailable engines
+ for engine_name in all_supported_engines:
+ if engine_name not in engine_params: # Engine not in available list
+ try:
+ engine_classes = RERANK_SUPPORTED_ENGINES[engine_name]
+ error_msg = None
+
+ # Try to find specific error reasons
+ for engine_class in engine_classes:
+ try:
+ if hasattr(engine_class, "check_lib"):
+ lib_available = engine_class.check_lib()
+ if not lib_available:
+ error_msg = (
+ f"Engine {engine_name} library is not available"
+ )
+ break
+ else:
+ # If no check_lib method, try import check
+ module_name = engine_name.lower().replace(".", "")
+ if engine_name == "vLLM":
+ module_name = "vllm"
+ elif engine_name == "SGLang":
+ module_name = "sglang"
+ elif engine_name == "llama.cpp":
+ module_name = "llama_cpp"
+ elif engine_name == "MLX":
+ module_name = "mlx"
+ elif engine_name == "LMDEPLOY":
+ module_name = "lmdeploy"
+ elif engine_name == "Transformers":
+ module_name = "transformers"
+ elif engine_name == "SentenceTransformers":
+ module_name = "sentence_transformers"
+
+ importlib.import_module(module_name)
+ break
+ except ImportError as e:
+ error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ except Exception as e:
+ error_msg = (
+ f"Engine {engine_name} is not available: {str(e)}"
+ )
+
+ if error_msg is None:
+ error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+
+ # For unavailable engines, directly return error message string
+ engine_params[engine_name] = error_msg
+
+ except Exception as e:
+ # If exception occurs during checking, return error message string
+ engine_params[engine_name] = (
+ f"Error checking engine {engine_name}: {str(e)}"
+ )
+
+ # Filter out rerank_class field
+ for engine, params in engine_params.items():
+ if isinstance(
+ params, list
+ ): # Only process parameter lists of available engines
+ for param in params:
+ if "rerank_class" in param:
+ del param["rerank_class"]
return engine_params
else:
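With this patch, get_engine_params_by_name no longer silently drops engines that cannot run: available engines still map to a list of parameter dicts, while unavailable ones map to a plain error string. A rough sketch of the resulting payload and how a caller might branch on it; the engine names and fields below are illustrative, not a captured response:

# Illustrative shape only (assumed example, not actual output of the patched function).
engine_params = {
    "Transformers": [
        {"model_format": "pytorch", "model_size_in_billions": 7, "quantizations": ["none"]},
    ],
    "vLLM": "Engine vLLM library is not available",
    "SGLang": "Engine SGLang library is not installed: No module named 'sglang'",
}

for engine, value in engine_params.items():
    if isinstance(value, str):
        print(f"{engine}: unavailable ({value})")  # error string from the new code path
    else:
        print(f"{engine}: {len(value)} launchable configuration(s)")

Patch 02 below applies the same list-versus-string branch on the frontend.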
From 5347c4be930b4125382555c1328b78b4fd8a1fce Mon Sep 17 00:00:00 2001
From: yiboyasss <3359595624@qq.com>
Date: Mon, 13 Oct 2025 18:16:12 +0800
Subject: [PATCH 02/37] feat: frontend supports engine ability display
---
.../components/launchModelDrawer.js | 69 ++++++-------------
.../launch_model/components/selectField.js | 42 +++++++++++
2 files changed, 64 insertions(+), 47 deletions(-)
create mode 100644 xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js
diff --git a/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js b/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js
index 1169f06269..ccff202111 100644
--- a/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js
+++ b/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js
@@ -13,15 +13,11 @@ import {
CircularProgress,
Collapse,
Drawer,
- FormControl,
FormControlLabel,
- InputLabel,
ListItemButton,
ListItemText,
- MenuItem,
Radio,
RadioGroup,
- Select,
Switch,
TextField,
Tooltip,
@@ -39,45 +35,11 @@ import DynamicFieldList from './dynamicFieldList'
import getModelFormConfig from './modelFormConfig'
import PasteDialog from './pasteDialog'
import Progress from './progress'
+import SelectField from './selectField'
const enginesWithNWorker = ['SGLang', 'vLLM', 'MLX']
const modelEngineType = ['LLM', 'embedding', 'rerank']
-const SelectField = ({
- label,
- labelId,
- name,
- value,
- onChange,
- options = [],
- disabled = false,
- required = false,
-}) => (
-
- {label}
-
-
-)
-
const LaunchModelDrawer = ({
modelData,
modelType,
@@ -549,19 +511,32 @@ const LaunchModelDrawer = ({
const engineItems = useMemo(() => {
return engineOptions.map((engine) => {
- const modelFormats = Array.from(
- new Set(enginesObj[engine]?.map((item) => item.model_format))
- )
+ const engineData = enginesObj[engine]
+ let modelFormats = []
+ let label = engine
+ let disabled = false
+
+ if (Array.isArray(engineData)) {
+ modelFormats = Array.from(
+ new Set(engineData.map((item) => item.model_format))
+ )
- const relevantSpecs = modelData.model_specs.filter((spec) =>
- modelFormats.includes(spec.model_format)
- )
+ const relevantSpecs = modelData.model_specs.filter((spec) =>
+ modelFormats.includes(spec.model_format)
+ )
+
+ const cached = relevantSpecs.some((spec) => isCached(spec))
- const cached = relevantSpecs.some((spec) => isCached(spec))
+ label = cached ? `${engine} ${t('launchModel.cached')}` : engine
+ } else if (typeof engineData === 'string') {
+ label = `${engine} (${engineData})`
+ disabled = true
+ }
return {
value: engine,
- label: cached ? `${engine} ${t('launchModel.cached')}` : engine,
+ label,
+ disabled,
}
})
}, [engineOptions, enginesObj, modelData])
diff --git a/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js b/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js
new file mode 100644
index 0000000000..7e9a4af8ce
--- /dev/null
+++ b/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js
@@ -0,0 +1,42 @@
+import { FormControl, InputLabel, MenuItem, Select } from '@mui/material'
+
+const SelectField = ({
+ label,
+ labelId,
+ name,
+ value,
+ onChange,
+ options = [],
+ disabled = false,
+ required = false,
+}) => (
+
+ {label}
+
+
+)
+
+export default SelectField
From 2466777ddf2a3431f35b7770b9003a78242cdbe3 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 09:52:09 +0800
Subject: [PATCH 03/37] FEAT: add engine ability display
---
xinference/model/utils.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 0d8e471bb0..ea1c18eec8 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -474,6 +474,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def get_engine_params_by_name(
model_type: Optional[str], model_name: str
) -> Optional[Dict[str, Union[List[dict], str]]]:
+ engine_params: Optional[Dict[str, Union[List[dict], str]]] = None
+
if model_type == "LLM":
from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES
From 8e1fa20df8db50443bd75271424a0f2fba834a41 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 10:01:29 +0800
Subject: [PATCH 04/37] FEAT: add engine ability display
---
xinference/model/utils.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index ea1c18eec8..7763b6fba5 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -506,7 +506,7 @@ def get_engine_params_by_name(
for engine_class in engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available = engine_class.check_lib()
+ lib_available: bool = engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
@@ -587,14 +587,14 @@ def get_engine_params_by_name(
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
- engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name]
+ engine_classes: Any = EMBEDDING_SUPPORTED_ENGINES[engine_name]
error_msg = None
# Try to find specific error reasons
for engine_class in engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available = engine_class.check_lib()
+ lib_available: bool = engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
@@ -675,14 +675,14 @@ def get_engine_params_by_name(
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
- engine_classes = RERANK_SUPPORTED_ENGINES[engine_name]
+ engine_classes: Any = RERANK_SUPPORTED_ENGINES[engine_name]
error_msg = None
# Try to find specific error reasons
for engine_class in engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available = engine_class.check_lib()
+ lib_available: bool = engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
From da58bf468322393589b63b788e7c5b78c32a6568 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 10:48:18 +0800
Subject: [PATCH 05/37] FEAT: add engine ability display
---
xinference/model/utils.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 7763b6fba5..d1bd6f072f 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -473,8 +473,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def get_engine_params_by_name(
model_type: Optional[str], model_name: str
-) -> Optional[Dict[str, Union[List[dict], str]]]:
- engine_params: Optional[Dict[str, Union[List[dict], str]]] = None
+) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]:
+ engine_params: Optional[Dict[str, Union[List[Dict[str, Any]], str]]] = None
if model_type == "LLM":
from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES
@@ -506,7 +506,7 @@ def get_engine_params_by_name(
for engine_class in engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available: bool = engine_class.check_lib()
+ lib_available = engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
@@ -587,14 +587,14 @@ def get_engine_params_by_name(
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
- engine_classes: Any = EMBEDDING_SUPPORTED_ENGINES[engine_name]
+ engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name]
error_msg = None
# Try to find specific error reasons
for engine_class in engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available: bool = engine_class.check_lib()
+ lib_available = engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
@@ -675,14 +675,14 @@ def get_engine_params_by_name(
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
- engine_classes: Any = RERANK_SUPPORTED_ENGINES[engine_name]
+ engine_classes = RERANK_SUPPORTED_ENGINES[engine_name]
error_msg = None
# Try to find specific error reasons
for engine_class in engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available: bool = engine_class.check_lib()
+ lib_available = engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
From 38aad40977460da0b3548005d545b2eb03d50bf6 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 10:52:46 +0800
Subject: [PATCH 06/37] FEAT: add engine ability display
---
xinference/model/utils.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index d1bd6f072f..42f1e5913d 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -499,11 +499,11 @@ def get_engine_params_by_name(
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
- engine_classes = SUPPORTED_ENGINES[engine_name]
+ llm_engine_classes = SUPPORTED_ENGINES[engine_name]
error_msg = None
# Try to find specific error reasons
- for engine_class in engine_classes:
+ for engine_class in llm_engine_classes:
try:
if hasattr(engine_class, "check_lib"):
lib_available = engine_class.check_lib()
@@ -587,11 +587,11 @@ def get_engine_params_by_name(
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
- engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name]
+ embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name]
error_msg = None
# Try to find specific error reasons
- for engine_class in engine_classes:
+ for engine_class in embedding_engine_classes:
try:
if hasattr(engine_class, "check_lib"):
lib_available = engine_class.check_lib()
@@ -675,11 +675,11 @@ def get_engine_params_by_name(
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
- engine_classes = RERANK_SUPPORTED_ENGINES[engine_name]
+ rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name]
error_msg = None
# Try to find specific error reasons
- for engine_class in engine_classes:
+ for engine_class in rerank_engine_classes:
try:
if hasattr(engine_class, "check_lib"):
lib_available = engine_class.check_lib()
From a679c3b2be223097099b400f36775a4fd109ac68 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 11:02:01 +0800
Subject: [PATCH 07/37] FEAT: add engine ability display
---
xinference/model/utils.py | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 42f1e5913d..373a7d24d9 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def get_engine_params_by_name(
model_type: Optional[str], model_name: str
) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]:
- engine_params: Optional[Dict[str, Union[List[Dict[str, Any]], str]]] = None
+ engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {}
if model_type == "LLM":
from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES
@@ -484,7 +484,6 @@ def get_engine_params_by_name(
# Get all supported engines, not just currently available ones
all_supported_engines = list(SUPPORTED_ENGINES.keys())
- engine_params = {}
# First add currently available engine parameters
available_engines = deepcopy(LLM_ENGINES[model_name])
@@ -572,7 +571,6 @@ def get_engine_params_by_name(
# Get all supported engines, not just currently available ones
all_supported_engines = list(EMBEDDING_SUPPORTED_ENGINES.keys())
- engine_params = {}
# First add currently available engine parameters
available_engines = deepcopy(EMBEDDING_ENGINES[model_name])
@@ -660,7 +658,6 @@ def get_engine_params_by_name(
# Get all supported engines, not just currently available ones
all_supported_engines = list(RERANK_SUPPORTED_ENGINES.keys())
- engine_params = {}
# First add currently available engine parameters
available_engines = deepcopy(RERANK_ENGINES[model_name])
From 340ff708d41410062c0386e14ffeb505b2a6fbe9 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 11:11:30 +0800
Subject: [PATCH 08/37] FEAT: add engine ability display
---
xinference/model/utils.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 373a7d24d9..5f2d437219 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -589,10 +589,10 @@ def get_engine_params_by_name(
error_msg = None
# Try to find specific error reasons
- for engine_class in embedding_engine_classes:
+ for embedding_engine_class in embedding_engine_classes:
try:
- if hasattr(engine_class, "check_lib"):
- lib_available = engine_class.check_lib()
+ if hasattr(embedding_engine_class, "check_lib"):
+ lib_available = embedding_engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
@@ -676,10 +676,10 @@ def get_engine_params_by_name(
error_msg = None
# Try to find specific error reasons
- for engine_class in rerank_engine_classes:
+ for rerank_engine_class in rerank_engine_classes:
try:
- if hasattr(engine_class, "check_lib"):
- lib_available = engine_class.check_lib()
+ if hasattr(rerank_engine_class, "check_lib"):
+ lib_available = rerank_engine_class.check_lib()
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
From 19e1e2a1fdea15472a18be13073784a11901c70e Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 11:26:09 +0800
Subject: [PATCH 09/37] FEAT: add engine ability display
---
xinference/model/utils.py | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 5f2d437219..b073cc879b 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -505,7 +505,7 @@ def get_engine_params_by_name(
for engine_class in llm_engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available = engine_class.check_lib()
+ lib_available: bool = engine_class.check_lib() # type: ignore[assignment]
if not lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
@@ -540,11 +540,11 @@ def get_engine_params_by_name(
error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg
+ engine_params[engine_name] = error_msg # type: ignore[arg-type]
except Exception as e:
# If exception occurs during checking, return error message string
- engine_params[engine_name] = (
+ engine_params[engine_name] = ( # type: ignore[arg-type]
f"Error checking engine {engine_name}: {str(e)}"
)
@@ -592,8 +592,8 @@ def get_engine_params_by_name(
for embedding_engine_class in embedding_engine_classes:
try:
if hasattr(embedding_engine_class, "check_lib"):
- lib_available = embedding_engine_class.check_lib()
- if not lib_available:
+ embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment]
+ if not embedding_lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
)
@@ -629,11 +629,11 @@ def get_engine_params_by_name(
error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg
+ engine_params[engine_name] = error_msg # type: ignore[arg-type]
except Exception as e:
# If exception occurs during checking, return error message string
- engine_params[engine_name] = (
+ engine_params[engine_name] = ( # type: ignore[arg-type]
f"Error checking engine {engine_name}: {str(e)}"
)
@@ -679,8 +679,8 @@ def get_engine_params_by_name(
for rerank_engine_class in rerank_engine_classes:
try:
if hasattr(rerank_engine_class, "check_lib"):
- lib_available = rerank_engine_class.check_lib()
- if not lib_available:
+ rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment]
+ if not rerank_lib_available:
error_msg = (
f"Engine {engine_name} library is not available"
)
@@ -716,11 +716,11 @@ def get_engine_params_by_name(
error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg
+ engine_params[engine_name] = error_msg # type: ignore[arg-type]
except Exception as e:
# If exception occurs during checking, return error message string
- engine_params[engine_name] = (
+ engine_params[engine_name] = ( # type: ignore[arg-type]
f"Error checking engine {engine_name}: {str(e)}"
)
From cc84a84bc2817c28268f965d9d161def9a458f2c Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 11:48:54 +0800
Subject: [PATCH 10/37] FEAT: add engine ability display
---
xinference/model/utils.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index b073cc879b..783ceba2e4 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -540,11 +540,11 @@ def get_engine_params_by_name(
error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg # type: ignore[arg-type]
+ engine_params[engine_name] = error_msg
except Exception as e:
# If exception occurs during checking, return error message string
- engine_params[engine_name] = ( # type: ignore[arg-type]
+ engine_params[engine_name] = (
f"Error checking engine {engine_name}: {str(e)}"
)
@@ -629,11 +629,11 @@ def get_engine_params_by_name(
error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg # type: ignore[arg-type]
+ engine_params[engine_name] = error_msg
except Exception as e:
# If exception occurs during checking, return error message string
- engine_params[engine_name] = ( # type: ignore[arg-type]
+ engine_params[engine_name] = (
f"Error checking engine {engine_name}: {str(e)}"
)
@@ -716,11 +716,11 @@ def get_engine_params_by_name(
error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg # type: ignore[arg-type]
+ engine_params[engine_name] = error_msg
except Exception as e:
# If exception occurs during checking, return error message string
- engine_params[engine_name] = ( # type: ignore[arg-type]
+ engine_params[engine_name] = (
f"Error checking engine {engine_name}: {str(e)}"
)
From d9b3a434c09a4f2b552aedec487258a6b432ca3c Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 14 Oct 2025 11:57:11 +0800
Subject: [PATCH 11/37] FEAT: add engine ability display
---
xinference/model/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 783ceba2e4..18de3c26e4 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def get_engine_params_by_name(
model_type: Optional[str], model_name: str
) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]:
- engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {}
+ engine_params: Dict[str, Any] = {}
if model_type == "LLM":
from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES
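Patches 04 through 11 mostly shuffle annotations and variable names in utils.py. The churn is consistent with two mypy complaints: storing an Optional[str] error message in a dict whose declared value type does not admit None, and annotating the same helper variable more than once inside one function. A hypothetical minimal reproduction of that kind of friction, not taken from the repository:

# Hypothetical snippet; names are stand-ins, not code from xinference.
from typing import Any, Dict, List, Optional, Union


def some_check() -> bool:  # stand-in for engine_class.check_lib()
    return False


def collect_errors() -> None:
    engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {}

    error_msg: Optional[str] = None
    if not some_check():
        error_msg = "Engine vLLM library is not available"

    # mypy flags this assignment: error_msg is Optional[str] here, but the dict's
    # declared value type does not allow None. Patch 09 silences similar lines with
    # "# type: ignore[arg-type]"; patch 11 instead widens the dict to Dict[str, Any].
    engine_params["vLLM"] = error_msg

    lib_available: bool = some_check()
    if engine_params:
        # Re-annotating a name already annotated in this scope is also rejected by
        # mypy, which matches the per-branch renames in patches 06 and 08 and the
        # reverted annotations in patch 05.
        lib_available: bool = some_check()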
From d9d313699613323e94b83b4ad0ff141986d2f209 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 11:22:30 +0800
Subject: [PATCH 12/37] Refine implementation approach for engine availability checks
---
xinference/model/embedding/core.py | 40 ++
xinference/model/embedding/llama_cpp/core.py | 62 ++-
.../embedding/sentence_transformers/core.py | 77 ++-
xinference/model/llm/core.py | 38 ++
xinference/model/llm/llama_cpp/core.py | 59 ++-
xinference/model/llm/lmdeploy/core.py | 64 ++-
xinference/model/llm/mlx/core.py | 158 +++++-
xinference/model/llm/sglang/core.py | 229 +++++++--
xinference/model/llm/transformers/core.py | 70 ++-
xinference/model/llm/vllm/core.py | 461 +++++++++++++++---
xinference/model/rerank/core.py | 40 ++
.../rerank/sentence_transformers/core.py | 75 ++-
xinference/model/rerank/vllm/core.py | 73 ++-
xinference/model/utils.py | 140 ++++--
14 files changed, 1390 insertions(+), 196 deletions(-)
diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index fffbc7633c..299ec4c5d1 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -171,6 +171,46 @@ def match_json(
) -> bool:
pass
+ @classmethod
+ def match_json_with_reason(
+ cls,
+ model_family: EmbeddingModelFamilyV2,
+ model_spec: EmbeddingSpecV1,
+ quantization: str,
+ ) -> "MatchResult":
+ """
+ Check if the engine can handle the given embedding model with detailed error information.
+
+ This method provides detailed failure reasons and suggestions when an engine
+ cannot handle a specific model configuration. The default implementation
+ falls back to the boolean match_json method for backward compatibility.
+
+ Args:
+ model_family: The embedding model family information
+ model_spec: The model specification
+ quantization: The quantization method
+
+ Returns:
+ MatchResult: Detailed match result with reasons and suggestions
+ """
+ from .match_result import ErrorType, MatchResult
+
+ # Default implementation for backward compatibility
+ if cls.match_json(model_family, model_spec, quantization):
+ return MatchResult.success()
+ else:
+ # Get basic reason based on common failure patterns
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason=f"Required library for {cls.__name__} is not available",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ )
+ else:
+ return MatchResult.failure(
+ reason=f"Embedding model configuration is not compatible with {cls.__name__}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ )
+
@classmethod
def match(
cls,
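The match_result module imported above (ErrorType, MatchResult) does not appear in this patch's diffstat, so it is presumably added elsewhere in the series. A minimal sketch of the interface these call sites assume, inferred from usage only (success()/failure() constructors, an is_match flag, and the error labels used throughout the patch):

# Assumed interface of the match_result module; a sketch, not the actual file.
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class ErrorType(Enum):
    DEPENDENCY_MISSING = "dependency_missing"
    MODEL_FORMAT = "model_format"
    MODEL_COMPATIBILITY = "model_compatibility"
    ABILITY_MISMATCH = "ability_mismatch"
    QUANTIZATION = "quantization"
    HARDWARE_REQUIREMENT = "hardware_requirement"
    OS_REQUIREMENT = "os_requirement"
    VERSION_REQUIREMENT = "version_requirement"
    CONFIGURATION_ERROR = "configuration_error"


@dataclass
class MatchResult:
    is_match: bool
    reason: Optional[str] = None
    error_type: Optional[ErrorType] = None
    technical_details: Optional[str] = None

    @classmethod
    def success(cls) -> "MatchResult":
        return cls(is_match=True)

    @classmethod
    def failure(
        cls,
        reason: str,
        error_type: Optional[ErrorType] = None,
        technical_details: Optional[str] = None,
    ) -> "MatchResult":
        return cls(
            is_match=False,
            reason=reason,
            error_type=error_type,
            technical_details=technical_details,
        )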
diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py
index fb8c4e45ca..6e2908ffdd 100644
--- a/xinference/model/embedding/llama_cpp/core.py
+++ b/xinference/model/embedding/llama_cpp/core.py
@@ -235,6 +235,64 @@ def match_json(
model_spec: EmbeddingSpecV1,
quantization: str,
) -> bool:
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls,
+ model_family: EmbeddingModelFamilyV2,
+ model_spec: EmbeddingSpecV1,
+ quantization: str,
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="llama.cpp library (xllamacpp) is not installed for embedding",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="xllamacpp package not found in Python environment",
+ )
+
+ # Check model format compatibility
if model_spec.model_format not in ["ggufv2"]:
- return False
- return True
+ return MatchResult.failure(
+ reason=f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {model_spec.model_format}, required: ggufv2",
+ )
+
+ # Check embedding-specific requirements
+ if not hasattr(model_spec, "model_file_name_template"):
+ return MatchResult.failure(
+ reason="GGUF embedding model requires proper file configuration",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details="Missing model_file_name_template for GGUF embedding",
+ )
+
+ # Check model dimensions for llama.cpp compatibility
+ model_dimensions = model_family.dimensions
+ if model_dimensions > 4096: # llama.cpp may have limitations
+ return MatchResult.failure(
+ reason=f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Large embedding dimensions: {model_dimensions}",
+ )
+
+ # Check platform-specific considerations
+ import platform
+
+ current_platform = platform.system()
+
+ # llama.cpp works across platforms but may have performance differences
+ if current_platform == "Windows":
+ return MatchResult.failure(
+ reason="llama.cpp embedding may have limited performance on Windows",
+ error_type=ErrorType.OS_REQUIREMENT,
+ technical_details=f"Windows platform: {current_platform}",
+ )
+
+ return MatchResult.success()
diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py
index 05f7753e8e..843d68ea37 100644
--- a/xinference/model/embedding/sentence_transformers/core.py
+++ b/xinference/model/embedding/sentence_transformers/core.py
@@ -434,5 +434,78 @@ def match_json(
model_spec: EmbeddingSpecV1,
quantization: str,
) -> bool:
- # As default embedding engine, sentence-transformer support all models
- return model_spec.model_format in ["pytorch"]
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls,
+ model_family: EmbeddingModelFamilyV2,
+ model_spec: EmbeddingSpecV1,
+ quantization: str,
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="Sentence Transformers library is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="sentence_transformers package not found in Python environment",
+ )
+
+ # Check model format compatibility
+ if model_spec.model_format not in ["pytorch"]:
+ return MatchResult.failure(
+ reason=f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch",
+ )
+
+ # Check model dimensions compatibility
+ model_dimensions = model_family.dimensions
+ if model_dimensions > 1536: # Very large embedding models
+ return MatchResult.failure(
+ reason=f"Large embedding model detected ({model_dimensions} dimensions)",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Large embedding dimensions: {model_dimensions}",
+ )
+
+ # Check token limits
+ max_tokens = model_family.max_tokens
+ if max_tokens > 8192: # Very high token limits
+ return MatchResult.failure(
+ reason=f"High token limit model detected (max_tokens: {max_tokens})",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details=f"High max_tokens: {max_tokens}",
+ )
+
+ # Check for special model requirements
+ model_name = model_family.model_name.lower()
+
+ # Check Qwen2 GTE models
+ if "gte" in model_name and "qwen2" in model_name:
+ # These models have specific requirements
+ if not hasattr(cls, "_check_qwen_gte_requirements"):
+ return MatchResult.failure(
+ reason="Qwen2 GTE models require special handling",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details="Qwen2 GTE model special requirements",
+ )
+
+ # Check Qwen3 models
+ if "qwen3" in model_name:
+ # Qwen3 has flash attention requirements
+ try:
+ # This would be checked during actual loading
+ pass
+ except Exception:
+ return MatchResult.failure(
+ reason="Qwen3 embedding model may have compatibility issues",
+ error_type=ErrorType.VERSION_REQUIREMENT,
+ technical_details="Qwen3 model compatibility check",
+ )
+
+ return MatchResult.success()
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
index 8abc8f04a6..ee446d024a 100644
--- a/xinference/model/llm/core.py
+++ b/xinference/model/llm/core.py
@@ -31,6 +31,7 @@
if TYPE_CHECKING:
from .llm_family import LLMFamilyV2, LLMSpecV1
+ from .match_result import ErrorType, MatchResult
logger = logging.getLogger(__name__)
@@ -159,6 +160,43 @@ def match_json(
) -> bool:
raise NotImplementedError
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ """
+ Check if the engine can handle the given model with detailed error information.
+
+ This method provides detailed failure reasons and suggestions when an engine
+ cannot handle a specific model configuration. The default implementation
+ falls back to the boolean match_json method for backward compatibility.
+
+ Args:
+ llm_family: The model family information
+ llm_spec: The model specification
+ quantization: The quantization method
+
+ Returns:
+ MatchResult: Detailed match result with reasons and suggestions
+ """
+ from .match_result import ErrorType, MatchResult
+
+ # Default implementation for backward compatibility
+ if cls.match_json(llm_family, llm_spec, quantization):
+ return MatchResult.success()
+ else:
+ # Get basic reason based on common failure patterns
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason=f"Required library for {cls.__name__} is not available",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ )
+ else:
+ return MatchResult.failure(
+ reason=f"Model configuration is not compatible with {cls.__name__}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ )
+
def prepare_parse_reasoning_content(
self, reasoning_content: bool, enable_thinking: bool = True
):
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index d009378dbe..f35fae9f6e 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -86,14 +86,67 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
) -> bool:
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="llama.cpp library (xllamacpp) is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="xllamacpp package not found in Python environment",
+ )
+
+ # Check model format compatibility
if llm_spec.model_format not in ["ggufv2"]:
- return False
+ return MatchResult.failure(
+ reason=f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {llm_spec.model_format}, required: ggufv2",
+ )
+
+ # Check model abilities - llama.cpp supports both chat and generation
if (
"chat" not in llm_family.model_ability
and "generate" not in llm_family.model_ability
):
- return False
- return True
+ return MatchResult.failure(
+ reason=f"llama.cpp requires 'chat' or 'generate' ability, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # Check platform-specific issues
+ import platform
+
+ current_platform = platform.system()
+
+ # Check for ARM64 specific issues
+ if current_platform == "Darwin" and platform.machine() == "arm64":
+ # Apple Silicon specific checks could go here
+ pass
+ elif current_platform == "Windows":
+ # Windows specific checks could go here
+ pass
+
+ # Check memory requirements (basic heuristic)
+ model_size = float(str(llm_spec.model_size_in_billions))
+ if model_size > 70: # Very large models
+ return MatchResult.failure(
+ reason=f"llama.cpp may struggle with very large models ({model_size}B parameters)",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Large model size: {model_size}B parameters",
+ )
+
+ return MatchResult.success()
def load(self):
try:
diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py
index 0144a6f734..cd0aa892cf 100644
--- a/xinference/model/llm/lmdeploy/core.py
+++ b/xinference/model/llm/lmdeploy/core.py
@@ -121,7 +121,22 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ return MatchResult.failure(
+ reason="LMDeploy base model does not support direct inference",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details="LMDeploy base model class is not intended for direct use",
+ )
def generate(
self,
@@ -174,13 +189,52 @@ def load(self):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability first
+ if not LMDEPLOY_INSTALLED:
+ return MatchResult.failure(
+ reason="LMDeploy library is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="lmdeploy package not found in Python environment",
+ )
+
+ # Check model format compatibility and quantization
if llm_spec.model_format == "awq":
- # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
+ # LMDeploy has specific AWQ quantization requirements
if "4" not in quantization:
- return False
+ return MatchResult.failure(
+ reason=f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"AWQ + {quantization} not supported by LMDeploy",
+ )
+
+ # Check model compatibility
if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS:
- return False
- return LMDEPLOY_INSTALLED
+ return MatchResult.failure(
+ reason=f"Chat model not supported by LMDeploy: {llm_family.model_name}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported chat model: {llm_family.model_name}",
+ )
+
+ # Check model abilities - LMDeploy primarily supports chat models
+ if "chat" not in llm_family.model_ability:
+ return MatchResult.failure(
+ reason=f"LMDeploy Chat requires 'chat' ability, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ return MatchResult.success()
async def async_chat(
self,
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index 80b9c4be2f..cf24d31fdf 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -411,17 +411,67 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if llm_spec.model_format not in ["mlx"]:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="MLX library (mlx_lm) is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="mlx_lm package not found in Python environment",
+ )
+
+ # Check platform compatibility - MLX only works on Apple Silicon
if sys.platform != "darwin" or platform.processor() != "arm":
- # only work for Mac M chips
- return False
+ return MatchResult.failure(
+ reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)",
+ error_type=ErrorType.OS_REQUIREMENT,
+ technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm",
+ )
+
+ # Check model format compatibility
+ if llm_spec.model_format not in ["mlx"]:
+ return MatchResult.failure(
+ reason=f"MLX engine only supports MLX format, got: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx",
+ )
+
+ # Check model abilities - MLX supports generation but not chat/vision in this base class
if "generate" not in llm_family.model_ability:
- return False
+ return MatchResult.failure(
+ reason=f"MLX engine requires 'generate' ability, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # MLX base model doesn't support chat or vision
if "chat" in llm_family.model_ability or "vision" in llm_family.model_ability:
- # do not process chat or vision
- return False
- return True
+ return MatchResult.failure(
+ reason="MLX base model does not support chat or vision abilities",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Unsupported abilities for base MLX: {[a for a in llm_family.model_ability if a in ['chat', 'vision']]}",
+ )
+
+ # Check memory constraints for Apple Silicon
+ model_size = float(str(llm_spec.model_size_in_billions))
+ if model_size > 70: # Large models may be problematic
+ return MatchResult.failure(
+ reason=f"MLX may have memory limitations with very large models ({model_size}B parameters)",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Large model size: {model_size}B on Apple Silicon",
+ )
+
+ return MatchResult.success()
def _get_prompt_cache(
self, prompt, lora_name: Optional[str] = None, model: Any = None
@@ -722,17 +772,39 @@ def _sanitize_generate_config(
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if llm_spec.model_format not in ["mlx"]:
- return False
- if sys.platform != "darwin" or platform.processor() != "arm":
- # only work for Mac M chips
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Use base class validation first
+ base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ if not base_result.is_match:
+ return base_result
+
+ # Check chat ability
if "chat" not in llm_family.model_ability:
- return False
+ return MatchResult.failure(
+ reason=f"MLX Chat requires 'chat' ability, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # MLX Chat doesn't support vision
if "vision" in llm_family.model_ability:
- # do not process vision
- return False
- return True
+ return MatchResult.failure(
+ reason="MLX Chat model does not support vision abilities",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Vision ability not supported in MLXChatModel",
+ )
+
+ return MatchResult.success()
def chat(
self,
@@ -786,14 +858,54 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if llm_spec.model_format not in ["mlx"]:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability first - MLX Vision uses mlx_vlm
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="MLX Vision library (mlx_vlm) is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="mlx_vlm package not found in Python environment",
+ )
+
+ # Check platform compatibility
if sys.platform != "darwin" or platform.processor() != "arm":
- # only work for Mac M chips
- return False
+ return MatchResult.failure(
+ reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)",
+ error_type=ErrorType.OS_REQUIREMENT,
+ technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm",
+ )
+
+ # Check model format compatibility
+ if llm_spec.model_format not in ["mlx"]:
+ return MatchResult.failure(
+ reason=f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx",
+ )
+
+ # Check vision ability
if "vision" not in llm_family.model_ability:
- return False
- return True
+ return MatchResult.failure(
+ reason=f"MLX Vision requires 'vision' ability, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # Check for distributed inference limitations
+ # MLX Vision models don't support distributed inference
+ # This could be checked here if needed
+
+ return MatchResult.success()
def _load_model(self, **kwargs):
try:
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index d3bbfc1570..f3658b5ed7 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -15,6 +15,7 @@
import json
import logging
import multiprocessing
+import platform
import sys
import threading
import time
@@ -341,24 +342,104 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability first
+ if not SGLANG_INSTALLED:
+ return MatchResult.failure(
+ reason="SGLang library is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="sglang package not found in Python environment",
+ )
+
+ # Check hardware requirements - SGLang requires CUDA
if not cls._has_cuda_device():
- return False
+ return MatchResult.failure(
+ reason="SGLang requires CUDA GPU support",
+ error_type=ErrorType.HARDWARE_REQUIREMENT,
+ technical_details="No CUDA devices detected",
+ )
+
+ # Check OS requirements
if not cls._is_linux():
- return False
- if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
- return False
+ return MatchResult.failure(
+ reason="SGLang only supports Linux operating system",
+ error_type=ErrorType.OS_REQUIREMENT,
+ technical_details=f"Current OS: {platform.system()}, required: Linux",
+ )
+
+ # Check model format compatibility
+ supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
+ if llm_spec.model_format not in supported_formats:
+ return MatchResult.failure(
+ reason=f"SGLang does not support model format: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {llm_spec.model_format}",
+ )
+
+ # Check quantization compatibility with format
if llm_spec.model_format == "pytorch":
- if quantization != "none" and not (quantization is None):
- return False
+ if quantization != "none" and quantization is not None:
+ return MatchResult.failure(
+ reason=f"SGLang pytorch format does not support quantization: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"pytorch + {quantization} combination not supported",
+ )
+
+ # Check model compatibility
if isinstance(llm_family, CustomLLMFamilyV2):
if llm_family.model_family not in SGLANG_SUPPORTED_MODELS:
- return False
+ return MatchResult.failure(
+ reason=f"Custom model family not supported by SGLang: {llm_family.model_family}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Custom family: {llm_family.model_family}",
+ )
else:
if llm_family.model_name not in SGLANG_SUPPORTED_MODELS:
- return False
- if "generate" not in llm_family.model_ability:
- return False
- return SGLANG_INSTALLED
+ return MatchResult.failure(
+ reason=f"Model not supported by SGLang: {llm_family.model_name}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported model: {llm_family.model_name}",
+ )
+
+ # Check model abilities with flexible logic
+ # SGLang can handle models with various text generation capabilities
+ has_text_capability = (
+ "generate" in llm_family.model_ability
+ or "chat" in llm_family.model_ability
+ or "reasoning" in llm_family.model_ability
+ or "tools" in llm_family.model_ability
+ )
+
+ if not has_text_capability:
+ return MatchResult.failure(
+ reason=f"SGLang requires text generation capabilities, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # SGLang is primarily designed for text models, not specialized models
+ specialized_abilities = ["embedding", "rerank", "audio", "vision"]
+ has_specialized = any(
+ ability in llm_family.model_ability for ability in specialized_abilities
+ )
+ if has_specialized:
+ return MatchResult.failure(
+ reason=f"SGLang is designed for text models, this model has specialized abilities: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Specialized abilities: {[a for a in llm_family.model_ability if a in specialized_abilities]}",
+ )
+
+ return MatchResult.success()
@staticmethod
def _convert_state_to_completion_chunk(
@@ -647,20 +728,65 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Use base class validation first
+ base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ if not base_result.is_match:
+ return base_result
+
+ # Check model format compatibility (same as base)
+ supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
+ if llm_spec.model_format not in supported_formats:
+ return MatchResult.failure(
+ reason=f"SGLang Chat does not support model format: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Chat model unsupported format: {llm_spec.model_format}",
+ )
+
+ # Check quantization compatibility with format
if llm_spec.model_format == "pytorch":
- if quantization != "none" and not (quantization is None):
- return False
+ if quantization != "none" and quantization is not None:
+ return MatchResult.failure(
+ reason=f"SGLang Chat pytorch format does not support quantization: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"Chat pytorch + {quantization} not supported",
+ )
+
+ # Check chat model compatibility
if isinstance(llm_family, CustomLLMFamilyV2):
if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS:
- return False
+ return MatchResult.failure(
+ reason=f"Custom chat model not supported by SGLang: {llm_family.model_family}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Custom chat family: {llm_family.model_family}",
+ )
else:
if llm_family.model_name not in SGLANG_SUPPORTED_CHAT_MODELS:
- return False
+ return MatchResult.failure(
+ reason=f"Chat model not supported by SGLang: {llm_family.model_name}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported chat model: {llm_family.model_name}",
+ )
+
+ # Check chat ability
if "chat" not in llm_family.model_ability:
- return False
- return SGLANG_INSTALLED
+ return MatchResult.failure(
+ reason=f"SGLang Chat requires 'chat' ability, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ return MatchResult.success()
def _sanitize_chat_config(
self,
@@ -734,24 +860,65 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if not cls._has_cuda_device():
- return False
- if not cls._is_linux():
- return False
- if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Use base class validation first
+ base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ if not base_result.is_match:
+ return base_result
+
+ # Vision models have the same format restrictions as base SGLANG
+ supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
+ if llm_spec.model_format not in supported_formats:
+ return MatchResult.failure(
+ reason=f"SGLang Vision does not support model format: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Vision model unsupported format: {llm_spec.model_format}",
+ )
+
+ # Vision models typically work with specific quantization settings
if llm_spec.model_format == "pytorch":
- if quantization != "none" and not (quantization is None):
- return False
+ if quantization != "none" and quantization is not None:
+ return MatchResult.failure(
+ reason=f"SGLang Vision pytorch format does not support quantization: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"Vision pytorch + {quantization} not supported",
+ )
+
+ # Check vision model compatibility
if isinstance(llm_family, CustomLLMFamilyV2):
if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
- return False
+ return MatchResult.failure(
+ reason=f"Custom vision model not supported by SGLang: {llm_family.model_family}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Custom vision family: {llm_family.model_family}",
+ )
else:
if llm_family.model_name not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
- return False
+ return MatchResult.failure(
+ reason=f"Vision model not supported by SGLang: {llm_family.model_name}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported vision model: {llm_family.model_name}",
+ )
+
+ # Check vision ability
if "vision" not in llm_family.model_ability:
- return False
- return SGLANG_INSTALLED
+ return MatchResult.failure(
+ reason=f"SGLang Vision requires 'vision' ability, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ return MatchResult.success()
def _sanitize_chat_config(
self,
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index 6ad98c38e8..89a966136d 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -500,14 +500,72 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="Transformers library is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="transformers or torch package not found",
+ )
+
+ # Check model format compatibility
+ supported_formats = ["pytorch", "gptq", "awq", "bnb"]
+ if llm_spec.model_format not in supported_formats:
+ return MatchResult.failure(
+ reason=f"Transformers does not support model format: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Transformers unsupported format: {llm_spec.model_format}",
+ )
+
+ # Check for models that shouldn't use Transformers by default
model_family = llm_family.model_family or llm_family.model_name
if model_family in NON_DEFAULT_MODEL_LIST:
- return False
- if "generate" not in llm_family.model_ability:
- return False
- return True
+ return MatchResult.failure(
+ reason=f"Model {model_family} is not recommended for Transformers engine",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Model in NON_DEFAULT_MODEL_LIST: {model_family}",
+ )
+
+ # Check model abilities with flexible logic
+ # Transformers can handle models with various text processing capabilities
+ has_text_capability = (
+ "generate" in llm_family.model_ability
+ or "chat" in llm_family.model_ability
+ or "reasoning" in llm_family.model_ability
+ or "tools" in llm_family.model_ability
+ )
+
+ if not has_text_capability:
+ return MatchResult.failure(
+ reason=f"Transformers engine requires text processing capabilities, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # Check for highly specialized models that might not work well with generic Transformers engine
+ specialized_abilities = ["embedding", "rerank", "audio", "vision"]
+ has_specialized = any(
+ ability in llm_family.model_ability for ability in specialized_abilities
+ )
+ if has_specialized and not has_text_capability:
+ return MatchResult.failure(
+ reason=f"Model requires specialized engine for its abilities: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Specialized abilities detected: {[a for a in llm_family.model_ability if a in specialized_abilities]}",
+ )
+
+ return MatchResult.success()
def build_prefill_attention_mask(
self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 58b0a523aa..9d76d5685e 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -19,6 +19,7 @@
import logging
import multiprocessing
import os
+import platform
import sys
import threading
import time
@@ -880,35 +881,178 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability first
+ if not VLLM_INSTALLED:
+ return MatchResult.failure(
+ reason="vLLM library is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="vllm package not found in Python environment",
+ )
+
+ # Check hardware requirements
if not cls._has_cuda_device() and not cls._has_mlu_device():
- return False
+ return MatchResult.failure(
+ reason="vLLM requires CUDA or MLU accelerator support",
+ error_type=ErrorType.HARDWARE_REQUIREMENT,
+ technical_details="No CUDA or MLU devices detected",
+ )
+
+ # Check OS requirements
if not cls._is_linux():
- return False
- if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
- return False
+ return MatchResult.failure(
+ reason="vLLM only supports Linux operating system",
+ error_type=ErrorType.OS_REQUIREMENT,
+ technical_details=f"Current OS: {platform.system()}, required: Linux",
+ )
+
+ # Check model format
+ supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
+ if llm_spec.model_format not in supported_formats:
+ return MatchResult.failure(
+ reason=f"vLLM does not support model format: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {llm_spec.model_format}",
+ )
+
+ # Check quantization compatibility with format
if llm_spec.model_format == "pytorch":
if quantization != "none" and quantization is not None:
- return False
+ return MatchResult.failure(
+ reason=f"vLLM pytorch format does not support quantization: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"pytorch + {quantization} combination not supported",
+ )
+
if llm_spec.model_format == "awq":
- # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
if "4" not in quantization:
- return False
+ return MatchResult.failure(
+ reason=f"vLLM AWQ format requires 4-bit quantization, got: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"AWQ + {quantization} not supported, only 4-bit",
+ )
+
if llm_spec.model_format == "gptq":
if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
if not any(q in quantization for q in ("3", "4", "8")):
- return False
+ return MatchResult.failure(
+ reason=f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"GPTQ + {quantization} not supported with vLLM >= 0.3.3",
+ )
else:
if "4" not in quantization:
- return False
+ return MatchResult.failure(
+ reason=f"Older vLLM version only supports 4-bit GPTQ, got: {quantization}",
+ error_type=ErrorType.VERSION_REQUIREMENT,
+ technical_details=f"GPTQ + {quantization} requires vLLM >= 0.3.3",
+ )
+
+ # Check model compatibility with more flexible matching
+        def is_model_supported(model_name: str, supported_list: List[str]) -> bool:
+            """Check if model is supported with flexible matching."""
+            model_name = model_name.lower()
+            supported_lower = [s.lower() for s in supported_list]
+
+            # Direct match
+            if model_name in supported_lower:
+                return True
+
+            # Partial matching for models with variants (e.g., qwen3 variants)
+            for supported in supported_lower:
+                if model_name.startswith(supported) or supported.startswith(
+                    model_name
+                ):
+                    return True
+
+            # Family-based matching for common patterns: only accept the model
+            # if a supported model of the same family exists
+            for family in (
+                "qwen3",
+                "llama",
+                "mistral",
+                "gemma",
+                "baichuan",
+                "deepseek",
+            ):
+                if family in model_name and any(
+                    family in supported for supported in supported_lower
+                ):
+                    return True
+
+            return False
+
if isinstance(llm_family, CustomLLMFamilyV2):
- if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
- return False
+ if not is_model_supported(
+ llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS
+ ):
+ return MatchResult.failure(
+ reason=f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Custom family: {llm_family.model_family}",
+ )
else:
- if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
- return False
- if "generate" not in llm_family.model_ability:
- return False
- return VLLM_INSTALLED
+ if not is_model_supported(
+ llm_family.model_name.lower(),
+ [s.lower() for s in VLLM_SUPPORTED_MODELS],
+ ):
+ return MatchResult.failure(
+ reason=f"Model may not be supported by vLLM: {llm_family.model_name}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported model: {llm_family.model_name}",
+ )
+
+ # Check model abilities with flexible logic
+ # vLLM can handle models that have text generation capabilities
+ # Models with 'chat' ability usually also support 'generate'
+ has_text_capability = (
+ "generate" in llm_family.model_ability
+ or "chat" in llm_family.model_ability
+ or "reasoning" in llm_family.model_ability
+ or "tools" in llm_family.model_ability
+ )
+
+ if not has_text_capability:
+ return MatchResult.failure(
+ reason=f"vLLM requires text generation capabilities, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # Additional check: ensure model doesn't have conflicting abilities
+ conflicting_abilities = ["embedding", "rerank"]
+ has_conflicting = any(
+ ability in llm_family.model_ability for ability in conflicting_abilities
+ )
+ if has_conflicting:
+ return MatchResult.failure(
+ reason=f"Model has conflicting abilities for vLLM: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Conflicting abilities detected: {[a for a in llm_family.model_ability if a in conflicting_abilities]}",
+ )
+
+ # All checks passed
+ return MatchResult.success()
@staticmethod
def _convert_request_output_to_completion_chunk(
@@ -1316,40 +1460,141 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if llm_spec.model_format not in [
- "pytorch",
- "gptq",
- "awq",
- "fp8",
- "bnb",
- "ggufv2",
- ]:
- return False
- if llm_spec.model_format == "pytorch":
- if quantization != "none" and quantization is not None:
- return False
- if llm_spec.model_format == "awq":
- if not any(q in quantization for q in ("4", "8")):
- return False
- if llm_spec.model_format == "gptq":
- if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
- if not any(q in quantization for q in ("3", "4", "8")):
- return False
- else:
- if "4" not in quantization:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Use base class validation first
+ base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ if not base_result.is_match:
+ return base_result
+
+ # Chat-specific format support (includes GGUFv2 for newer vLLM)
+ supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb", "ggufv2"]
+ if llm_spec.model_format not in supported_formats:
+ return MatchResult.failure(
+ reason=f"vLLM Chat does not support model format: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Chat model unsupported format: {llm_spec.model_format}",
+ )
+
+ # GGUFv2 requires newer vLLM version
if llm_spec.model_format == "ggufv2":
if not (VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.2")):
- return False
+ return MatchResult.failure(
+ reason="vLLM GGUF support requires version >= 0.8.2",
+ error_type=ErrorType.VERSION_REQUIREMENT,
+ technical_details=f"Current vLLM: {VLLM_VERSION}, required: >=0.8.2",
+ )
+
+ # AWQ chat models support more quantization levels
+ if llm_spec.model_format == "awq":
+ if not any(q in quantization for q in ("4", "8")):
+ return MatchResult.failure(
+ reason=f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"Chat AWQ + {quantization} not supported",
+ )
+
+ # Check chat model compatibility with flexible matching
+        def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool:
+            """Check if chat model is supported with flexible matching."""
+            model_name = model_name.lower()
+            supported_lower = [s.lower() for s in supported_list]
+
+            # Direct match
+            if model_name in supported_lower:
+                return True
+
+            # Partial matching for models with variants
+            for supported in supported_lower:
+                if model_name.startswith(supported) or supported.startswith(
+                    model_name
+                ):
+                    return True
+
+            # Family-based matching for common chat model patterns: only accept
+            # the model if a supported chat model of the same family exists
+            for family in (
+                "qwen3",
+                "llama",
+                "mistral",
+                "gemma",
+                "baichuan",
+                "deepseek",
+                "glm",
+                "chatglm",
+            ):
+                if family in model_name and any(
+                    family in supported for supported in supported_lower
+                ):
+                    return True
+
+            return False
+
if isinstance(llm_family, CustomLLMFamilyV2):
- if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
- return False
+ if not is_chat_model_supported(
+ llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS
+ ):
+ return MatchResult.failure(
+ reason=f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Custom chat family: {llm_family.model_family}",
+ )
else:
- if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
- return False
- if "chat" not in llm_family.model_ability:
- return False
- return VLLM_INSTALLED
+ if not is_chat_model_supported(
+ llm_family.model_name.lower(),
+ [s.lower() for s in VLLM_SUPPORTED_CHAT_MODELS],
+ ):
+ return MatchResult.failure(
+ reason=f"Chat model may not be supported by vLLM: {llm_family.model_name}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported chat model: {llm_family.model_name}",
+ )
+
+ # Check chat ability with flexible logic
+ # vLLM Chat should work with models that have conversation capabilities
+ has_chat_capability = (
+ "chat" in llm_family.model_ability
+ or "generate" in llm_family.model_ability
+ or "reasoning" in llm_family.model_ability
+ )
+
+ if not has_chat_capability:
+ return MatchResult.failure(
+ reason=f"vLLM Chat requires conversation capabilities, model has: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Model abilities: {llm_family.model_ability}",
+ )
+
+ # Additional check: ensure model is not purely a tool model without conversation
+ if set(llm_family.model_ability) == {"tools"}:
+ return MatchResult.failure(
+ reason=f"Model only has 'tools' capability without conversation support: {llm_family.model_ability}",
+ error_type=ErrorType.ABILITY_MISMATCH,
+ technical_details=f"Tool-only model detected",
+ )
+
+ return MatchResult.success()
def _sanitize_chat_config(
self,
@@ -1494,38 +1739,110 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if not cls._has_cuda_device() and not cls._has_mlu_device():
- return False
- if not cls._is_linux():
- return False
- if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Use base class validation first
+ base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ if not base_result.is_match:
+ return base_result
+
+ # Vision models have the same format restrictions as base VLLM
+ supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
+ if llm_spec.model_format not in supported_formats:
+ return MatchResult.failure(
+ reason=f"vLLM Vision does not support model format: {llm_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Vision model unsupported format: {llm_spec.model_format}",
+ )
+
+ # Vision models typically work with specific quantization settings
if llm_spec.model_format == "pytorch":
if quantization != "none" and quantization is not None:
- return False
+ return MatchResult.failure(
+ reason=f"vLLM Vision pytorch format does not support quantization: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"Vision pytorch + {quantization} not supported",
+ )
+
+ # AWQ vision models support more quantization levels than base
if llm_spec.model_format == "awq":
if not any(q in quantization for q in ("4", "8")):
- return False
- if llm_spec.model_format == "gptq":
- if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
- if not any(q in quantization for q in ("3", "4", "8")):
- return False
- else:
- if "4" not in quantization:
- return False
+ return MatchResult.failure(
+ reason=f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}",
+ error_type=ErrorType.QUANTIZATION,
+ technical_details=f"Vision AWQ + {quantization} not supported",
+ )
+
+ # Check vision model compatibility with flexible matching
+        def is_vision_model_supported(
+            model_name: str, supported_list: List[str]
+        ) -> bool:
+            """Check if vision model is supported with flexible matching."""
+            model_name = model_name.lower()
+            supported_lower = [s.lower() for s in supported_list]
+
+            # Direct match
+            if model_name in supported_lower:
+                return True
+
+            # Partial matching for models with variants
+            for supported in supported_lower:
+                if model_name.startswith(supported) or supported.startswith(
+                    model_name
+                ):
+                    return True
+
+            # Family-based matching for common multimodal model patterns: only
+            # accept the model if a supported model of the same family exists
+            for family in ("llama", "qwen", "internvl", "glm", "phi"):
+                if family in model_name and any(
+                    family in supported for supported in supported_lower
+                ):
+                    return True
+
+            return False
+
if isinstance(llm_family, CustomLLMFamilyV2):
- if llm_family.model_family not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
- return False
+ if not is_vision_model_supported(
+                llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST
+ ):
+ return MatchResult.failure(
+ reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Custom vision family: {llm_family.model_family}",
+ )
else:
- if llm_family.model_name not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
- return False
- if (
- "vision" not in llm_family.model_ability
- and "audio" not in llm_family.model_ability
- and "omni" not in llm_family.model_ability
- ):
- return False
- return VLLM_INSTALLED
+ if not is_vision_model_supported(
+ llm_family.model_name.lower(),
+                [s.lower() for s in VLLM_SUPPORTED_MULTI_MODEL_LIST],
+ ):
+ return MatchResult.failure(
+ reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported vision model: {llm_family.model_name}",
+ )
+
+        # Check multimodal ability (vision, audio or omni)
+        if not any(
+            ability in llm_family.model_ability
+            for ability in ("vision", "audio", "omni")
+        ):
+            return MatchResult.failure(
+                reason=f"vLLM multimodal engine requires 'vision', 'audio' or 'omni' ability, model has: {llm_family.model_ability}",
+                error_type=ErrorType.ABILITY_MISMATCH,
+                technical_details=f"Model abilities: {llm_family.model_ability}",
+            )
+
+ return MatchResult.success()
def _sanitize_model_config(
self, model_config: Optional[VLLMModelConfig]
diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py
index ae27e7e85e..929522f23e 100644
--- a/xinference/model/rerank/core.py
+++ b/xinference/model/rerank/core.py
@@ -131,6 +131,46 @@ def match_json(
) -> bool:
pass
+ @classmethod
+ def match_json_with_reason(
+ cls,
+ model_family: RerankModelFamilyV2,
+ model_spec: RerankSpecV1,
+ quantization: str,
+ ) -> "MatchResult":
+ """
+ Check if the engine can handle the given rerank model with detailed error information.
+
+        This method provides detailed failure reasons when an engine
+ cannot handle a specific model configuration. The default implementation
+ falls back to the boolean match_json method for backward compatibility.
+
+ Args:
+ model_family: The rerank model family information
+ model_spec: The model specification
+ quantization: The quantization method
+
+ Returns:
+            MatchResult: Detailed match result with failure reason and error type
+ """
+ from .match_result import ErrorType, MatchResult
+
+ # Default implementation for backward compatibility
+ if cls.match_json(model_family, model_spec, quantization):
+ return MatchResult.success()
+ else:
+ # Get basic reason based on common failure patterns
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason=f"Required library for {cls.__name__} is not available",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ )
+ else:
+ return MatchResult.failure(
+ reason=f"Rerank model configuration is not compatible with {cls.__name__}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ )
+
@classmethod
def match(
cls,
diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py
index fabbb6e593..ee81a9adac 100644
--- a/xinference/model/rerank/sentence_transformers/core.py
+++ b/xinference/model/rerank/sentence_transformers/core.py
@@ -191,7 +191,7 @@ def compute_logits(inputs, **kwargs):
from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
else:
raise RuntimeError(
- f"Unsupported Rank model type: {self.model_family.type}"
+ f"Unsupported Rerank model type: {self.model_family.type}"
)
except ImportError:
error_message = "Failed to import module 'FlagEmbedding'"
@@ -341,5 +341,74 @@ def match_json(
model_spec: RerankSpecV1,
quantization: str,
) -> bool:
- # As default embedding engine, sentence-transformer support all models
- return model_spec.model_format in ["pytorch"]
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls,
+ model_family: RerankModelFamilyV2,
+ model_spec: RerankSpecV1,
+ quantization: str,
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="Sentence Transformers library is not installed for reranking",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="sentence_transformers package not found in Python environment",
+ )
+
+ # Check model format compatibility
+ if model_spec.model_format not in ["pytorch"]:
+ return MatchResult.failure(
+ reason=f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch",
+ )
+
+ # Check rerank-specific requirements
+ if not hasattr(model_family, "model_name"):
+ return MatchResult.failure(
+ reason="Rerank model family requires model name specification",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details="Missing model_name in rerank model family",
+ )
+
+ # Check model type compatibility
+ if model_family.type and model_family.type not in [
+ "rerank",
+ "unknown",
+ "cross-encoder",
+ "normal",
+ "LLM-based",
+ "LLM-based layerwise",
+ ]:
+ return MatchResult.failure(
+ reason=f"Model type '{model_family.type}' may not be compatible with reranking engines",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Model type: {model_family.type}",
+ )
+
+ # Check max tokens limit for reranking performance
+ max_tokens = model_family.max_tokens
+ if max_tokens and max_tokens > 8192: # High token limits for reranking
+ return MatchResult.failure(
+ reason=f"High max_tokens limit for reranking model: {max_tokens}",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details=f"High max_tokens for reranking: {max_tokens}",
+ )
+
+ # Check language compatibility
+ if not model_family.language or len(model_family.language) == 0:
+ return MatchResult.failure(
+ reason="Rerank model language information is missing",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details="Missing language information in rerank model",
+ )
+
+ return MatchResult.success()
diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index eac173b40c..f9763b567a 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -149,8 +149,71 @@ def match_json(
model_spec: RerankSpecV1,
quantization: str,
) -> bool:
- if model_spec.model_format in ["pytorch"]:
- prefix = model_family.model_name.split("-", 1)[0]
- if prefix in SUPPORTED_MODELS_PREFIXES:
- return True
- return False
+ from ..match_result import MatchResult
+
+ result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ return result.is_match
+
+ @classmethod
+ def match_json_with_reason(
+ cls,
+ model_family: RerankModelFamilyV2,
+ model_spec: RerankSpecV1,
+ quantization: str,
+ ) -> "MatchResult":
+ from ..match_result import ErrorType, MatchResult
+
+ # Check library availability
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="vLLM library is not installed for reranking",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="vllm package not found in Python environment",
+ )
+
+ # Check model format compatibility
+ if model_spec.model_format not in ["pytorch"]:
+ return MatchResult.failure(
+ reason=f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}",
+ error_type=ErrorType.MODEL_FORMAT,
+ technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch",
+ )
+
+ # Check model name prefix matching
+ if model_spec.model_format == "pytorch":
+ try:
+ prefix = model_family.model_name.split("-", 1)[0].lower()
+ # Support both prefix matching and special cases
+                if prefix not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]:
+ # Special handling for Qwen3 models
+ if "qwen3" not in model_family.model_name.lower():
+ return MatchResult.failure(
+ reason=f"Model family prefix not supported by vLLM reranking: {prefix}",
+ error_type=ErrorType.MODEL_COMPATIBILITY,
+ technical_details=f"Unsupported prefix: {prefix}",
+ )
+ except (IndexError, AttributeError):
+ return MatchResult.failure(
+ reason="Unable to parse model family name for vLLM compatibility check",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details=f"Model name parsing failed: {model_family.model_name}",
+ )
+
+ # Check rerank-specific requirements
+ if not hasattr(model_family, "model_name"):
+ return MatchResult.failure(
+ reason="Rerank model family requires model name specification for vLLM",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details="Missing model_name in vLLM rerank model family",
+ )
+
+ # Check max tokens limit for vLLM reranking performance
+ max_tokens = model_family.max_tokens
+ if max_tokens and max_tokens > 4096: # vLLM has stricter limits
+ return MatchResult.failure(
+ reason=f"High max_tokens limit for vLLM reranking model: {max_tokens}",
+ error_type=ErrorType.CONFIGURATION_ERROR,
+ technical_details=f"High max_tokens for vLLM reranking: {max_tokens}",
+ )
+
+ return MatchResult.success()
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 18de3c26e4..ad0dabbf35 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -494,59 +494,111 @@ def get_engine_params_by_name(
del param["available"]
engine_params[engine] = params
- # Check unavailable engines
+ # Check unavailable engines with detailed error information
for engine_name in all_supported_engines:
if engine_name not in engine_params: # Engine not in available list
try:
llm_engine_classes = SUPPORTED_ENGINES[engine_name]
- error_msg = None
- # Try to find specific error reasons
- for engine_class in llm_engine_classes:
+ # Try to get detailed error information from engine's match_json_with_reason
+ detailed_error = None
+
+ # We need a sample model to test against, use the first available spec
+ if model_name in LLM_ENGINES and LLM_ENGINES[model_name]:
+ # Try to get model family for testing
try:
- if hasattr(engine_class, "check_lib"):
- lib_available: bool = engine_class.check_lib() # type: ignore[assignment]
- if not lib_available:
- error_msg = (
- f"Engine {engine_name} library is not available"
- )
+ from .llm.llm_family import match_llm
+
+ llm_family = match_llm(model_name, None, None, None, None)
+ if llm_family and llm_family.model_specs:
+ llm_spec = llm_family.model_specs[0]
+ quantization = llm_spec.quantization or "none"
+
+ # Test each engine class for detailed error info
+ for engine_class in llm_engine_classes:
+ try:
+ if hasattr(
+ engine_class, "match_json_with_reason"
+ ):
+ from .llm.match_result import MatchResult
+
+ result = (
+ engine_class.match_json_with_reason(
+ llm_family, llm_spec, quantization
+ )
+ )
+ if not result.is_match:
+ detailed_error = {
+ "error": result.reason,
+ "error_type": result.error_type,
+ "technical_details": result.technical_details,
+ }
+ break
+ except Exception:
+ # Fall back to next engine class
+ continue
+ except Exception:
+ # If we can't get model family, continue with basic checking
+ pass
+
+ if detailed_error:
+ engine_params[engine_name] = detailed_error
+ else:
+ # Fallback to basic error checking for backward compatibility
+ error_msg = None
+ for engine_class in llm_engine_classes:
+ try:
+ if hasattr(engine_class, "check_lib"):
+ lib_available: bool = engine_class.check_lib() # type: ignore[assignment]
+ if not lib_available:
+ error_msg = {
+ "error": f"Engine {engine_name} library is not available",
+ "error_type": "dependency_missing",
+ }
+ break
+ else:
+ # If no check_lib method, try import check
+ module_name = engine_name.lower().replace(".", "")
+ if engine_name == "vLLM":
+ module_name = "vllm"
+ elif engine_name == "SGLang":
+ module_name = "sglang"
+ elif engine_name == "llama.cpp":
+ module_name = "llama_cpp"
+ elif engine_name == "MLX":
+ module_name = "mlx"
+ elif engine_name == "LMDEPLOY":
+ module_name = "lmdeploy"
+ elif engine_name == "Transformers":
+ module_name = "transformers"
+
+ importlib.import_module(module_name)
break
- else:
- # If no check_lib method, try import check
- module_name = engine_name.lower().replace(".", "")
- if engine_name == "vLLM":
- module_name = "vllm"
- elif engine_name == "SGLang":
- module_name = "sglang"
- elif engine_name == "llama.cpp":
- module_name = "llama_cpp"
- elif engine_name == "MLX":
- module_name = "mlx"
- elif engine_name == "LMDEPLOY":
- module_name = "lmdeploy"
- elif engine_name == "Transformers":
- module_name = "transformers"
-
- importlib.import_module(module_name)
- break
- except ImportError as e:
- error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
- except Exception as e:
- error_msg = (
- f"Engine {engine_name} is not available: {str(e)}"
- )
-
- if error_msg is None:
- error_msg = f"Engine {engine_name} is not compatible with current model or environment"
-
- # For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg
+ except ImportError as e:
+ error_msg = {
+ "error": f"Engine {engine_name} library is not installed: {str(e)}",
+ "error_type": "dependency_missing",
+ }
+ except Exception as e:
+ error_msg = {
+ "error": f"Engine {engine_name} is not available: {str(e)}",
+ "error_type": "configuration_error",
+ }
+
+ if error_msg is None:
+ error_msg = {
+ "error": f"Engine {engine_name} is not compatible with current model or environment",
+ "error_type": "model_compatibility",
+ }
+
+ engine_params[engine_name] = error_msg
except Exception as e:
- # If exception occurs during checking, return error message string
- engine_params[engine_name] = (
- f"Error checking engine {engine_name}: {str(e)}"
- )
+ # If exception occurs during checking, return structured error
+ engine_params[engine_name] = {
+ "error": f"Error checking engine {engine_name}: {str(e)}",
+ "error_type": "configuration_error",
+ }
# Filter out llm_class field
for engine, params in engine_params.items():
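For illustration only, a minimal sketch of the structure get_engine_params_by_name can return after this change; the model parameters and the vLLM failure values below are assumptions rather than captured output, and only the error / error_type / technical_details keys come from the code above:

    # Hypothetical result: available engines keep their parameter dicts,
    # unavailable engines map to a structured error dict instead of a string.
    engine_params = {
        "Transformers": [
            {"model_format": "pytorch", "quantization": "none"},  # example params
        ],
        "vLLM": {
            "error": "vLLM requires CUDA or MLU accelerator support",
            "error_type": "hardware_requirement",
            "technical_details": "No CUDA or MLU devices detected",
        },
    }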
From 08450ac0c283f29a84ca46ac816dba7d05434eb6 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 11:31:00 +0800
Subject: [PATCH 13/37] modify accomplishment measure
---
xinference/model/llm/mlx/core.py | 36 ++++++++++++++++----------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index cf24d31fdf..d2d4b25697 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -422,15 +422,7 @@ def match_json_with_reason(
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
- # Check library availability
- if not cls.check_lib():
- return MatchResult.failure(
- reason="MLX library (mlx_lm) is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="mlx_lm package not found in Python environment",
- )
-
- # Check platform compatibility - MLX only works on Apple Silicon
+ # Check platform compatibility first - MLX only works on Apple Silicon
if sys.platform != "darwin" or platform.processor() != "arm":
return MatchResult.failure(
reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)",
@@ -438,6 +430,14 @@ def match_json_with_reason(
technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm",
)
+ # Check library availability (only if platform is compatible)
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="MLX library (mlx_lm) is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="mlx_lm package not found in Python environment",
+ )
+
# Check model format compatibility
if llm_spec.model_format not in ["mlx"]:
return MatchResult.failure(
@@ -869,15 +869,7 @@ def match_json_with_reason(
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
- # Check library availability first - MLX Vision uses mlx_vlm
- if not cls.check_lib():
- return MatchResult.failure(
- reason="MLX Vision library (mlx_vlm) is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="mlx_vlm package not found in Python environment",
- )
-
- # Check platform compatibility
+ # Check platform compatibility first - MLX only works on Apple Silicon
if sys.platform != "darwin" or platform.processor() != "arm":
return MatchResult.failure(
reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)",
@@ -885,6 +877,14 @@ def match_json_with_reason(
technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm",
)
+ # Check library availability (only if platform is compatible) - MLX Vision uses mlx_vlm
+ if not cls.check_lib():
+ return MatchResult.failure(
+ reason="MLX Vision library (mlx_vlm) is not installed",
+ error_type=ErrorType.DEPENDENCY_MISSING,
+ technical_details="mlx_vlm package not found in Python environment",
+ )
+
# Check model format compatibility
if llm_spec.model_format not in ["mlx"]:
return MatchResult.failure(
From e793cd4d1ed470971b03dd93ba0a47705ace27af Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 12:23:00 +0800
Subject: [PATCH 14/37] modify accomplishment measure
---
xinference/model/embedding/match_result.py | 76 +++++++++++++++++++++
xinference/model/llm/match_result.py | 76 +++++++++++++++++++++
xinference/model/rerank/match_result.py | 77 ++++++++++++++++++++++
3 files changed, 229 insertions(+)
create mode 100644 xinference/model/embedding/match_result.py
create mode 100644 xinference/model/llm/match_result.py
create mode 100644 xinference/model/rerank/match_result.py
diff --git a/xinference/model/embedding/match_result.py b/xinference/model/embedding/match_result.py
new file mode 100644
index 0000000000..47775f20f9
--- /dev/null
+++ b/xinference/model/embedding/match_result.py
@@ -0,0 +1,76 @@
+"""
+Error handling result structures for embedding model engine matching.
+
+This module provides structured error handling for engine matching operations,
+allowing engines to provide detailed failure reasons and suggestions.
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class MatchResult:
+ """
+ Result of engine matching operation with detailed error information.
+
+ This class provides structured information about whether an engine can handle
+ a specific model configuration, and if not, why and what alternatives exist.
+ """
+
+ is_match: bool
+ reason: Optional[str] = None
+ error_type: Optional[str] = None
+ technical_details: Optional[str] = None
+
+ @classmethod
+ def success(cls) -> "MatchResult":
+ """Create a successful match result."""
+ return cls(is_match=True)
+
+ @classmethod
+ def failure(
+ cls,
+ reason: str,
+ error_type: Optional[str] = None,
+ technical_details: Optional[str] = None,
+ ) -> "MatchResult":
+ """Create a failed match result with optional details."""
+ return cls(
+ is_match=False,
+ reason=reason,
+ error_type=error_type,
+ technical_details=technical_details,
+ )
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for API responses."""
+ result = {"is_match": self.is_match}
+ if not self.is_match:
+ if self.reason:
+ result["reason"] = self.reason
+ if self.error_type:
+ result["error_type"] = self.error_type
+ if self.technical_details:
+ result["technical_details"] = self.technical_details
+ return result
+
+ def to_error_string(self) -> str:
+ """Convert to error string for backward compatibility."""
+ if self.is_match:
+ return "Available"
+ error_msg = self.reason or "Unknown error"
+ return error_msg
+
+
+# Error type constants for better categorization
+class ErrorType:
+ HARDWARE_REQUIREMENT = "hardware_requirement"
+ OS_REQUIREMENT = "os_requirement"
+ MODEL_FORMAT = "model_format"
+ DEPENDENCY_MISSING = "dependency_missing"
+ MODEL_COMPATIBILITY = "model_compatibility"
+ DIMENSION_MISMATCH = "dimension_mismatch"
+ VERSION_REQUIREMENT = "version_requirement"
+ CONFIGURATION_ERROR = "configuration_error"
+ ENGINE_UNAVAILABLE = "engine_unavailable"
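As a quick sanity sketch of the dataclass defined above (the reason and detail strings are illustrative assumptions, not values produced by any engine):

    from xinference.model.embedding.match_result import ErrorType, MatchResult

    ok = MatchResult.success()
    bad = MatchResult.failure(
        reason="Embedding model format is not supported",
        error_type=ErrorType.MODEL_FORMAT,
        technical_details="got ggufv2, expected pytorch",
    )

    # Successful matches stringify to "Available"; failures expose their
    # structured fields through to_dict() for API responses.
    assert ok.to_error_string() == "Available"
    assert bad.to_dict() == {
        "is_match": False,
        "reason": "Embedding model format is not supported",
        "error_type": "model_format",
        "technical_details": "got ggufv2, expected pytorch",
    }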
diff --git a/xinference/model/llm/match_result.py b/xinference/model/llm/match_result.py
new file mode 100644
index 0000000000..eeff2461f2
--- /dev/null
+++ b/xinference/model/llm/match_result.py
@@ -0,0 +1,76 @@
+"""
+Error handling result structures for engine matching.
+
+This module provides structured error handling for engine matching operations,
+allowing engines to provide detailed failure reasons and suggestions.
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class MatchResult:
+ """
+ Result of engine matching operation with detailed error information.
+
+ This class provides structured information about whether an engine can handle
+ a specific model configuration, and if not, why and what alternatives exist.
+ """
+
+ is_match: bool
+ reason: Optional[str] = None
+ error_type: Optional[str] = None
+ technical_details: Optional[str] = None
+
+ @classmethod
+ def success(cls) -> "MatchResult":
+ """Create a successful match result."""
+ return cls(is_match=True)
+
+ @classmethod
+ def failure(
+ cls,
+ reason: str,
+ error_type: Optional[str] = None,
+ technical_details: Optional[str] = None,
+ ) -> "MatchResult":
+ """Create a failed match result with optional details."""
+ return cls(
+ is_match=False,
+ reason=reason,
+ error_type=error_type,
+ technical_details=technical_details,
+ )
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for API responses."""
+ result = {"is_match": self.is_match}
+ if not self.is_match:
+ if self.reason:
+ result["reason"] = self.reason
+ if self.error_type:
+ result["error_type"] = self.error_type
+ if self.technical_details:
+ result["technical_details"] = self.technical_details
+ return result
+
+ def to_error_string(self) -> str:
+ """Convert to error string for backward compatibility."""
+ if self.is_match:
+ return "Available"
+ error_msg = self.reason or "Unknown error"
+ return error_msg
+
+
+# Error type constants for better categorization
+class ErrorType:
+ HARDWARE_REQUIREMENT = "hardware_requirement"
+ OS_REQUIREMENT = "os_requirement"
+ MODEL_FORMAT = "model_format"
+ QUANTIZATION = "quantization"
+ DEPENDENCY_MISSING = "dependency_missing"
+ MODEL_COMPATIBILITY = "model_compatibility"
+ ABILITY_MISMATCH = "ability_mismatch"
+ VERSION_REQUIREMENT = "version_requirement"
+ CONFIGURATION_ERROR = "configuration_error"
diff --git a/xinference/model/rerank/match_result.py b/xinference/model/rerank/match_result.py
new file mode 100644
index 0000000000..125e791afd
--- /dev/null
+++ b/xinference/model/rerank/match_result.py
@@ -0,0 +1,77 @@
+"""
+Error handling result structures for rerank model engine matching.
+
+This module provides structured error handling for engine matching operations,
+allowing engines to provide detailed failure reasons and suggestions.
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class MatchResult:
+ """
+ Result of engine matching operation with detailed error information.
+
+ This class provides structured information about whether an engine can handle
+ a specific model configuration, and if not, why and what alternatives exist.
+ """
+
+ is_match: bool
+ reason: Optional[str] = None
+ error_type: Optional[str] = None
+ technical_details: Optional[str] = None
+
+ @classmethod
+ def success(cls) -> "MatchResult":
+ """Create a successful match result."""
+ return cls(is_match=True)
+
+ @classmethod
+ def failure(
+ cls,
+ reason: str,
+ error_type: Optional[str] = None,
+ technical_details: Optional[str] = None,
+ ) -> "MatchResult":
+ """Create a failed match result with optional details."""
+ return cls(
+ is_match=False,
+ reason=reason,
+ error_type=error_type,
+ technical_details=technical_details,
+ )
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for API responses."""
+ result = {"is_match": self.is_match}
+ if not self.is_match:
+ if self.reason:
+ result["reason"] = self.reason
+ if self.error_type:
+ result["error_type"] = self.error_type
+ if self.technical_details:
+ result["technical_details"] = self.technical_details
+ return result
+
+ def to_error_string(self) -> str:
+ """Convert to error string for backward compatibility."""
+ if self.is_match:
+ return "Available"
+ error_msg = self.reason or "Unknown error"
+ return error_msg
+
+
+# Error type constants for better categorization
+class ErrorType:
+ HARDWARE_REQUIREMENT = "hardware_requirement"
+ OS_REQUIREMENT = "os_requirement"
+ MODEL_FORMAT = "model_format"
+ DEPENDENCY_MISSING = "dependency_missing"
+ MODEL_COMPATIBILITY = "model_compatibility"
+ DIMENSION_MISMATCH = "dimension_mismatch"
+ VERSION_REQUIREMENT = "version_requirement"
+ CONFIGURATION_ERROR = "configuration_error"
+ ENGINE_UNAVAILABLE = "engine_unavailable"
+ RERANK_SPECIFIC = "rerank_specific"
From 27ea341e43e2c15e96276f1a770104f9bb346691 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 12:40:08 +0800
Subject: [PATCH 15/37] modify accomplishment measure
---
xinference/model/embedding/core.py | 1 +
xinference/model/embedding/llama_cpp/core.py | 2 +-
.../model/embedding/sentence_transformers/core.py | 2 +-
xinference/model/llm/core.py | 2 +-
xinference/model/llm/llama_cpp/core.py | 2 +-
xinference/model/llm/lmdeploy/core.py | 3 +--
xinference/model/llm/mlx/core.py | 5 +----
xinference/model/llm/sglang/core.py | 4 +---
xinference/model/llm/transformers/core.py | 2 +-
xinference/model/llm/transformers/multimodal/core.py | 6 ------
xinference/model/llm/vllm/core.py | 10 +++-------
xinference/model/rerank/core.py | 1 +
xinference/model/rerank/sentence_transformers/core.py | 3 ++-
xinference/model/rerank/vllm/core.py | 2 +-
xinference/model/utils.py | 2 +-
15 files changed, 17 insertions(+), 30 deletions(-)
diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index 299ec4c5d1..c7f5ddb554 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -20,6 +20,7 @@
from collections import defaultdict
from typing import Annotated, Dict, List, Literal, Optional, Union
+from .match_result import MatchResult
from ..._compat import ROOT_KEY, BaseModel, ErrorWrapper, Field, ValidationError
from ...device_utils import empty_cache
from ..core import VirtualEnvSettings
diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py
index 6e2908ffdd..932df57f16 100644
--- a/xinference/model/embedding/llama_cpp/core.py
+++ b/xinference/model/embedding/llama_cpp/core.py
@@ -24,6 +24,7 @@
from packaging import version
+from ..match_result import MatchResult
from ....types import Embedding
from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
@@ -235,7 +236,6 @@ def match_json(
model_spec: EmbeddingSpecV1,
quantization: str,
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(model_family, model_spec, quantization)
return result.is_match
diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py
index 843d68ea37..6cb66f7ca2 100644
--- a/xinference/model/embedding/sentence_transformers/core.py
+++ b/xinference/model/embedding/sentence_transformers/core.py
@@ -19,6 +19,7 @@
import numpy as np
import torch
+from ..match_result import MatchResult
from ....types import Embedding, EmbeddingData, EmbeddingUsage
from ...utils import is_flash_attn_available
from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
@@ -434,7 +435,6 @@ def match_json(
model_spec: EmbeddingSpecV1,
quantization: str,
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(model_family, model_spec, quantization)
return result.is_match
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
index ee446d024a..2626060579 100644
--- a/xinference/model/llm/core.py
+++ b/xinference/model/llm/core.py
@@ -31,7 +31,7 @@
if TYPE_CHECKING:
from .llm_family import LLMFamilyV2, LLMSpecV1
- from .match_result import ErrorType, MatchResult
+ from .match_result import MatchResult
logger = logging.getLogger(__name__)
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index f35fae9f6e..5790c3a3ca 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -21,6 +21,7 @@
from packaging import version
+from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
from ..core import LLM, chat_context_var
@@ -86,7 +87,6 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py
index cd0aa892cf..134e668d7a 100644
--- a/xinference/model/llm/lmdeploy/core.py
+++ b/xinference/model/llm/lmdeploy/core.py
@@ -18,6 +18,7 @@
import torch
+from ..match_result import MatchResult
from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA
from ..core import LLM
from ..llm_family import LLMFamilyV2, LLMSpecV1
@@ -121,7 +122,6 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -189,7 +189,6 @@ def load(self):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index d2d4b25697..7f53112ab3 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -39,6 +39,7 @@
import xoscar as xo
+from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....fields import max_tokens_field
from ....types import (
@@ -411,7 +412,6 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -772,7 +772,6 @@ def _sanitize_generate_config(
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -858,8 +857,6 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
-
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index f3658b5ed7..9365f2833b 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -24,6 +24,7 @@
from xoscar.utils import get_next_port
+from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....types import (
ChatCompletion,
@@ -342,7 +343,6 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -728,7 +728,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -860,7 +859,6 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index 89a966136d..bc828d65b3 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -20,6 +20,7 @@
import torch
+from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....device_utils import (
get_device_preferred_dtype,
@@ -500,7 +501,6 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
diff --git a/xinference/model/llm/transformers/multimodal/core.py b/xinference/model/llm/transformers/multimodal/core.py
index ae67e102b5..4d6451f42e 100644
--- a/xinference/model/llm/transformers/multimodal/core.py
+++ b/xinference/model/llm/transformers/multimodal/core.py
@@ -39,21 +39,18 @@ def decide_device(self):
"""
Update self._device
"""
- pass
@abstractmethod
def load_processor(self):
"""
Load self._processor and self._tokenizer
"""
- pass
@abstractmethod
def load_multimodal_model(self):
"""
Load self._model
"""
- pass
def load(self):
self.decide_device()
@@ -71,7 +68,6 @@ def build_inputs_from_messages(
actual parameters needed for inference,
e.g. input_ids, attention_masks, etc.
"""
- pass
@abstractmethod
def build_generate_kwargs(
@@ -82,7 +78,6 @@ def build_generate_kwargs(
Hyperparameters needed for generation,
e.g. temperature, max_new_tokens, etc.
"""
- pass
@abstractmethod
def build_streaming_iter(
@@ -95,7 +90,6 @@ def build_streaming_iter(
The length of prompt token usually comes from the input_ids.
In this interface you need to call the `build_inputs_from_messages` and `build_generate_kwargs`.
"""
- pass
def get_stop_strs(self) -> List[str]:
return []
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 9d76d5685e..7e9d6d3865 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -42,6 +42,7 @@
from packaging import version
from typing_extensions import NotRequired
+from ..match_result import MatchResult, ErrorType
from ....constants import XINFERENCE_MAX_TOKENS
from ....types import (
ChatCompletion,
@@ -881,7 +882,6 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -1460,7 +1460,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -1739,7 +1738,6 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin):
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@@ -1748,7 +1746,6 @@ def match_json(
def match_json_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
# Use base class validation first
base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
@@ -1816,7 +1813,7 @@ def is_vision_model_supported(
if isinstance(llm_family, CustomLLMFamilyV2):
if not is_vision_model_supported(
- llm_family.model_family.lower(), VLLM_SUPPORTED_VISION_MODEL_LIST
+ llm_family.model_family.lower()
):
return MatchResult.failure(
reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}",
@@ -1825,8 +1822,7 @@ def is_vision_model_supported(
)
else:
if not is_vision_model_supported(
- llm_family.model_name.lower(),
- [s.lower() for s in VLLM_SUPPORTED_VISION_MODEL_LIST],
+ llm_family.model_name.lower()
):
return MatchResult.failure(
reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}",
diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py
index 929522f23e..d3e3b5702c 100644
--- a/xinference/model/rerank/core.py
+++ b/xinference/model/rerank/core.py
@@ -17,6 +17,7 @@
from collections import defaultdict
from typing import Dict, List, Literal, Optional
+from .match_result import MatchResult
from ..._compat import BaseModel
from ...types import Rerank
from ..core import VirtualEnvSettings
diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py
index ee81a9adac..87efe31b5b 100644
--- a/xinference/model/rerank/sentence_transformers/core.py
+++ b/xinference/model/rerank/sentence_transformers/core.py
@@ -22,6 +22,7 @@
import torch
import torch.nn as nn
+from ..match_result import MatchResult
from ....device_utils import empty_cache
from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
from ...utils import is_flash_attn_available
@@ -341,7 +342,7 @@ def match_json(
model_spec: RerankSpecV1,
quantization: str,
) -> bool:
- from ..match_result import MatchResult
+ pass
result = cls.match_json_with_reason(model_family, model_spec, quantization)
return result.is_match
diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index f9763b567a..114eef5907 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -2,6 +2,7 @@
import uuid
from typing import List, Optional
+from ..match_result import MatchResult
from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
from ...utils import cache_clean
from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
@@ -149,7 +150,6 @@ def match_json(
model_spec: RerankSpecV1,
quantization: str,
) -> bool:
- from ..match_result import MatchResult
result = cls.match_json_with_reason(model_family, model_spec, quantization)
return result.is_match
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index ad0dabbf35..383f188382 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -520,7 +520,7 @@ def get_engine_params_by_name(
if hasattr(
engine_class, "match_json_with_reason"
):
- from .llm.match_result import MatchResult
+ pass
result = (
engine_class.match_json_with_reason(
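Most hunks in this patch share one shape: the function-local "from ..match_result import MatchResult" import is removed because the name now comes from a module-level import, and match_json stays a thin boolean wrapper over match_json_with_reason. A minimal, engine-agnostic sketch of that wrapper (concrete family/spec types omitted; this is not any one engine's real class):

    # Sketch only: the delegation pattern repeated across the engine hunks above.
    class EngineSketch:
        @classmethod
        def match_json_with_reason(cls, llm_family, llm_spec, quantization):
            # Real engines return a MatchResult; here we fake the one attribute we need.
            class _Result:
                is_match = True
            return _Result()

        @classmethod
        def match_json(cls, llm_family, llm_spec, quantization) -> bool:
            result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
            return result.is_match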
From 114ec633ea524f493a3b509fab49b94d1ea444b3 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 12:41:38 +0800
Subject: [PATCH 16/37] adjust implementation approach
---
xinference/model/embedding/core.py | 2 +-
xinference/model/embedding/llama_cpp/core.py | 2 +-
.../model/embedding/sentence_transformers/core.py | 2 +-
xinference/model/llm/llama_cpp/core.py | 2 +-
xinference/model/llm/lmdeploy/core.py | 2 +-
xinference/model/llm/mlx/core.py | 2 +-
xinference/model/llm/sglang/core.py | 2 +-
xinference/model/llm/transformers/core.py | 2 +-
xinference/model/llm/vllm/core.py | 10 +++-------
xinference/model/rerank/core.py | 2 +-
xinference/model/rerank/sentence_transformers/core.py | 2 +-
xinference/model/rerank/vllm/core.py | 2 +-
12 files changed, 14 insertions(+), 18 deletions(-)
diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index c7f5ddb554..6f934b6e5f 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -20,12 +20,12 @@
from collections import defaultdict
from typing import Annotated, Dict, List, Literal, Optional, Union
-from .match_result import MatchResult
from ..._compat import ROOT_KEY, BaseModel, ErrorWrapper, Field, ValidationError
from ...device_utils import empty_cache
from ..core import VirtualEnvSettings
from ..utils import ModelInstanceInfoMixin
from .embed_family import match_embedding
+from .match_result import MatchResult
logger = logging.getLogger(__name__)
diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py
index 932df57f16..4b3d6ed125 100644
--- a/xinference/model/embedding/llama_cpp/core.py
+++ b/xinference/model/embedding/llama_cpp/core.py
@@ -24,9 +24,9 @@
from packaging import version
-from ..match_result import MatchResult
from ....types import Embedding
from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
+from ..match_result import MatchResult
logger = logging.getLogger(__name__)
diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py
index 6cb66f7ca2..29bcb66a33 100644
--- a/xinference/model/embedding/sentence_transformers/core.py
+++ b/xinference/model/embedding/sentence_transformers/core.py
@@ -19,10 +19,10 @@
import numpy as np
import torch
-from ..match_result import MatchResult
from ....types import Embedding, EmbeddingData, EmbeddingUsage
from ...utils import is_flash_attn_available
from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
+from ..match_result import MatchResult
logger = logging.getLogger(__name__)
SENTENCE_TRANSFORMER_MODEL_LIST: List[str] = []
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index 5790c3a3ca..386f8eb662 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -21,11 +21,11 @@
from packaging import version
-from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
from ..core import LLM, chat_context_var
from ..llm_family import LLMFamilyV2, LLMSpecV1
+from ..match_result import MatchResult
from ..utils import ChatModelMixin
logger = logging.getLogger(__name__)
diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py
index 134e668d7a..f1c2605a24 100644
--- a/xinference/model/llm/lmdeploy/core.py
+++ b/xinference/model/llm/lmdeploy/core.py
@@ -18,10 +18,10 @@
import torch
-from ..match_result import MatchResult
from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA
from ..core import LLM
from ..llm_family import LLMFamilyV2, LLMSpecV1
+from ..match_result import MatchResult
from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk
logger = logging.getLogger(__name__)
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index 7f53112ab3..943dddd7c4 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -39,7 +39,6 @@
import xoscar as xo
-from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....fields import max_tokens_field
from ....types import (
@@ -52,6 +51,7 @@
)
from ..core import LLM, chat_context_var
from ..llm_family import LLMFamilyV2, LLMSpecV1
+from ..match_result import MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
QWEN_TOOL_CALL_FAMILY,
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index 9365f2833b..7095289a5d 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -24,7 +24,6 @@
from xoscar.utils import get_next_port
-from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....types import (
ChatCompletion,
@@ -38,6 +37,7 @@
from .. import LLM, LLMFamilyV2, LLMSpecV1
from ..core import chat_context_var
from ..llm_family import CustomLLMFamilyV2
+from ..match_result import MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
QWEN_TOOL_CALL_FAMILY,
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index bc828d65b3..8fae36576d 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -20,7 +20,6 @@
import torch
-from ..match_result import MatchResult
from ....constants import XINFERENCE_MAX_TOKENS
from ....device_utils import (
get_device_preferred_dtype,
@@ -41,6 +40,7 @@
from ...utils import select_device
from ..core import LLM, chat_context_var
from ..llm_family import LLMFamilyV2, LLMSpecV1
+from ..match_result import MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
LLAMA3_TOOL_CALL_FAMILY,
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 7e9d6d3865..7bb0664354 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -42,7 +42,6 @@
from packaging import version
from typing_extensions import NotRequired
-from ..match_result import MatchResult, ErrorType
from ....constants import XINFERENCE_MAX_TOKENS
from ....types import (
ChatCompletion,
@@ -57,6 +56,7 @@
from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1
from ..core import chat_context_var
from ..llm_family import CustomLLMFamilyV2, cache_model_tokenizer_and_config
+from ..match_result import ErrorType, MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
QWEN_TOOL_CALL_FAMILY,
@@ -1812,18 +1812,14 @@ def is_vision_model_supported(
return False
if isinstance(llm_family, CustomLLMFamilyV2):
- if not is_vision_model_supported(
- llm_family.model_family.lower()
- ):
+ if not is_vision_model_supported(llm_family.model_family.lower()):
return MatchResult.failure(
reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}",
error_type=ErrorType.MODEL_COMPATIBILITY,
technical_details=f"Custom vision family: {llm_family.model_family}",
)
else:
- if not is_vision_model_supported(
- llm_family.model_name.lower()
- ):
+ if not is_vision_model_supported(llm_family.model_name.lower()):
return MatchResult.failure(
reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}",
error_type=ErrorType.MODEL_COMPATIBILITY,
diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py
index d3e3b5702c..c02b230abd 100644
--- a/xinference/model/rerank/core.py
+++ b/xinference/model/rerank/core.py
@@ -17,11 +17,11 @@
from collections import defaultdict
from typing import Dict, List, Literal, Optional
-from .match_result import MatchResult
from ..._compat import BaseModel
from ...types import Rerank
from ..core import VirtualEnvSettings
from ..utils import ModelInstanceInfoMixin
+from .match_result import MatchResult
from .rerank_family import check_engine_by_model_name_and_engine, match_rerank
logger = logging.getLogger(__name__)
diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py
index 87efe31b5b..a21d4f106a 100644
--- a/xinference/model/rerank/sentence_transformers/core.py
+++ b/xinference/model/rerank/sentence_transformers/core.py
@@ -22,7 +22,6 @@
import torch
import torch.nn as nn
-from ..match_result import MatchResult
from ....device_utils import empty_cache
from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
from ...utils import is_flash_attn_available
@@ -32,6 +31,7 @@
RerankModelFamilyV2,
RerankSpecV1,
)
+from ..match_result import MatchResult
from ..utils import preprocess_sentence
logger = logging.getLogger(__name__)
diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index 114eef5907..339106f408 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -2,10 +2,10 @@
import uuid
from typing import List, Optional
-from ..match_result import MatchResult
from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
from ...utils import cache_clean
from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
+from ..match_result import MatchResult
SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
From c17b78e521c4b686b74ace48c95a3e7025542a79 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 12:47:39 +0800
Subject: [PATCH 17/37] mypy test
---
xinference/model/embedding/match_result.py | 2 +-
xinference/model/llm/match_result.py | 2 +-
xinference/model/llm/vllm/core.py | 8 ++++----
xinference/model/rerank/match_result.py | 2 +-
xinference/model/utils.py | 2 +-
5 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/xinference/model/embedding/match_result.py b/xinference/model/embedding/match_result.py
index 47775f20f9..3e33c268d4 100644
--- a/xinference/model/embedding/match_result.py
+++ b/xinference/model/embedding/match_result.py
@@ -45,7 +45,7 @@ def failure(
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for API responses."""
- result = {"is_match": self.is_match}
+ result: Dict[str, Any] = {"is_match": self.is_match}
if not self.is_match:
if self.reason:
result["reason"] = self.reason
diff --git a/xinference/model/llm/match_result.py b/xinference/model/llm/match_result.py
index eeff2461f2..3ab90d2c37 100644
--- a/xinference/model/llm/match_result.py
+++ b/xinference/model/llm/match_result.py
@@ -45,7 +45,7 @@ def failure(
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for API responses."""
- result = {"is_match": self.is_match}
+ result: Dict[str, Any] = {"is_match": self.is_match}
if not self.is_match:
if self.reason:
result["reason"] = self.reason
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 7bb0664354..4aeccc0f21 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -1003,7 +1003,7 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool:
return False
if isinstance(llm_family, CustomLLMFamilyV2):
- if not is_model_supported(
+ if not llm_family.model_family or not is_model_supported(
llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS
):
return MatchResult.failure(
@@ -1551,7 +1551,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool:
return False
if isinstance(llm_family, CustomLLMFamilyV2):
- if not is_chat_model_supported(
+ if not llm_family.model_family or not is_chat_model_supported(
llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS
):
return MatchResult.failure(
@@ -1812,14 +1812,14 @@ def is_vision_model_supported(
return False
if isinstance(llm_family, CustomLLMFamilyV2):
- if not is_vision_model_supported(llm_family.model_family.lower()):
+ if not llm_family.model_family or not is_vision_model_supported(llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST):
return MatchResult.failure(
reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}",
error_type=ErrorType.MODEL_COMPATIBILITY,
technical_details=f"Custom vision family: {llm_family.model_family}",
)
else:
- if not is_vision_model_supported(llm_family.model_name.lower()):
+ if not llm_family.model_name or not is_vision_model_supported(llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST):
return MatchResult.failure(
reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}",
error_type=ErrorType.MODEL_COMPATIBILITY,
diff --git a/xinference/model/rerank/match_result.py b/xinference/model/rerank/match_result.py
index 125e791afd..1cd278aa5d 100644
--- a/xinference/model/rerank/match_result.py
+++ b/xinference/model/rerank/match_result.py
@@ -45,7 +45,7 @@ def failure(
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for API responses."""
- result = {"is_match": self.is_match}
+ result: Dict[str, Any] = {"is_match": self.is_match}
if not self.is_match:
if self.reason:
result["reason"] = self.reason
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 383f188382..158fd316c7 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def get_engine_params_by_name(
model_type: Optional[str], model_name: str
) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]:
- engine_params: Dict[str, Any] = {}
+ engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {}
if model_type == "LLM":
from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES
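The three to_dict hunks only add an explicit value type so mypy accepts a dict that mixes bool and str values, and the utils.py hunk does the same for engine_params. A hedged sketch of a MatchResult-like container consistent with those hunks (field names are inferred from the failure(...) call sites in this series; the real classes in xinference/model/*/match_result.py may differ):

    # Hedged sketch, not the project's actual class.
    from dataclasses import dataclass
    from typing import Any, Dict, Optional

    @dataclass
    class MatchResultSketch:
        is_match: bool
        reason: Optional[str] = None
        error_type: Optional[str] = None
        technical_details: Optional[str] = None

        @classmethod
        def failure(
            cls,
            reason: str,
            error_type: Optional[str] = None,
            technical_details: Optional[str] = None,
        ) -> "MatchResultSketch":
            return cls(False, reason, error_type, technical_details)

        def to_dict(self) -> Dict[str, Any]:
            # The explicit annotation is the point of this patch:
            # the dict holds a bool plus optional strings.
            result: Dict[str, Any] = {"is_match": self.is_match}
            if not self.is_match:
                if self.reason:
                    result["reason"] = self.reason
                if self.error_type:
                    result["error_type"] = self.error_type
                if self.technical_details:
                    result["technical_details"] = self.technical_details
            return result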
From b19475109dc12aa0e5266a293db01591ffa69318 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 12:48:43 +0800
Subject: [PATCH 18/37] mypy test
---
xinference/model/llm/vllm/core.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 4aeccc0f21..bf9f07b813 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -1812,14 +1812,18 @@ def is_vision_model_supported(
return False
if isinstance(llm_family, CustomLLMFamilyV2):
- if not llm_family.model_family or not is_vision_model_supported(llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST):
+ if not llm_family.model_family or not is_vision_model_supported(
+ llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST
+ ):
return MatchResult.failure(
reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}",
error_type=ErrorType.MODEL_COMPATIBILITY,
technical_details=f"Custom vision family: {llm_family.model_family}",
)
else:
- if not llm_family.model_name or not is_vision_model_supported(llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST):
+ if not llm_family.model_name or not is_vision_model_supported(
+ llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST
+ ):
return MatchResult.failure(
reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}",
error_type=ErrorType.MODEL_COMPATIBILITY,
From 2aa43d7439da5146906c40a767c3ba03a03f10cb Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 12:55:19 +0800
Subject: [PATCH 19/37] mypy test
---
xinference/model/utils.py | 75 ++++++++++++++++++---------------------
1 file changed, 35 insertions(+), 40 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 158fd316c7..f6db71ee8a 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -542,19 +542,24 @@ def get_engine_params_by_name(
pass
if detailed_error:
- engine_params[engine_name] = detailed_error
+ # Convert error dict to string format for consistency
+ error_parts = [detailed_error.get("error", "Unknown error")]
+ if detailed_error.get("error_type"):
+ error_parts.append(f"Type: {detailed_error['error_type']}")
+ if detailed_error.get("technical_details"):
+ error_parts.append(
+ f"Details: {detailed_error['technical_details']}"
+ )
+ engine_params[engine_name] = " | ".join(error_parts)
else:
# Fallback to basic error checking for backward compatibility
- error_msg = None
+ error_msg: Optional[str] = None
for engine_class in llm_engine_classes:
try:
if hasattr(engine_class, "check_lib"):
lib_available: bool = engine_class.check_lib() # type: ignore[assignment]
if not lib_available:
- error_msg = {
- "error": f"Engine {engine_name} library is not available",
- "error_type": "dependency_missing",
- }
+ error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)"
break
else:
# If no check_lib method, try import check
@@ -575,30 +580,20 @@ def get_engine_params_by_name(
importlib.import_module(module_name)
break
except ImportError as e:
- error_msg = {
- "error": f"Engine {engine_name} library is not installed: {str(e)}",
- "error_type": "dependency_missing",
- }
+ error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)"
except Exception as e:
- error_msg = {
- "error": f"Engine {engine_name} is not available: {str(e)}",
- "error_type": "configuration_error",
- }
+ error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)"
if error_msg is None:
- error_msg = {
- "error": f"Engine {engine_name} is not compatible with current model or environment",
- "error_type": "model_compatibility",
- }
+ error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)"
engine_params[engine_name] = error_msg
except Exception as e:
- # If exception occurs during checking, return structured error
- engine_params[engine_name] = {
- "error": f"Error checking engine {engine_name}: {str(e)}",
- "error_type": "configuration_error",
- }
+ # If exception occurs during checking, return structured error as string
+ engine_params[engine_name] = (
+ f"Error checking engine {engine_name}: {str(e)} (Type: configuration_error)"
+ )
# Filter out llm_class field
for engine, params in engine_params.items():
@@ -606,7 +601,7 @@ def get_engine_params_by_name(
params, list
): # Only process parameter lists of available engines
for param in params:
- if "llm_class" in param:
+ if isinstance(param, dict) and "llm_class" in param:
del param["llm_class"]
return engine_params
@@ -638,7 +633,7 @@ def get_engine_params_by_name(
if engine_name not in engine_params: # Engine not in available list
try:
embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name]
- error_msg = None
+ embedding_error_msg: Optional[str] = None
# Try to find specific error reasons
for embedding_engine_class in embedding_engine_classes:
@@ -646,7 +641,7 @@ def get_engine_params_by_name(
if hasattr(embedding_engine_class, "check_lib"):
embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment]
if not embedding_lib_available:
- error_msg = (
+ embedding_error_msg = (
f"Engine {engine_name} library is not available"
)
break
@@ -671,17 +666,17 @@ def get_engine_params_by_name(
importlib.import_module(module_name)
break
except ImportError as e:
- error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
except Exception as e:
- error_msg = (
+ embedding_error_msg = (
f"Engine {engine_name} is not available: {str(e)}"
)
- if error_msg is None:
- error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+ if embedding_error_msg is None:
+ embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg
+ engine_params[engine_name] = embedding_error_msg
except Exception as e:
# If exception occurs during checking, return error message string
@@ -695,7 +690,7 @@ def get_engine_params_by_name(
params, list
): # Only process parameter lists of available engines
for param in params:
- if "embedding_class" in param:
+ if isinstance(param, dict) and "embedding_class" in param:
del param["embedding_class"]
return engine_params
@@ -725,7 +720,7 @@ def get_engine_params_by_name(
if engine_name not in engine_params: # Engine not in available list
try:
rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name]
- error_msg = None
+ rerank_error_msg: Optional[str] = None
# Try to find specific error reasons
for rerank_engine_class in rerank_engine_classes:
@@ -733,7 +728,7 @@ def get_engine_params_by_name(
if hasattr(rerank_engine_class, "check_lib"):
rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment]
if not rerank_lib_available:
- error_msg = (
+ rerank_error_msg = (
f"Engine {engine_name} library is not available"
)
break
@@ -758,17 +753,17 @@ def get_engine_params_by_name(
importlib.import_module(module_name)
break
except ImportError as e:
- error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
except Exception as e:
- error_msg = (
+ rerank_error_msg = (
f"Engine {engine_name} is not available: {str(e)}"
)
- if error_msg is None:
- error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+ if rerank_error_msg is None:
+ rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
# For unavailable engines, directly return error message string
- engine_params[engine_name] = error_msg
+ engine_params[engine_name] = rerank_error_msg
except Exception as e:
# If exception occurs during checking, return error message string
@@ -782,7 +777,7 @@ def get_engine_params_by_name(
params, list
): # Only process parameter lists of available engines
for param in params:
- if "rerank_class" in param:
+ if isinstance(param, dict) and "rerank_class" in param:
del param["rerank_class"]
return engine_params
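After this patch get_engine_params_by_name returns a mixed-value mapping: engines that load keep their parameter lists, while unavailable engines collapse to a single descriptive string with the error type appended. A hedged usage sketch (the model name and the exact returned values are illustrative, not taken from the patch):

    # Hedged usage sketch; the model name and values are illustrative.
    from xinference.model.utils import get_engine_params_by_name

    params = get_engine_params_by_name("LLM", "qwen2-instruct") or {}
    # Possible shape after this patch:
    # {
    #     "Transformers": [{"model_format": "pytorch", "quantization": "none"}],
    #     "vLLM": "Engine vLLM library is not available (Type: dependency_missing)",
    # }
    for engine, value in params.items():
        if isinstance(value, str):
            print(f"{engine}: unavailable -> {value}")
        else:
            print(f"{engine}: {len(value)} parameter combination(s)")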
From 173e49410bdd6806a59ef6292e7d9d9b71b0f15d Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 14:38:53 +0800
Subject: [PATCH 20/37] mypy test
---
xinference/model/utils.py | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index f6db71ee8a..c34e03ef46 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -35,6 +35,7 @@
Tuple,
Type,
Union,
+ cast,
)
import huggingface_hub
@@ -543,14 +544,16 @@ def get_engine_params_by_name(
if detailed_error:
# Convert error dict to string format for consistency
- error_parts = [detailed_error.get("error", "Unknown error")]
- if detailed_error.get("error_type"):
- error_parts.append(f"Type: {detailed_error['error_type']}")
- if detailed_error.get("technical_details"):
- error_parts.append(
- f"Details: {detailed_error['technical_details']}"
- )
- engine_params[engine_name] = " | ".join(error_parts)
+ error_parts = [detailed_error.get("error") or "Unknown error"]
+ error_type = detailed_error.get("error_type")
+ if error_type:
+ error_parts.append(f"Type: {error_type}")
+ technical_details = detailed_error.get("technical_details")
+ if technical_details:
+ error_parts.append(f"Details: {technical_details}")
+ # Filter out None values and join
+ error_parts_filtered = [part for part in error_parts if part is not None]
+ engine_params[engine_name] = " | ".join(error_parts_filtered)
else:
# Fallback to basic error checking for backward compatibility
error_msg: Optional[str] = None
@@ -600,7 +603,8 @@ def get_engine_params_by_name(
if isinstance(
params, list
): # Only process parameter lists of available engines
- for param in params:
+ assert isinstance(params, list)
+ for param in params: # type: ignore
if isinstance(param, dict) and "llm_class" in param:
del param["llm_class"]
@@ -689,7 +693,8 @@ def get_engine_params_by_name(
if isinstance(
params, list
): # Only process parameter lists of available engines
- for param in params:
+ assert isinstance(params, list)
+ for param in params: # type: ignore
if isinstance(param, dict) and "embedding_class" in param:
del param["embedding_class"]
@@ -776,7 +781,8 @@ def get_engine_params_by_name(
if isinstance(
params, list
): # Only process parameter lists of available engines
- for param in params:
+ assert isinstance(params, list)
+ for param in params: # type: ignore
if isinstance(param, dict) and "rerank_class" in param:
del param["rerank_class"]
From bc41700758bf5f10cbf7897a3d5c1c3ca7142dd9 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 14:40:43 +0800
Subject: [PATCH 21/37] mypy test
---
xinference/model/utils.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index c34e03ef46..3bd7cdb3c3 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -35,7 +35,6 @@
Tuple,
Type,
Union,
- cast,
)
import huggingface_hub
From fc9b422eeaa3752c8bf07b0974558c2305986b80 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 14:41:54 +0800
Subject: [PATCH 22/37] mypy test
---
xinference/model/utils.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 3bd7cdb3c3..6e4a47dda0 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -551,7 +551,9 @@ def get_engine_params_by_name(
if technical_details:
error_parts.append(f"Details: {technical_details}")
# Filter out None values and join
- error_parts_filtered = [part for part in error_parts if part is not None]
+ error_parts_filtered = [
+ part for part in error_parts if part is not None
+ ]
engine_params[engine_name] = " | ".join(error_parts_filtered)
else:
# Fallback to basic error checking for backward compatibility
From 5030b261cc9e57a4debd0ebb93339d7ec6421d29 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 16:44:04 +0800
Subject: [PATCH 23/37] mypy fix
---
xinference/model/utils.py | 27 +++++++++------------------
1 file changed, 9 insertions(+), 18 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 6e4a47dda0..780602dec2 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -600,12 +600,9 @@ def get_engine_params_by_name(
)
# Filter out llm_class field
- for engine, params in engine_params.items():
- if isinstance(
- params, list
- ): # Only process parameter lists of available engines
- assert isinstance(params, list)
- for param in params: # type: ignore
+ for engine in engine_params.keys():
+ if isinstance(engine_params[engine], list): # Only process parameter lists of available engines
+ for param in engine_params[engine]: # type: ignore
if isinstance(param, dict) and "llm_class" in param:
del param["llm_class"]
@@ -690,12 +687,9 @@ def get_engine_params_by_name(
)
# Filter out embedding_class field
- for engine, params in engine_params.items():
- if isinstance(
- params, list
- ): # Only process parameter lists of available engines
- assert isinstance(params, list)
- for param in params: # type: ignore
+ for engine in engine_params.keys():
+ if isinstance(engine_params[engine], list): # Only process parameter lists of available engines
+ for param in engine_params[engine]: # type: ignore
if isinstance(param, dict) and "embedding_class" in param:
del param["embedding_class"]
@@ -778,12 +772,9 @@ def get_engine_params_by_name(
)
# Filter out rerank_class field
- for engine, params in engine_params.items():
- if isinstance(
- params, list
- ): # Only process parameter lists of available engines
- assert isinstance(params, list)
- for param in params: # type: ignore
+ for engine in engine_params.keys():
+ if isinstance(engine_params[engine], list): # Only process parameter lists of available engines
+ for param in engine_params[engine]: # type: ignore
if isinstance(param, dict) and "rerank_class" in param:
del param["rerank_class"]
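The rewritten filter loops have to cope with both value shapes at once, which is why they check the container type before deleting anything. The same behaviour as a standalone sketch on sample data (the sample values are made up):

    # Standalone sketch of the class-field filtering step on made-up data.
    engine_params = {
        "Transformers": [{"llm_class": object, "model_format": "pytorch"}],
        "vLLM": "Engine vLLM library is not available (Type: dependency_missing)",
    }
    for engine in engine_params.keys():
        if isinstance(engine_params[engine], list):  # parameter lists only, never error strings
            for param in engine_params[engine]:
                if isinstance(param, dict) and "llm_class" in param:
                    del param["llm_class"]
    assert engine_params["Transformers"] == [{"model_format": "pytorch"}]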
From cf517326630651f59e5873e1fa501a3a67dc2908 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 16:47:59 +0800
Subject: [PATCH 24/37] mypy fix
---
xinference/model/utils.py | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 780602dec2..c0c5233128 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -601,7 +601,9 @@ def get_engine_params_by_name(
# Filter out llm_class field
for engine in engine_params.keys():
- if isinstance(engine_params[engine], list): # Only process parameter lists of available engines
+ if isinstance(
+ engine_params[engine], list
+ ): # Only process parameter lists of available engines
for param in engine_params[engine]: # type: ignore
if isinstance(param, dict) and "llm_class" in param:
del param["llm_class"]
@@ -688,7 +690,9 @@ def get_engine_params_by_name(
# Filter out embedding_class field
for engine in engine_params.keys():
- if isinstance(engine_params[engine], list): # Only process parameter lists of available engines
+ if isinstance(
+ engine_params[engine], list
+ ): # Only process parameter lists of available engines
for param in engine_params[engine]: # type: ignore
if isinstance(param, dict) and "embedding_class" in param:
del param["embedding_class"]
@@ -773,7 +777,9 @@ def get_engine_params_by_name(
# Filter out rerank_class field
for engine in engine_params.keys():
- if isinstance(engine_params[engine], list): # Only process parameter lists of available engines
+ if isinstance(
+ engine_params[engine], list
+ ): # Only process parameter lists of available engines
for param in engine_params[engine]: # type: ignore
if isinstance(param, dict) and "rerank_class" in param:
del param["rerank_class"]
From 0660aaba3e420a332b7f3934e3a70a321f3452c6 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 17:54:22 +0800
Subject: [PATCH 25/37] mypy fix
---
xinference/model/utils.py | 190 +++++++++++++++++++++++++++-----------
1 file changed, 137 insertions(+), 53 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index c0c5233128..96beec9618 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -14,7 +14,6 @@
import asyncio
import functools
-import importlib.util
import json
import logging
import os
@@ -566,22 +565,65 @@ def get_engine_params_by_name(
error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)"
break
else:
- # If no check_lib method, try import check
- module_name = engine_name.lower().replace(".", "")
- if engine_name == "vLLM":
- module_name = "vllm"
- elif engine_name == "SGLang":
- module_name = "sglang"
- elif engine_name == "llama.cpp":
- module_name = "llama_cpp"
- elif engine_name == "MLX":
- module_name = "mlx"
- elif engine_name == "LMDEPLOY":
- module_name = "lmdeploy"
- elif engine_name == "Transformers":
- module_name = "transformers"
-
- importlib.import_module(module_name)
+ # If no check_lib method, try to use engine's match method for compatibility check
+ # This provides more detailed and accurate error information
+ try:
+ # Create a minimal test spec if we don't have real model specs
+ from .llm.llm_family import (
+ LLMFamilyV2,
+ PytorchLLMSpecV2,
+ )
+
+ # Create a minimal test case
+ test_family = LLMFamilyV2(
+ model_name="test",
+ model_family="test",
+ model_specs=[
+ PytorchLLMSpecV2(
+ model_format="pytorch",
+ quantization="none",
+ )
+ ],
+ )
+ test_spec = test_family.model_specs[0]
+
+ # Use the engine's match method if available
+ if hasattr(
+ engine_class, "match_json_with_reason"
+ ):
+ result = (
+ engine_class.match_json_with_reason(
+ test_family, test_spec, "none"
+ )
+ )
+ if result.is_match:
+ break # Engine is available
+ else:
+ error_msg = f"Engine {engine_name}: {result.reason}"
+ if result.error_type:
+ error_msg += (
+ f" (Type: {result.error_type})"
+ )
+ break
+ elif hasattr(engine_class, "match_json"):
+ # Fallback to simple match method - use test data
+ if engine_class.match_json(
+ test_family, test_spec, "none"
+ ):
+ break
+ else:
+ error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)"
+ break
+ else:
+ # Final fallback: generic import check
+ raise ImportError(
+ "No compatibility check method available"
+ )
+
+ except ImportError as e:
+ error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)"
+ except Exception as e:
+ error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)"
break
except ImportError as e:
error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)"
@@ -650,24 +692,45 @@ def get_engine_params_by_name(
)
break
else:
- # If no check_lib method, try import check
- module_name = engine_name.lower().replace(".", "")
- if engine_name == "vLLM":
- module_name = "vllm"
- elif engine_name == "SGLang":
- module_name = "sglang"
- elif engine_name == "llama.cpp":
- module_name = "llama_cpp"
- elif engine_name == "MLX":
- module_name = "mlx"
- elif engine_name == "LMDEPLOY":
- module_name = "lmdeploy"
- elif engine_name == "Transformers":
- module_name = "transformers"
- elif engine_name == "SentenceTransformers":
- module_name = "sentence_transformers"
-
- importlib.import_module(module_name)
+ # If no check_lib method, try to use engine's match method for compatibility check
+ try:
+ from .embedding.core import (
+ EmbeddingModelFamilyV2,
+ TransformersEmbeddingSpecV1,
+ )
+
+ # Use the engine's match method if available
+ if hasattr(embedding_engine_class, "match"):
+ # Create a minimal test case
+ test_family = EmbeddingModelFamilyV2(
+ model_name="test",
+ model_specs=[
+ TransformersEmbeddingSpecV1(
+ model_format="pytorch",
+ quantization="none",
+ )
+ ],
+ )
+ test_spec = test_family.model_specs[0]
+
+ # Use the engine's match method to check compatibility
+ if embedding_engine_class.match(
+ test_family, test_spec, "none"
+ ):
+ break # Engine is available
+ else:
+ embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+ break
+ else:
+ # Final fallback: generic import check
+ raise ImportError(
+ "No compatibility check method available"
+ )
+
+ except ImportError as e:
+ embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ except Exception as e:
+ embedding_error_msg = f"Engine {engine_name} is not available: {str(e)}"
break
except ImportError as e:
embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
@@ -737,24 +800,45 @@ def get_engine_params_by_name(
)
break
else:
- # If no check_lib method, try import check
- module_name = engine_name.lower().replace(".", "")
- if engine_name == "vLLM":
- module_name = "vllm"
- elif engine_name == "SGLang":
- module_name = "sglang"
- elif engine_name == "llama.cpp":
- module_name = "llama_cpp"
- elif engine_name == "MLX":
- module_name = "mlx"
- elif engine_name == "LMDEPLOY":
- module_name = "lmdeploy"
- elif engine_name == "Transformers":
- module_name = "transformers"
- elif engine_name == "SentenceTransformers":
- module_name = "sentence_transformers"
-
- importlib.import_module(module_name)
+ # If no check_lib method, try to use engine's match method for compatibility check
+ try:
+ from .rerank.core import (
+ RerankModelFamilyV2,
+ RerankSpecV1,
+ )
+
+ # Use the engine's match method if available
+ if hasattr(rerank_engine_class, "match"):
+ # Create a minimal test case
+ test_family = RerankModelFamilyV2(
+ model_name="test",
+ model_specs=[
+ RerankSpecV1(
+ model_format="pytorch",
+ quantization="none",
+ )
+ ],
+ )
+ test_spec = test_family.model_specs[0]
+
+ # Use the engine's match method to check compatibility
+ if rerank_engine_class.match(
+ test_family, test_spec, "none"
+ ):
+ break # Engine is available
+ else:
+ rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+ break
+ else:
+ # Final fallback: generic import check
+ raise ImportError(
+ "No compatibility check method available"
+ )
+
+ except ImportError as e:
+ rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ except Exception as e:
+ rerank_error_msg = f"Engine {engine_name} is not available: {str(e)}"
break
except ImportError as e:
rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
From 996f3cdc0040312c6f8d6587dffeaa74c925f656 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 21 Oct 2025 18:17:59 +0800
Subject: [PATCH 26/37] mypy fix
---
xinference/model/utils.py | 225 +++++++++++++++++++++++++-------------
1 file changed, 150 insertions(+), 75 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 96beec9618..0ed516085d 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -541,28 +541,19 @@ def get_engine_params_by_name(
pass
if detailed_error:
- # Convert error dict to string format for consistency
- error_parts = [detailed_error.get("error") or "Unknown error"]
- error_type = detailed_error.get("error_type")
- if error_type:
- error_parts.append(f"Type: {error_type}")
- technical_details = detailed_error.get("technical_details")
- if technical_details:
- error_parts.append(f"Details: {technical_details}")
- # Filter out None values and join
- error_parts_filtered = [
- part for part in error_parts if part is not None
+ # Convert error dict to array format with error, type, details fields
+ engine_params[engine_name] = [
+ f"error: {detailed_error.get('error') or 'Unknown error'}",
+ f"type: {detailed_error.get('error_type') or 'unknown'}",
+ f"details: {detailed_error.get('technical_details') or 'No additional details available'}",
]
- engine_params[engine_name] = " | ".join(error_parts_filtered)
else:
# Fallback to basic error checking for backward compatibility
- error_msg: Optional[str] = None
for engine_class in llm_engine_classes:
try:
if hasattr(engine_class, "check_lib"):
lib_available: bool = engine_class.check_lib() # type: ignore[assignment]
if not lib_available:
- error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)"
break
else:
# If no check_lib method, try to use engine's match method for compatibility check
@@ -599,11 +590,12 @@ def get_engine_params_by_name(
if result.is_match:
break # Engine is available
else:
- error_msg = f"Engine {engine_name}: {result.reason}"
- if result.error_type:
- error_msg += (
- f" (Type: {result.error_type})"
- )
+ # Create array format for match method errors
+ engine_params[engine_name] = [
+ f"error: Engine {engine_name}: {result.reason}",
+ f"type: {result.error_type or 'model_compatibility'}",
+ f"details: Engine {engine_name} compatibility check failed: {result.reason}",
+ ]
break
elif hasattr(engine_class, "match_json"):
# Fallback to simple match method - use test data
@@ -612,7 +604,6 @@ def get_engine_params_by_name(
):
break
else:
- error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)"
break
else:
# Final fallback: generic import check
@@ -621,25 +612,49 @@ def get_engine_params_by_name(
)
except ImportError as e:
- error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)"
+ engine_params[engine_name] = [
+ f"error: Engine {engine_name} library is not installed: {str(e)}",
+ f"type: dependency_missing",
+ f"details: Missing required dependency for {engine_name} engine: {str(e)}",
+ ]
+ break
except Exception as e:
- error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)"
- break
+ engine_params[engine_name] = [
+ f"error: Engine {engine_name} is not available: {str(e)}",
+ f"type: configuration_error",
+ f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
+ ]
+ break
except ImportError as e:
- error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)"
+ engine_params[engine_name] = [
+ f"error: Engine {engine_name} library is not installed: {str(e)}",
+ f"type: dependency_missing",
+ f"details: Missing required dependency for {engine_name} engine: {str(e)}",
+ ]
+ break
except Exception as e:
- error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)"
-
- if error_msg is None:
- error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)"
+ engine_params[engine_name] = [
+ f"error: Engine {engine_name} is not available: {str(e)}",
+ f"type: configuration_error",
+ f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
+ ]
+ break
- engine_params[engine_name] = error_msg
+ # Only set default error if not already set by one of the exception handlers
+ if engine_name not in engine_params:
+ engine_params[engine_name] = [
+ f"error: Engine {engine_name} is not compatible with current model or environment",
+ f"type: model_compatibility",
+ f"details: The {engine_name} engine cannot handle the current model configuration",
+ ]
except Exception as e:
- # If exception occurs during checking, return structured error as string
- engine_params[engine_name] = (
- f"Error checking engine {engine_name}: {str(e)} (Type: configuration_error)"
- )
+ # If exception occurs during checking, return structured error as array
+ engine_params[engine_name] = [
+ f"error: Error checking engine {engine_name}: {str(e)}",
+ f"type: configuration_error",
+ f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}",
+ ]
# Filter out llm_class field
for engine in engine_params.keys():
@@ -679,7 +694,7 @@ def get_engine_params_by_name(
if engine_name not in engine_params: # Engine not in available list
try:
embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name]
- embedding_error_msg: Optional[str] = None
+ embedding_error_details: Optional[Dict[str, str]] = None
# Try to find specific error reasons
for embedding_engine_class in embedding_engine_classes:
@@ -687,9 +702,11 @@ def get_engine_params_by_name(
if hasattr(embedding_engine_class, "check_lib"):
embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment]
if not embedding_lib_available:
- embedding_error_msg = (
- f"Engine {engine_name} library is not available"
- )
+ embedding_error_details = {
+ "error": f"Engine {engine_name} library is not available",
+ "error_type": "dependency_missing",
+ "technical_details": f"The required library for {engine_name} engine is not installed or not accessible",
+ }
break
else:
# If no check_lib method, try to use engine's match method for compatibility check
@@ -719,7 +736,11 @@ def get_engine_params_by_name(
):
break # Engine is available
else:
- embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+ embedding_error_details = {
+ "error": f"Engine {engine_name} is not compatible with current model or environment",
+ "error_type": "model_compatibility",
+ "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration",
+ }
break
else:
# Final fallback: generic import check
@@ -728,28 +749,52 @@ def get_engine_params_by_name(
)
except ImportError as e:
- embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ embedding_error_details = {
+ "error": f"Engine {engine_name} library is not installed: {str(e)}",
+ "error_type": "dependency_missing",
+ "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}",
+ }
except Exception as e:
- embedding_error_msg = f"Engine {engine_name} is not available: {str(e)}"
+ embedding_error_details = {
+ "error": f"Engine {engine_name} is not available: {str(e)}",
+ "error_type": "configuration_error",
+ "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
+ }
break
except ImportError as e:
- embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ embedding_error_details = {
+ "error": f"Engine {engine_name} library is not installed: {str(e)}",
+ "error_type": "dependency_missing",
+ "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}",
+ }
except Exception as e:
- embedding_error_msg = (
- f"Engine {engine_name} is not available: {str(e)}"
- )
-
- if embedding_error_msg is None:
- embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
-
- # For unavailable engines, directly return error message string
- engine_params[engine_name] = embedding_error_msg
+ embedding_error_details = {
+ "error": f"Engine {engine_name} is not available: {str(e)}",
+ "error_type": "configuration_error",
+ "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
+ }
+
+ if embedding_error_details is None:
+ embedding_error_details = {
+ "error": f"Engine {engine_name} is not compatible with current model or environment",
+ "error_type": "model_compatibility",
+ "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration",
+ }
+
+ # For unavailable engines, format error message as array like LLM
+ engine_params[engine_name] = [
+ f"error: {embedding_error_details.get('error') or 'Unknown error'}",
+ f"type: {embedding_error_details.get('error_type') or 'unknown'}",
+ f"details: {embedding_error_details.get('technical_details') or 'No additional details available'}",
+ ]
except Exception as e:
- # If exception occurs during checking, return error message string
- engine_params[engine_name] = (
- f"Error checking engine {engine_name}: {str(e)}"
- )
+ # If exception occurs during checking, return structured error as array like LLM
+ engine_params[engine_name] = [
+ f"error: Error checking engine {engine_name}: {str(e)}",
+ f"type: configuration_error",
+ f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}",
+ ]
# Filter out embedding_class field
for engine in engine_params.keys():
@@ -787,7 +832,7 @@ def get_engine_params_by_name(
if engine_name not in engine_params: # Engine not in available list
try:
rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name]
- rerank_error_msg: Optional[str] = None
+ rerank_error_details: Optional[Dict[str, str]] = None
# Try to find specific error reasons
for rerank_engine_class in rerank_engine_classes:
@@ -795,9 +840,11 @@ def get_engine_params_by_name(
if hasattr(rerank_engine_class, "check_lib"):
rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment]
if not rerank_lib_available:
- rerank_error_msg = (
- f"Engine {engine_name} library is not available"
- )
+ rerank_error_details = {
+ "error": f"Engine {engine_name} library is not available",
+ "error_type": "dependency_missing",
+ "technical_details": f"The required library for {engine_name} engine is not installed or not accessible",
+ }
break
else:
# If no check_lib method, try to use engine's match method for compatibility check
@@ -827,7 +874,11 @@ def get_engine_params_by_name(
):
break # Engine is available
else:
- rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
+ rerank_error_details = {
+ "error": f"Engine {engine_name} is not compatible with current model or environment",
+ "error_type": "model_compatibility",
+ "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration",
+ }
break
else:
# Final fallback: generic import check
@@ -836,28 +887,52 @@ def get_engine_params_by_name(
)
except ImportError as e:
- rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ rerank_error_details = {
+ "error": f"Engine {engine_name} library is not installed: {str(e)}",
+ "error_type": "dependency_missing",
+ "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}",
+ }
except Exception as e:
- rerank_error_msg = f"Engine {engine_name} is not available: {str(e)}"
+ rerank_error_details = {
+ "error": f"Engine {engine_name} is not available: {str(e)}",
+ "error_type": "configuration_error",
+ "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
+ }
break
except ImportError as e:
- rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}"
+ rerank_error_details = {
+ "error": f"Engine {engine_name} library is not installed: {str(e)}",
+ "error_type": "dependency_missing",
+ "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}",
+ }
except Exception as e:
- rerank_error_msg = (
- f"Engine {engine_name} is not available: {str(e)}"
- )
-
- if rerank_error_msg is None:
- rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment"
-
- # For unavailable engines, directly return error message string
- engine_params[engine_name] = rerank_error_msg
+ rerank_error_details = {
+ "error": f"Engine {engine_name} is not available: {str(e)}",
+ "error_type": "configuration_error",
+ "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
+ }
+
+ if rerank_error_details is None:
+ rerank_error_details = {
+ "error": f"Engine {engine_name} is not compatible with current model or environment",
+ "error_type": "model_compatibility",
+ "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration",
+ }
+
+ # For unavailable engines, format error message as array like LLM
+ engine_params[engine_name] = [
+ f"error: {rerank_error_details.get('error') or 'Unknown error'}",
+ f"type: {rerank_error_details.get('error_type') or 'unknown'}",
+ f"details: {rerank_error_details.get('technical_details') or 'No additional details available'}",
+ ]
except Exception as e:
- # If exception occurs during checking, return error message string
- engine_params[engine_name] = (
- f"Error checking engine {engine_name}: {str(e)}"
- )
+ # If exception occurs during checking, return structured error as array like LLM
+ engine_params[engine_name] = [
+ f"error: Error checking engine {engine_name}: {str(e)}",
+ f"type: configuration_error",
+ f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}",
+ ]
# Filter out rerank_class field
for engine in engine_params.keys():
From 41b0735eec9c293dabea4d8c6965c8c736b51e09 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 22 Oct 2025 10:02:34 +0800
Subject: [PATCH 27/37] mypy fix
---
xinference/model/utils.py | 111 +++++++++++++++-----------------------
1 file changed, 44 insertions(+), 67 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 0ed516085d..146f145513 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -541,12 +541,10 @@ def get_engine_params_by_name(
pass
if detailed_error:
- # Convert error dict to array format with error, type, details fields
- engine_params[engine_name] = [
- f"error: {detailed_error.get('error') or 'Unknown error'}",
- f"type: {detailed_error.get('error_type') or 'unknown'}",
- f"details: {detailed_error.get('technical_details') or 'No additional details available'}",
- ]
+ # Return only the error message without engine_name prefix (key already contains engine name)
+ engine_params[engine_name] = (
+ detailed_error.get("error") or "Unknown error"
+ )
else:
# Fallback to basic error checking for backward compatibility
for engine_class in llm_engine_classes:
@@ -590,12 +588,11 @@ def get_engine_params_by_name(
if result.is_match:
break # Engine is available
else:
- # Create array format for match method errors
- engine_params[engine_name] = [
- f"error: Engine {engine_name}: {result.reason}",
- f"type: {result.error_type or 'model_compatibility'}",
- f"details: Engine {engine_name} compatibility check failed: {result.reason}",
- ]
+ # Return only the error message without engine_name prefix (key already contains engine name)
+ engine_params[engine_name] = (
+ result.reason
+ or "Unknown compatibility error"
+ )
break
elif hasattr(engine_class, "match_json"):
# Fallback to simple match method - use test data
@@ -612,49 +609,37 @@ def get_engine_params_by_name(
)
except ImportError as e:
- engine_params[engine_name] = [
- f"error: Engine {engine_name} library is not installed: {str(e)}",
- f"type: dependency_missing",
- f"details: Missing required dependency for {engine_name} engine: {str(e)}",
- ]
+ engine_params[engine_name] = (
+ f"Engine {engine_name} library is not installed: {str(e)}"
+ )
break
except Exception as e:
- engine_params[engine_name] = [
- f"error: Engine {engine_name} is not available: {str(e)}",
- f"type: configuration_error",
- f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
- ]
+ engine_params[engine_name] = (
+ f"Engine {engine_name} is not available: {str(e)}"
+ )
break
except ImportError as e:
- engine_params[engine_name] = [
- f"error: Engine {engine_name} library is not installed: {str(e)}",
- f"type: dependency_missing",
- f"details: Missing required dependency for {engine_name} engine: {str(e)}",
- ]
+ engine_params[engine_name] = (
+ f"Engine {engine_name} library is not installed: {str(e)}"
+ )
break
except Exception as e:
- engine_params[engine_name] = [
- f"error: Engine {engine_name} is not available: {str(e)}",
- f"type: configuration_error",
- f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}",
- ]
+ engine_params[engine_name] = (
+ f"Engine {engine_name} is not available: {str(e)}"
+ )
break
# Only set default error if not already set by one of the exception handlers
if engine_name not in engine_params:
- engine_params[engine_name] = [
- f"error: Engine {engine_name} is not compatible with current model or environment",
- f"type: model_compatibility",
- f"details: The {engine_name} engine cannot handle the current model configuration",
- ]
+ engine_params[engine_name] = (
+ f"Engine {engine_name} is not compatible with current model or environment"
+ )
except Exception as e:
- # If exception occurs during checking, return structured error as array
- engine_params[engine_name] = [
- f"error: Error checking engine {engine_name}: {str(e)}",
- f"type: configuration_error",
- f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}",
- ]
+ # If exception occurs during checking, return simple string format
+ engine_params[engine_name] = (
+ f"Error checking engine {engine_name}: {str(e)}"
+ )
# Filter out llm_class field
for engine in engine_params.keys():
@@ -781,20 +766,16 @@ def get_engine_params_by_name(
"technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration",
}
- # For unavailable engines, format error message as array like LLM
- engine_params[engine_name] = [
- f"error: {embedding_error_details.get('error') or 'Unknown error'}",
- f"type: {embedding_error_details.get('error_type') or 'unknown'}",
- f"details: {embedding_error_details.get('technical_details') or 'No additional details available'}",
- ]
+ # For unavailable engines, return simple string format
+ engine_params[engine_name] = (
+ embedding_error_details.get("error") or "Unknown error"
+ )
except Exception as e:
- # If exception occurs during checking, return structured error as array like LLM
- engine_params[engine_name] = [
- f"error: Error checking engine {engine_name}: {str(e)}",
- f"type: configuration_error",
- f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}",
- ]
+ # If exception occurs during checking, return simple string format
+ engine_params[engine_name] = (
+ f"Error checking engine {engine_name}: {str(e)}"
+ )
# Filter out embedding_class field
for engine in engine_params.keys():
@@ -919,20 +900,16 @@ def get_engine_params_by_name(
"technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration",
}
- # For unavailable engines, format error message as array like LLM
- engine_params[engine_name] = [
- f"error: {rerank_error_details.get('error') or 'Unknown error'}",
- f"type: {rerank_error_details.get('error_type') or 'unknown'}",
- f"details: {rerank_error_details.get('technical_details') or 'No additional details available'}",
- ]
+ # For unavailable engines, return simple string format
+ engine_params[engine_name] = (
+ rerank_error_details.get("error") or "Unknown error"
+ )
except Exception as e:
- # If exception occurs during checking, return structured error as array like LLM
- engine_params[engine_name] = [
- f"error: Error checking engine {engine_name}: {str(e)}",
- f"type: configuration_error",
- f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}",
- ]
+ # If exception occurs during checking, return simple string format
+ engine_params[engine_name] = (
+ f"Error checking engine {engine_name}: {str(e)}"
+ )
# Filter out rerank_class field
for engine in engine_params.keys():
From c760a589e971d5db5dbd97582010a5736e633c55 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 22 Oct 2025 16:09:06 +0800
Subject: [PATCH 28/37] Modify class name
---
xinference/model/embedding/core.py | 2 +-
xinference/model/embedding/llama_cpp/core.py | 4 ++--
.../embedding/sentence_transformers/core.py | 4 ++--
xinference/model/llm/core.py | 2 +-
xinference/model/llm/llama_cpp/core.py | 4 ++--
xinference/model/llm/lmdeploy/core.py | 8 ++++----
xinference/model/llm/mlx/core.py | 14 +++++++-------
xinference/model/llm/sglang/core.py | 16 ++++++++--------
xinference/model/llm/transformers/core.py | 4 ++--
xinference/model/llm/vllm/core.py | 16 ++++++++--------
xinference/model/rerank/core.py | 2 +-
.../model/rerank/sentence_transformers/core.py | 4 ++--
xinference/model/rerank/vllm/core.py | 4 ++--
xinference/model/utils.py | 10 +++++-----
14 files changed, 47 insertions(+), 47 deletions(-)
diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index 6f934b6e5f..b68e5236ca 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -173,7 +173,7 @@ def match_json(
pass
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls,
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py
index 4b3d6ed125..d84434384f 100644
--- a/xinference/model/embedding/llama_cpp/core.py
+++ b/xinference/model/embedding/llama_cpp/core.py
@@ -237,11 +237,11 @@ def match_json(
quantization: str,
) -> bool:
- result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ result = cls.match_with_reason(model_family, model_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls,
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py
index 29bcb66a33..c1789f9912 100644
--- a/xinference/model/embedding/sentence_transformers/core.py
+++ b/xinference/model/embedding/sentence_transformers/core.py
@@ -436,11 +436,11 @@ def match_json(
quantization: str,
) -> bool:
- result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ result = cls.match_with_reason(model_family, model_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls,
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
index 2626060579..3020483219 100644
--- a/xinference/model/llm/core.py
+++ b/xinference/model/llm/core.py
@@ -161,7 +161,7 @@ def match_json(
raise NotImplementedError
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
"""
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index 386f8eb662..e8ff96f83b 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -88,11 +88,11 @@ def match_json(
cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py
index f1c2605a24..90115dec06 100644
--- a/xinference/model/llm/lmdeploy/core.py
+++ b/xinference/model/llm/lmdeploy/core.py
@@ -123,11 +123,11 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
@@ -190,11 +190,11 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index 943dddd7c4..ff6b2e51ea 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -413,11 +413,11 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
@@ -773,17 +773,17 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
# Use base class validation first
- base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ base_result = super().match_with_reason(llm_family, llm_spec, quantization)
if not base_result.is_match:
return base_result
@@ -857,11 +857,11 @@ def check_lib(cls) -> bool:
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index 7095289a5d..d22a157777 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -344,11 +344,11 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
@@ -729,17 +729,17 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
# Use base class validation first
- base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ base_result = super().match_with_reason(llm_family, llm_spec, quantization)
if not base_result.is_match:
return base_result
@@ -860,17 +860,17 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
# Use base class validation first
- base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ base_result = super().match_with_reason(llm_family, llm_spec, quantization)
if not base_result.is_match:
return base_result
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index 8fae36576d..5a4a9f557d 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -502,11 +502,11 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index bf9f07b813..bc0eede4c0 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -883,11 +883,11 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
@@ -1461,17 +1461,17 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
from ..match_result import ErrorType, MatchResult
# Use base class validation first
- base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ base_result = super().match_with_reason(llm_family, llm_spec, quantization)
if not base_result.is_match:
return base_result
@@ -1739,16 +1739,16 @@ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+ result = cls.match_with_reason(llm_family, llm_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> "MatchResult":
# Use base class validation first
- base_result = super().match_json_with_reason(llm_family, llm_spec, quantization)
+ base_result = super().match_with_reason(llm_family, llm_spec, quantization)
if not base_result.is_match:
return base_result
diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py
index c02b230abd..2d3edde1c2 100644
--- a/xinference/model/rerank/core.py
+++ b/xinference/model/rerank/core.py
@@ -133,7 +133,7 @@ def match_json(
pass
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls,
model_family: RerankModelFamilyV2,
model_spec: RerankSpecV1,
diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py
index a21d4f106a..42332bc477 100644
--- a/xinference/model/rerank/sentence_transformers/core.py
+++ b/xinference/model/rerank/sentence_transformers/core.py
@@ -344,11 +344,11 @@ def match_json(
) -> bool:
pass
- result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ result = cls.match_with_reason(model_family, model_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls,
model_family: RerankModelFamilyV2,
model_spec: RerankSpecV1,
diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index 339106f408..c2ee75cfef 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -151,11 +151,11 @@ def match_json(
quantization: str,
) -> bool:
- result = cls.match_json_with_reason(model_family, model_spec, quantization)
+ result = cls.match_with_reason(model_family, model_spec, quantization)
return result.is_match
@classmethod
- def match_json_with_reason(
+ def match_with_reason(
cls,
model_family: RerankModelFamilyV2,
model_spec: RerankSpecV1,
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 146f145513..e27c93d851 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -499,7 +499,7 @@ def get_engine_params_by_name(
try:
llm_engine_classes = SUPPORTED_ENGINES[engine_name]
- # Try to get detailed error information from engine's match_json_with_reason
+ # Try to get detailed error information from engine's match_with_reason
detailed_error = None
# We need a sample model to test against, use the first available spec
@@ -517,12 +517,12 @@ def get_engine_params_by_name(
for engine_class in llm_engine_classes:
try:
if hasattr(
- engine_class, "match_json_with_reason"
+ engine_class, "match_with_reason"
):
pass
result = (
- engine_class.match_json_with_reason(
+ engine_class.match_with_reason(
llm_family, llm_spec, quantization
)
)
@@ -578,10 +578,10 @@ def get_engine_params_by_name(
# Use the engine's match method if available
if hasattr(
- engine_class, "match_json_with_reason"
+ engine_class, "match_with_reason"
):
result = (
- engine_class.match_json_with_reason(
+ engine_class.match_with_reason(
test_family, test_spec, "none"
)
)
From 6615014c8faae1821c90b0862339672ee215ca9a Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 22 Oct 2025 16:12:15 +0800
Subject: [PATCH 29/37] Modify class name
---
xinference/model/utils.py | 20 ++++++--------------
1 file changed, 6 insertions(+), 14 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index e27c93d851..377259af77 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -516,15 +516,11 @@ def get_engine_params_by_name(
# Test each engine class for detailed error info
for engine_class in llm_engine_classes:
try:
- if hasattr(
- engine_class, "match_with_reason"
- ):
+ if hasattr(engine_class, "match_with_reason"):
pass
- result = (
- engine_class.match_with_reason(
- llm_family, llm_spec, quantization
- )
+ result = engine_class.match_with_reason(
+ llm_family, llm_spec, quantization
)
if not result.is_match:
detailed_error = {
@@ -577,13 +573,9 @@ def get_engine_params_by_name(
test_spec = test_family.model_specs[0]
# Use the engine's match method if available
- if hasattr(
- engine_class, "match_with_reason"
- ):
- result = (
- engine_class.match_with_reason(
- test_family, test_spec, "none"
- )
+ if hasattr(engine_class, "match_with_reason"):
+ result = engine_class.match_with_reason(
+ test_family, test_spec, "none"
)
if result.is_match:
break # Engine is available
From 2105c83392399ae5eb800bb6f00a19422e81d25d Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 22 Oct 2025 16:44:35 +0800
Subject: [PATCH 30/37] improve error handling in engine availability checks
---
xinference/model/utils.py | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 377259af77..ea7adb309e 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -529,12 +529,21 @@ def get_engine_params_by_name(
"technical_details": result.technical_details,
}
break
- except Exception:
- # Fall back to next engine class
+ except Exception as e:
+ # Fall back to next engine class with clear error logging
+ logger.warning(
+ f"Engine class {engine_class.__name__} match_with_reason failed: {e}"
+ )
+                            # Continue with the next engine class; a failure here is expected as part of the fallback path
continue
- except Exception:
- # If we can't get model family, continue with basic checking
- pass
+ except Exception as e:
+ # If we can't get model family, fail with clear error
+ logger.error(
+ f"Failed to get model family for {model_name} (LLM): {e}"
+ )
+ raise RuntimeError(
+ f"Unable to process LLM model {model_name}: {e}"
+ )
if detailed_error:
# Return only the error message without engine_name prefix (key already contains engine name)
From eb1bb43dc6358228ff7462f002ff1d62348eda56 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 29 Oct 2025 14:21:15 +0800
Subject: [PATCH 31/37] new engine ability display
---
xinference/model/embedding/core.py | 53 +--
xinference/model/embedding/flag/core.py | 17 +-
xinference/model/embedding/llama_cpp/core.py | 59 +--
.../embedding/sentence_transformers/core.py | 70 +---
xinference/model/embedding/vllm/core.py | 17 +-
xinference/model/llm/core.py | 48 +--
xinference/model/llm/llama_cpp/core.py | 68 +--
xinference/model/llm/lmdeploy/core.py | 70 +---
xinference/model/llm/mlx/core.py | 165 ++------
xinference/model/llm/sglang/core.py | 387 ++++++++++--------
xinference/model/llm/transformers/core.py | 74 +---
xinference/model/llm/vllm/core.py | 274 +++----------
xinference/model/rerank/core.py | 55 +--
.../rerank/sentence_transformers/core.py | 68 +--
xinference/model/rerank/vllm/core.py | 67 +--
xinference/model/utils.py | 165 ++++++--
16 files changed, 591 insertions(+), 1066 deletions(-)
diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index b68e5236ca..42f39049f6 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -25,7 +25,6 @@
from ..core import VirtualEnvSettings
from ..utils import ModelInstanceInfoMixin
from .embed_family import match_embedding
-from .match_result import MatchResult
logger = logging.getLogger(__name__)
@@ -159,7 +158,7 @@ def __init__(
@classmethod
@abstractmethod
- def check_lib(cls) -> bool:
+ def check_lib(cls) -> Union[bool, str]:
pass
@classmethod
@@ -169,62 +168,24 @@ def match_json(
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
quantization: str,
- ) -> bool:
+ ) -> Union[bool, str]:
pass
- @classmethod
- def match_with_reason(
- cls,
- model_family: EmbeddingModelFamilyV2,
- model_spec: EmbeddingSpecV1,
- quantization: str,
- ) -> "MatchResult":
- """
- Check if the engine can handle the given embedding model with detailed error information.
-
- This method provides detailed failure reasons and suggestions when an engine
- cannot handle a specific model configuration. The default implementation
- falls back to the boolean match_json method for backward compatibility.
-
- Args:
- model_family: The embedding model family information
- model_spec: The model specification
- quantization: The quantization method
-
- Returns:
- MatchResult: Detailed match result with reasons and suggestions
- """
- from .match_result import ErrorType, MatchResult
-
- # Default implementation for backward compatibility
- if cls.match_json(model_family, model_spec, quantization):
- return MatchResult.success()
- else:
- # Get basic reason based on common failure patterns
- if not cls.check_lib():
- return MatchResult.failure(
- reason=f"Required library for {cls.__name__} is not available",
- error_type=ErrorType.DEPENDENCY_MISSING,
- )
- else:
- return MatchResult.failure(
- reason=f"Embedding model configuration is not compatible with {cls.__name__}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- )
-
@classmethod
def match(
cls,
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
quantization: str,
- ):
+ ) -> bool:
"""
Return if the model_spec can be matched.
"""
- if not cls.check_lib():
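+        # check_lib() and match_json() return True on success or an error string; only an exact True counts as available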
+ lib_result = cls.check_lib()
+ if lib_result != True:
return False
- return cls.match_json(model_family, model_spec, quantization)
+ match_result = cls.match_json(model_family, model_spec, quantization)
+ return match_result == True
@abstractmethod
def load(self):
diff --git a/xinference/model/embedding/flag/core.py b/xinference/model/embedding/flag/core.py
index a53036449e..174a860d91 100644
--- a/xinference/model/embedding/flag/core.py
+++ b/xinference/model/embedding/flag/core.py
@@ -285,8 +285,12 @@ def encode(
return result
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("FlagEmbedding") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("FlagEmbedding") is not None
+ else "FlagEmbedding library is not installed"
+ )
@classmethod
def match_json(
@@ -294,10 +298,15 @@ def match_json(
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
quantization: str,
- ) -> bool:
+ ) -> Union[bool, str]:
+ # Check library availability first
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
+
if (
model_spec.model_format in ["pytorch"]
and model_family.model_name in FLAG_EMBEDDER_MODEL_LIST
):
return True
- return False
+ return f"FlagEmbedding engine only supports pytorch format and models in FLAG_EMBEDDER_MODEL_LIST, got format: {model_spec.model_format}, model: {model_family.model_name}"
diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py
index d84434384f..a8e68f450b 100644
--- a/xinference/model/embedding/llama_cpp/core.py
+++ b/xinference/model/embedding/llama_cpp/core.py
@@ -26,7 +26,6 @@
from ....types import Embedding
from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
-from ..match_result import MatchResult
logger = logging.getLogger(__name__)
@@ -226,8 +225,12 @@ def _handle_embedding():
return Embedding(**r) # type: ignore
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("xllamacpp") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("xllamacpp") is not None
+ else "xllamacpp library is not installed"
+ )
@classmethod
def match_json(
@@ -235,52 +238,24 @@ def match_json(
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
quantization: str,
- ) -> bool:
-
- result = cls.match_with_reason(model_family, model_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls,
- model_family: EmbeddingModelFamilyV2,
- model_spec: EmbeddingSpecV1,
- quantization: str,
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability
- if not cls.check_lib():
- return MatchResult.failure(
- reason="llama.cpp library (xllamacpp) is not installed for embedding",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="xllamacpp package not found in Python environment",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
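+            # propagate the error string from check_lib so callers can surface the reason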
+ return lib_result
# Check model format compatibility
if model_spec.model_format not in ["ggufv2"]:
- return MatchResult.failure(
- reason=f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {model_spec.model_format}, required: ggufv2",
- )
+ return f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}"
# Check embedding-specific requirements
if not hasattr(model_spec, "model_file_name_template"):
- return MatchResult.failure(
- reason="GGUF embedding model requires proper file configuration",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details="Missing model_file_name_template for GGUF embedding",
- )
+ return "GGUF embedding model requires proper file configuration (missing model_file_name_template)"
# Check model dimensions for llama.cpp compatibility
model_dimensions = model_family.dimensions
if model_dimensions > 4096: # llama.cpp may have limitations
- return MatchResult.failure(
- reason=f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Large embedding dimensions: {model_dimensions}",
- )
+ return f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)"
# Check platform-specific considerations
import platform
@@ -289,10 +264,6 @@ def match_with_reason(
# llama.cpp works across platforms but may have performance differences
if current_platform == "Windows":
- return MatchResult.failure(
- reason="llama.cpp embedding may have limited performance on Windows",
- error_type=ErrorType.OS_REQUIREMENT,
- technical_details=f"Windows platform: {current_platform}",
- )
+ return "llama.cpp embedding may have limited performance on Windows"
- return MatchResult.success()
+ return True
diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py
index c1789f9912..4e1c7b8b73 100644
--- a/xinference/model/embedding/sentence_transformers/core.py
+++ b/xinference/model/embedding/sentence_transformers/core.py
@@ -22,7 +22,6 @@
from ....types import Embedding, EmbeddingData, EmbeddingUsage
from ...utils import is_flash_attn_available
from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
-from ..match_result import MatchResult
logger = logging.getLogger(__name__)
SENTENCE_TRANSFORMER_MODEL_LIST: List[str] = []
@@ -425,8 +424,12 @@ def base64_to_image(base64_str: str) -> Image.Image:
return result
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("sentence_transformers") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("sentence_transformers") is not None
+ else "sentence_transformers library is not installed"
+ )
@classmethod
def match_json(
@@ -434,53 +437,25 @@ def match_json(
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
quantization: str,
- ) -> bool:
-
- result = cls.match_with_reason(model_family, model_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls,
- model_family: EmbeddingModelFamilyV2,
- model_spec: EmbeddingSpecV1,
- quantization: str,
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability
- if not cls.check_lib():
- return MatchResult.failure(
- reason="Sentence Transformers library is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="sentence_transformers package not found in Python environment",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility
if model_spec.model_format not in ["pytorch"]:
- return MatchResult.failure(
- reason=f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch",
- )
+ return f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}"
# Check model dimensions compatibility
model_dimensions = model_family.dimensions
if model_dimensions > 1536: # Very large embedding models
- return MatchResult.failure(
- reason=f"Large embedding model detected ({model_dimensions} dimensions)",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Large embedding dimensions: {model_dimensions}",
- )
+ return f"Large embedding model detected ({model_dimensions} dimensions), may have performance issues"
# Check token limits
max_tokens = model_family.max_tokens
if max_tokens > 8192: # Very high token limits
- return MatchResult.failure(
- reason=f"High token limit model detected (max_tokens: {max_tokens})",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details=f"High max_tokens: {max_tokens}",
- )
+ return f"High token limit model detected (max_tokens: {max_tokens}), may cause memory issues"
# Check for special model requirements
model_name = model_family.model_name.lower()
@@ -489,23 +464,16 @@ def match_with_reason(
if "gte" in model_name and "qwen2" in model_name:
# These models have specific requirements
if not hasattr(cls, "_check_qwen_gte_requirements"):
- return MatchResult.failure(
- reason="Qwen2 GTE models require special handling",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details="Qwen2 GTE model special requirements",
- )
+ return "Qwen2 GTE models require special handling"
# Check Qwen3 models
if "qwen3" in model_name:
- # Qwen3 has flash attention requirements
+ # Qwen3 has flash attention requirements - basic check
try:
- # This would be checked during actual loading
pass
+
+ # This would be checked during actual loading
except Exception:
- return MatchResult.failure(
- reason="Qwen3 embedding model may have compatibility issues",
- error_type=ErrorType.VERSION_REQUIREMENT,
- technical_details="Qwen3 model compatibility check",
- )
+ return "Qwen3 embedding model may have compatibility issues"
- return MatchResult.success()
+ return True
diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py
index 8905d36297..8fc32ebac8 100644
--- a/xinference/model/embedding/vllm/core.py
+++ b/xinference/model/embedding/vllm/core.py
@@ -149,8 +149,12 @@ def create_embedding(
return result
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("vllm") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("vllm") is not None
+ else "vllm library is not installed"
+ )
@classmethod
def match_json(
@@ -158,12 +162,17 @@ def match_json(
model_family: EmbeddingModelFamilyV2,
model_spec: EmbeddingSpecV1,
quantization: str,
- ) -> bool:
+ ) -> Union[bool, str]:
+ # Check library availability first
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
+
if model_spec.model_format in ["pytorch"]:
prefix = model_family.model_name.split("-", 1)[0]
if prefix in SUPPORTED_MODELS_PREFIXES:
return True
- return False
+ return f"VLLM Embedding engine only supports pytorch format models with supported prefixes, got format: {model_spec.model_format}, model: {model_family.model_name}"
def wait_for_load(self):
# set context length after engine inited
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
index 3020483219..5942a42879 100644
--- a/xinference/model/llm/core.py
+++ b/xinference/model/llm/core.py
@@ -31,7 +31,6 @@
if TYPE_CHECKING:
from .llm_family import LLMFamilyV2, LLMSpecV1
- from .match_result import MatchResult
logger = logging.getLogger(__name__)
@@ -71,7 +70,7 @@ def __init__(
@classmethod
@abstractmethod
- def check_lib(cls) -> bool:
+ def check_lib(cls) -> Union[bool, str]:
raise NotImplementedError
@staticmethod
@@ -149,54 +148,19 @@ def load(self):
def match(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> bool:
- if not cls.check_lib():
+ lib_result = cls.check_lib()
+ if lib_result != True:
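+            # a non-True result is an error string explaining why the engine is unavailable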
return False
- return cls.match_json(llm_family, llm_spec, quantization)
+ match_result = cls.match_json(llm_family, llm_spec, quantization)
+ return match_result == True
@classmethod
@abstractmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
+ ) -> Union[bool, str]:
raise NotImplementedError
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- """
- Check if the engine can handle the given model with detailed error information.
-
- This method provides detailed failure reasons and suggestions when an engine
- cannot handle a specific model configuration. The default implementation
- falls back to the boolean match_json method for backward compatibility.
-
- Args:
- llm_family: The model family information
- llm_spec: The model specification
- quantization: The quantization method
-
- Returns:
- MatchResult: Detailed match result with reasons and suggestions
- """
- from .match_result import ErrorType, MatchResult
-
- # Default implementation for backward compatibility
- if cls.match_json(llm_family, llm_spec, quantization):
- return MatchResult.success()
- else:
- # Get basic reason based on common failure patterns
- if not cls.check_lib():
- return MatchResult.failure(
- reason=f"Required library for {cls.__name__} is not available",
- error_type=ErrorType.DEPENDENCY_MISSING,
- )
- else:
- return MatchResult.failure(
- reason=f"Model configuration is not compatible with {cls.__name__}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- )
-
def prepare_parse_reasoning_content(
self, reasoning_content: bool, enable_thinking: bool = True
):
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index e8ff96f83b..5d379e642d 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -25,7 +25,6 @@
from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
from ..core import LLM, chat_context_var
from ..llm_family import LLMFamilyV2, LLMSpecV1
-from ..match_result import MatchResult
from ..utils import ChatModelMixin
logger = logging.getLogger(__name__)
@@ -80,73 +79,34 @@ def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
return llamacpp_model_config
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("xllamacpp") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("xllamacpp") is not None
+ else "xllamacpp library is not installed"
+ )
@classmethod
def match_json(
cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability
- if not cls.check_lib():
- return MatchResult.failure(
- reason="llama.cpp library (xllamacpp) is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="xllamacpp package not found in Python environment",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility
if llm_spec.model_format not in ["ggufv2"]:
- return MatchResult.failure(
- reason=f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {llm_spec.model_format}, required: ggufv2",
+ return (
+ f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}"
)
- # Check model abilities - llama.cpp supports both chat and generation
- if (
- "chat" not in llm_family.model_ability
- and "generate" not in llm_family.model_ability
- ):
- return MatchResult.failure(
- reason=f"llama.cpp requires 'chat' or 'generate' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # Check platform-specific issues
- import platform
-
- current_platform = platform.system()
-
- # Check for ARM64 specific issues
- if current_platform == "Darwin" and platform.machine() == "arm64":
- # Apple Silicon specific checks could go here
- pass
- elif current_platform == "Windows":
- # Windows specific checks could go here
- pass
-
# Check memory requirements (basic heuristic)
model_size = float(str(llm_spec.model_size_in_billions))
if model_size > 70: # Very large models
- return MatchResult.failure(
- reason=f"llama.cpp may struggle with very large models ({model_size}B parameters)",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Large model size: {model_size}B parameters",
- )
+ return f"llama.cpp may struggle with very large models ({model_size}B parameters)"
- return MatchResult.success()
+ return True
def load(self):
try:
diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py
index 90115dec06..9689c3ddce 100644
--- a/xinference/model/llm/lmdeploy/core.py
+++ b/xinference/model/llm/lmdeploy/core.py
@@ -21,7 +21,6 @@
from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA
from ..core import LLM
from ..llm_family import LLMFamilyV2, LLMSpecV1
-from ..match_result import MatchResult
from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk
logger = logging.getLogger(__name__)
@@ -115,28 +114,18 @@ def load(self):
raise ValueError("LMDEPLOY engine has not supported generate yet.")
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("lmdeploy") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("lmdeploy") is not None
+ else "lmdeploy library is not installed"
+ )
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
- return MatchResult.failure(
- reason="LMDeploy base model does not support direct inference",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details="LMDeploy base model class is not intended for direct use",
- )
+ ) -> Union[bool, str]:
+ return "LMDeploy base model does not support direct inference, use specific LMDeploy model classes"
def generate(
self,
@@ -188,52 +177,23 @@ def load(self):
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability first
- if not LMDEPLOY_INSTALLED:
- return MatchResult.failure(
- reason="LMDeploy library is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="lmdeploy package not found in Python environment",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility and quantization
if llm_spec.model_format == "awq":
# LMDeploy has specific AWQ quantization requirements
if "4" not in quantization:
- return MatchResult.failure(
- reason=f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"AWQ + {quantization} not supported by LMDeploy",
- )
+ return f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}"
# Check model compatibility
if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS:
- return MatchResult.failure(
- reason=f"Chat model not supported by LMDeploy: {llm_family.model_name}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported chat model: {llm_family.model_name}",
- )
-
- # Check model abilities - LMDeploy primarily supports chat models
- if "chat" not in llm_family.model_ability:
- return MatchResult.failure(
- reason=f"LMDeploy Chat requires 'chat' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
+ return f"Chat model not supported by LMDeploy: {llm_family.model_name}"
- return MatchResult.success()
+ return True
async def async_chat(
self,
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index ff6b2e51ea..ab8f1608db 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -18,7 +18,6 @@
import importlib.util
import logging
import pathlib
-import platform
import sys
import threading
import time
@@ -51,7 +50,6 @@
)
from ..core import LLM, chat_context_var
from ..llm_family import LLMFamilyV2, LLMSpecV1
-from ..match_result import MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
QWEN_TOOL_CALL_FAMILY,
@@ -405,73 +403,32 @@ def wait_for_load(self):
self._context_length = get_context_length(config)
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("mlx_lm") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("mlx_lm") is not None
+ else "mlx_lm library is not installed"
+ )
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
- # Check platform compatibility first - MLX only works on Apple Silicon
- if sys.platform != "darwin" or platform.processor() != "arm":
- return MatchResult.failure(
- reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)",
- error_type=ErrorType.OS_REQUIREMENT,
- technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm",
- )
-
- # Check library availability (only if platform is compatible)
- if not cls.check_lib():
- return MatchResult.failure(
- reason="MLX library (mlx_lm) is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="mlx_lm package not found in Python environment",
- )
+ ) -> Union[bool, str]:
+ # Check library availability first
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility
if llm_spec.model_format not in ["mlx"]:
- return MatchResult.failure(
- reason=f"MLX engine only supports MLX format, got: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx",
- )
-
- # Check model abilities - MLX supports generation but not chat/vision in this base class
- if "generate" not in llm_family.model_ability:
- return MatchResult.failure(
- reason=f"MLX engine requires 'generate' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # MLX base model doesn't support chat or vision
- if "chat" in llm_family.model_ability or "vision" in llm_family.model_ability:
- return MatchResult.failure(
- reason="MLX base model does not support chat or vision abilities",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Unsupported abilities for base MLX: {[a for a in llm_family.model_ability if a in ['chat', 'vision']]}",
- )
+ return f"MLX engine only supports MLX format, got: {llm_spec.model_format}"
# Check memory constraints for Apple Silicon
model_size = float(str(llm_spec.model_size_in_billions))
if model_size > 70: # Large models may be problematic
- return MatchResult.failure(
- reason=f"MLX may have memory limitations with very large models ({model_size}B parameters)",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Large model size: {model_size}B on Apple Silicon",
- )
+ return f"MLX may have memory limitations with very large models ({model_size}B parameters)"
- return MatchResult.success()
+ return True
def _get_prompt_cache(
self, prompt, lora_name: Optional[str] = None, model: Any = None
@@ -771,39 +728,13 @@ def _sanitize_generate_config(
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
- # Use base class validation first
- base_result = super().match_with_reason(llm_family, llm_spec, quantization)
- if not base_result.is_match:
+ ) -> Union[bool, str]:
+ # First run base class checks
+ base_result = super().match_json(llm_family, llm_spec, quantization)
+ if base_result != True:
return base_result
- # Check chat ability
- if "chat" not in llm_family.model_ability:
- return MatchResult.failure(
- reason=f"MLX Chat requires 'chat' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # MLX Chat doesn't support vision
- if "vision" in llm_family.model_ability:
- return MatchResult.failure(
- reason="MLX Chat model does not support vision abilities",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Vision ability not supported in MLXChatModel",
- )
-
- return MatchResult.success()
+ return True
def chat(
self,
@@ -850,59 +781,27 @@ def chat(
class MLXVisionModel(MLXModel, ChatModelMixin):
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("mlx_vlm") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("mlx_vlm") is not None
+ else "mlx_vlm library is not installed"
+ )
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
- # Check platform compatibility first - MLX only works on Apple Silicon
- if sys.platform != "darwin" or platform.processor() != "arm":
- return MatchResult.failure(
- reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)",
- error_type=ErrorType.OS_REQUIREMENT,
- technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm",
- )
-
- # Check library availability (only if platform is compatible) - MLX Vision uses mlx_vlm
- if not cls.check_lib():
- return MatchResult.failure(
- reason="MLX Vision library (mlx_vlm) is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="mlx_vlm package not found in Python environment",
- )
+ ) -> Union[bool, str]:
+ # Check library availability first - MLX Vision uses mlx_vlm
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility
if llm_spec.model_format not in ["mlx"]:
- return MatchResult.failure(
- reason=f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx",
- )
-
- # Check vision ability
- if "vision" not in llm_family.model_ability:
- return MatchResult.failure(
- reason=f"MLX Vision requires 'vision' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # Check for distributed inference limitations
- # MLX Vision models don't support distributed inference
- # This could be checked here if needed
+ return f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}"
- return MatchResult.success()
+ return True
def _load_model(self, **kwargs):
try:
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index d22a157777..ccb44c00bd 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -15,7 +15,6 @@
import json
import logging
import multiprocessing
-import platform
import sys
import threading
import time
@@ -37,7 +36,6 @@
from .. import LLM, LLMFamilyV2, LLMSpecV1
from ..core import chat_context_var
from ..llm_family import CustomLLMFamilyV2
-from ..match_result import MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
QWEN_TOOL_CALL_FAMILY,
@@ -336,110 +334,130 @@ def _sanitize_generate_config(
return generate_config
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("sglang") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ # Check CUDA first - this is the most important requirement
+ try:
+ import torch
- @classmethod
- def match_json(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
+ if not torch.cuda.is_available():
+ return "SGLang requires CUDA support but no CUDA devices detected"
+ except ImportError:
+ return "SGLang requires PyTorch with CUDA support"
+
+ if importlib.util.find_spec("sglang") is None:
+ return "sglang library is not installed"
+
+        try:
+            import sglang
+
+            if not getattr(sglang, "__version__", None):
+ return "SGLang version information is not available"
+
+ # Check version - SGLang requires recent version
+ from packaging import version
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
+ if version.parse(sglang.__version__) < version.parse("0.1.0"):
+ return f"SGLang version {sglang.__version__} is too old, minimum required is 0.1.0"
+
+ return True
+ except Exception as e:
+ return f"Error checking SGLang library: {str(e)}"
@classmethod
- def match_with_reason(
+ def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability first
- if not SGLANG_INSTALLED:
- return MatchResult.failure(
- reason="SGLang library is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="sglang package not found in Python environment",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
- # Check hardware requirements - SGLang requires CUDA
- if not cls._has_cuda_device():
- return MatchResult.failure(
- reason="SGLang requires CUDA GPU support",
- error_type=ErrorType.HARDWARE_REQUIREMENT,
- technical_details="No CUDA devices detected",
- )
+ # Check GPU requirements
+ try:
+ import torch
- # Check OS requirements
- if not cls._is_linux():
- return MatchResult.failure(
- reason="SGLang only supports Linux operating system",
- error_type=ErrorType.OS_REQUIREMENT,
- technical_details=f"Current OS: {platform.system()}, required: Linux",
- )
+ if torch.cuda.device_count() == 0:
+ return "SGLang requires CUDA support but no CUDA devices detected"
+ except ImportError:
+ return "SGLang requires PyTorch with CUDA support"
# Check model format compatibility
supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
if llm_spec.model_format not in supported_formats:
- return MatchResult.failure(
- reason=f"SGLang does not support model format: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {llm_spec.model_format}",
- )
+ return f"SGLang does not support model format: {llm_spec.model_format}, supported formats: {', '.join(supported_formats)}"
# Check quantization compatibility with format
if llm_spec.model_format == "pytorch":
if quantization != "none" and quantization is not None:
- return MatchResult.failure(
- reason=f"SGLang pytorch format does not support quantization: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"pytorch + {quantization} combination not supported",
- )
+ return f"SGLang pytorch format does not support quantization: {quantization}"
+
+ # Check model compatibility with more flexible matching
+ def is_model_supported(model_name: str, supported_list: List[str]) -> bool:
+ """Check if model is supported with flexible matching."""
+ # Direct match
+ if model_name in supported_list:
+ return True
+
+ # Partial matching for models with variants (e.g., qwen3 variants)
+ for supported in supported_list:
+ if model_name.startswith(
+ supported.lower()
+ ) or supported.lower().startswith(model_name):
+ return True
+
+ # Family-based matching for common patterns
+ model_lower = model_name.lower()
+ if any(
+ family in model_lower
+ for family in [
+ "qwen3",
+ "llama",
+ "mistral",
+ "mixtral",
+ "qwen2",
+ "qwen2.5",
+ "deepseek",
+ "yi",
+ "baichuan",
+ ]
+ ):
+ # Check if there's a corresponding supported model with same family
+ for supported in supported_list:
+ if any(
+ family in supported.lower()
+ for family in [
+ "qwen3",
+ "llama",
+ "mistral",
+ "mixtral",
+ "qwen2",
+ "qwen2.5",
+ "deepseek",
+ "yi",
+ "baichuan",
+ ]
+ ):
+ return True
+
+ return False
- # Check model compatibility
if isinstance(llm_family, CustomLLMFamilyV2):
- if llm_family.model_family not in SGLANG_SUPPORTED_MODELS:
- return MatchResult.failure(
- reason=f"Custom model family not supported by SGLang: {llm_family.model_family}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Custom family: {llm_family.model_family}",
+ if not llm_family.model_family or not is_model_supported(
+ llm_family.model_family.lower(), SGLANG_SUPPORTED_MODELS
+ ):
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Custom model family may not be fully supported by SGLang: {llm_family.model_family}"
)
else:
- if llm_family.model_name not in SGLANG_SUPPORTED_MODELS:
- return MatchResult.failure(
- reason=f"Model not supported by SGLang: {llm_family.model_name}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported model: {llm_family.model_name}",
+ if not is_model_supported(
+ llm_family.model_name.lower(),
+ [s.lower() for s in SGLANG_SUPPORTED_MODELS],
+ ):
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Model may not be fully supported by SGLang: {llm_family.model_name}"
)
- # Check model abilities with flexible logic
- # SGLang can handle models with various text generation capabilities
- has_text_capability = (
- "generate" in llm_family.model_ability
- or "chat" in llm_family.model_ability
- or "reasoning" in llm_family.model_ability
- or "tools" in llm_family.model_ability
- )
-
- if not has_text_capability:
- return MatchResult.failure(
- reason=f"SGLang requires text generation capabilities, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # SGLang is primarily designed for text models, not specialized models
- specialized_abilities = ["embedding", "rerank", "audio", "vision"]
- has_specialized = any(
- ability in llm_family.model_ability for ability in specialized_abilities
- )
- if has_specialized:
- return MatchResult.failure(
- reason=f"SGLang is designed for text models, this model has specialized abilities: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Specialized abilities: {[a for a in llm_family.model_ability if a in specialized_abilities]}",
- )
-
- return MatchResult.success()
+ return True
@staticmethod
def _convert_state_to_completion_chunk(
@@ -727,65 +745,76 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
- # Use base class validation first
- base_result = super().match_with_reason(llm_family, llm_spec, quantization)
- if not base_result.is_match:
+ ) -> Union[bool, str]:
+ # First run base class checks
+ base_result = super().match_json(llm_family, llm_spec, quantization)
+ if base_result != True:
return base_result
# Check model format compatibility (same as base)
supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
if llm_spec.model_format not in supported_formats:
- return MatchResult.failure(
- reason=f"SGLang Chat does not support model format: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Chat model unsupported format: {llm_spec.model_format}",
- )
+ return f"SGLang Chat does not support model format: {llm_spec.model_format}"
# Check quantization compatibility with format
if llm_spec.model_format == "pytorch":
if quantization != "none" and quantization is not None:
- return MatchResult.failure(
- reason=f"SGLang Chat pytorch format does not support quantization: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"Chat pytorch + {quantization} not supported",
- )
+ return f"SGLang Chat pytorch format does not support quantization: {quantization}"
+
+ # Check chat model compatibility with more flexible matching
+ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool:
+ """Check if chat model is supported with flexible matching."""
+ # Direct match
+ if model_name in supported_list:
+ return True
+
+ # Partial matching for models with variants
+ for supported in supported_list:
+ if model_name.startswith(
+ supported.lower()
+ ) or supported.lower().startswith(model_name):
+ return True
+
+ # Family-based matching for common chat patterns
+ model_lower = model_name.lower()
+ if any(suffix in model_lower for suffix in ["chat", "instruct", "coder"]):
+ if any(
+ family in model_lower
+ for family in [
+ "qwen3",
+ "llama",
+ "mistral",
+ "mixtral",
+ "qwen2",
+ "qwen2.5",
+ "deepseek",
+ "yi",
+ "baichuan",
+ ]
+ ):
+ return True
+
+ return False
- # Check chat model compatibility
if isinstance(llm_family, CustomLLMFamilyV2):
- if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS:
- return MatchResult.failure(
- reason=f"Custom chat model not supported by SGLang: {llm_family.model_family}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Custom chat family: {llm_family.model_family}",
+ if not is_chat_model_supported(
+ llm_family.model_family.lower(), SGLANG_SUPPORTED_CHAT_MODELS
+ ):
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Custom chat model may not be fully supported by SGLang: {llm_family.model_family}"
)
else:
- if llm_family.model_name not in SGLANG_SUPPORTED_CHAT_MODELS:
- return MatchResult.failure(
- reason=f"Chat model not supported by SGLang: {llm_family.model_name}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported chat model: {llm_family.model_name}",
+ if not is_chat_model_supported(
+ llm_family.model_name.lower(),
+ [s.lower() for s in SGLANG_SUPPORTED_CHAT_MODELS],
+ ):
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Chat model may not be fully supported by SGLang: {llm_family.model_name}"
)
- # Check chat ability
- if "chat" not in llm_family.model_ability:
- return MatchResult.failure(
- reason=f"SGLang Chat requires 'chat' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- return MatchResult.success()
+ return True
def _sanitize_chat_config(
self,
@@ -858,65 +887,81 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
- # Use base class validation first
- base_result = super().match_with_reason(llm_family, llm_spec, quantization)
- if not base_result.is_match:
+ ) -> Union[bool, str]:
+ # First run base class checks
+ base_result = super().match_json(llm_family, llm_spec, quantization)
+ if base_result != True:
return base_result
# Vision models have the same format restrictions as base SGLANG
supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
if llm_spec.model_format not in supported_formats:
- return MatchResult.failure(
- reason=f"SGLang Vision does not support model format: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Vision model unsupported format: {llm_spec.model_format}",
+ return (
+ f"SGLang Vision does not support model format: {llm_spec.model_format}"
)
# Vision models typically work with specific quantization settings
if llm_spec.model_format == "pytorch":
if quantization != "none" and quantization is not None:
- return MatchResult.failure(
- reason=f"SGLang Vision pytorch format does not support quantization: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"Vision pytorch + {quantization} not supported",
- )
+ return f"SGLang Vision pytorch format does not support quantization: {quantization}"
+
+ # Check vision model compatibility with more flexible matching
+ def is_vision_model_supported(
+ model_name: str, supported_list: List[str]
+ ) -> bool:
+ """Check if vision model is supported with flexible matching."""
+ # Direct match
+ if model_name in supported_list:
+ return True
+
+ # Partial matching for models with variants
+ for supported in supported_list:
+ if model_name.startswith(
+ supported.lower()
+ ) or supported.lower().startswith(model_name):
+ return True
+
+ # Family-based matching for common vision patterns
+ model_lower = model_name.lower()
+ if any(suffix in model_lower for suffix in ["vision", "vl", "multi", "mm"]):
+ if any(
+ family in model_lower
+ for family in [
+ "qwen3",
+ "llama",
+ "mistral",
+ "mixtral",
+ "qwen2",
+ "qwen2.5",
+ "deepseek",
+ "yi",
+ "baichuan",
+ "internvl",
+ ]
+ ):
+ return True
+
+ return False
- # Check vision model compatibility
if isinstance(llm_family, CustomLLMFamilyV2):
- if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
- return MatchResult.failure(
- reason=f"Custom vision model not supported by SGLang: {llm_family.model_family}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Custom vision family: {llm_family.model_family}",
+ if not is_vision_model_supported(
+ llm_family.model_family.lower(), SGLANG_SUPPORTED_VISION_MODEL_LIST
+ ):
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Custom vision model may not be fully supported by SGLang: {llm_family.model_family}"
)
else:
- if llm_family.model_name not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
- return MatchResult.failure(
- reason=f"Vision model not supported by SGLang: {llm_family.model_name}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported vision model: {llm_family.model_name}",
+ if not is_vision_model_supported(
+ llm_family.model_name.lower(),
+ [s.lower() for s in SGLANG_SUPPORTED_VISION_MODEL_LIST],
+ ):
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Vision model may not be fully supported by SGLang: {llm_family.model_name}"
)
- # Check vision ability
- if "vision" not in llm_family.model_ability:
- return MatchResult.failure(
- reason=f"SGLang Vision requires 'vision' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- return MatchResult.success()
+ return True
def _sanitize_chat_config(
self,
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
index 5a4a9f557d..39e963164b 100644
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -40,7 +40,6 @@
from ...utils import select_device
from ..core import LLM, chat_context_var
from ..llm_family import LLMFamilyV2, LLMSpecV1
-from ..match_result import MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
LLAMA3_TOOL_CALL_FAMILY,
@@ -494,78 +493,33 @@ def stop(self):
del self._tokenizer
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("transformers") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("transformers") is not None
+ else "transformers library is not installed"
+ )
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability
- if not cls.check_lib():
- return MatchResult.failure(
- reason="Transformers library is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="transformers or torch package not found",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility
supported_formats = ["pytorch", "gptq", "awq", "bnb"]
if llm_spec.model_format not in supported_formats:
- return MatchResult.failure(
- reason=f"Transformers does not support model format: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Transformers unsupported format: {llm_spec.model_format}",
- )
+ return f"Transformers does not support model format: {llm_spec.model_format}, supported formats: {', '.join(supported_formats)}"
# Check for models that shouldn't use Transformers by default
model_family = llm_family.model_family or llm_family.model_name
if model_family in NON_DEFAULT_MODEL_LIST:
- return MatchResult.failure(
- reason=f"Model {model_family} is not recommended for Transformers engine",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Model in NON_DEFAULT_MODEL_LIST: {model_family}",
- )
+            return f"Model {model_family} is not recommended for the Transformers engine; it prefers a specialized engine"
- # Check model abilities with flexible logic
- # Transformers can handle models with various text processing capabilities
- has_text_capability = (
- "generate" in llm_family.model_ability
- or "chat" in llm_family.model_ability
- or "reasoning" in llm_family.model_ability
- or "tools" in llm_family.model_ability
- )
-
- if not has_text_capability:
- return MatchResult.failure(
- reason=f"Transformers engine requires text processing capabilities, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # Check for highly specialized models that might not work well with generic Transformers engine
- specialized_abilities = ["embedding", "rerank", "audio", "vision"]
- has_specialized = any(
- ability in llm_family.model_ability for ability in specialized_abilities
- )
- if has_specialized and not has_text_capability:
- return MatchResult.failure(
- reason=f"Model requires specialized engine for its abilities: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Specialized abilities detected: {[a for a in llm_family.model_ability if a in specialized_abilities]}",
- )
-
- return MatchResult.success()
+ return True
def build_prefill_attention_mask(
self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
@@ -1023,8 +977,6 @@ def match_json(
model_family = llm_family.model_family or llm_family.model_name
if model_family in NON_DEFAULT_MODEL_LIST:
return False
- if "chat" not in llm_family.model_ability:
- return False
return True
async def chat(
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index bc0eede4c0..7262053a50 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -19,7 +19,6 @@
import logging
import multiprocessing
import os
-import platform
import sys
import threading
import time
@@ -56,7 +55,6 @@
from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1
from ..core import chat_context_var
from ..llm_family import CustomLLMFamilyV2, cache_model_tokenizer_and_config
-from ..match_result import ErrorType, MatchResult
from ..utils import (
DEEPSEEK_TOOL_CALL_FAMILY,
QWEN_TOOL_CALL_FAMILY,
@@ -852,111 +850,77 @@ def _sanitize_generate_config(
return sanitized
@classmethod
- def check_lib(cls) -> bool:
+ def check_lib(cls) -> Union[bool, str]:
+ # Check CUDA first - this is the most important requirement
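+        # Checking CUDA before importing vllm presumably gives a clearer error
+        # message on CPU-only machines than a failed import would.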
+ try:
+ import torch
+
+ if not torch.cuda.is_available():
+ return "vLLM requires CUDA support but no CUDA devices detected"
+ except ImportError:
+ return "vLLM requires PyTorch with CUDA support"
+
if importlib.util.find_spec("vllm") is None:
- return False
+ return "vLLM library is not installed"
try:
import vllm
if not getattr(vllm, "__version__", None):
- return False
+ return "vLLM version information is not available"
# Check version
from packaging import version
if version.parse(vllm.__version__) < version.parse("0.3.0"):
- return False
-
- # Check CUDA
- import torch
-
- if not torch.cuda.is_available():
- return False
+ return f"vLLM version {vllm.__version__} is too old, minimum required is 0.3.0"
return True
- except Exception:
- return False
+ except Exception as e:
+ return f"Error checking vLLM library: {str(e)}"
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability first
if not VLLM_INSTALLED:
- return MatchResult.failure(
- reason="vLLM library is not installed",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="vllm package not found in Python environment",
- )
+ return "vLLM library is not installed"
- # Check hardware requirements
- if not cls._has_cuda_device() and not cls._has_mlu_device():
- return MatchResult.failure(
- reason="vLLM requires CUDA or MLU accelerator support",
- error_type=ErrorType.HARDWARE_REQUIREMENT,
- technical_details="No CUDA or MLU devices detected",
- )
+ # Check GPU device count
+ try:
+ import torch
- # Check OS requirements
- if not cls._is_linux():
- return MatchResult.failure(
- reason="vLLM only supports Linux operating system",
- error_type=ErrorType.OS_REQUIREMENT,
- technical_details=f"Current OS: {platform.system()}, required: Linux",
- )
+ if torch.cuda.device_count() == 0:
+ return "vLLM requires CUDA support but no CUDA devices detected"
+ except ImportError:
+ return "vLLM requires PyTorch with CUDA support"
# Check model format
supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
if llm_spec.model_format not in supported_formats:
- return MatchResult.failure(
- reason=f"vLLM does not support model format: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {llm_spec.model_format}",
- )
+ return f"vLLM does not support model format: {llm_spec.model_format}, supported formats: {', '.join(supported_formats)}"
# Check quantization compatibility with format
if llm_spec.model_format == "pytorch":
if quantization != "none" and quantization is not None:
- return MatchResult.failure(
- reason=f"vLLM pytorch format does not support quantization: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"pytorch + {quantization} combination not supported",
+ return (
+ f"vLLM pytorch format does not support quantization: {quantization}"
)
if llm_spec.model_format == "awq":
if "4" not in quantization:
- return MatchResult.failure(
- reason=f"vLLM AWQ format requires 4-bit quantization, got: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"AWQ + {quantization} not supported, only 4-bit",
+ return (
+ f"vLLM AWQ format requires 4-bit quantization, got: {quantization}"
)
if llm_spec.model_format == "gptq":
if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
if not any(q in quantization for q in ("3", "4", "8")):
- return MatchResult.failure(
- reason=f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"GPTQ + {quantization} not supported with vLLM >= 0.3.3",
- )
+ return f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}"
else:
if "4" not in quantization:
- return MatchResult.failure(
- reason=f"Older vLLM version only supports 4-bit GPTQ, got: {quantization}",
- error_type=ErrorType.VERSION_REQUIREMENT,
- technical_details=f"GPTQ + {quantization} requires vLLM >= 0.3.3",
- )
+ return f"Older vLLM version only supports 4-bit GPTQ, got: {quantization} (requires vLLM >= 0.3.3 for 3/8-bit)"
# Check model compatibility with more flexible matching
def is_model_supported(model_name: str, supported_list: List[str]) -> bool:
@@ -1006,53 +970,19 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool:
if not llm_family.model_family or not is_model_supported(
llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS
):
- return MatchResult.failure(
- reason=f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Custom family: {llm_family.model_family}",
- )
+ return f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}"
else:
if not is_model_supported(
llm_family.model_name.lower(),
[s.lower() for s in VLLM_SUPPORTED_MODELS],
):
- return MatchResult.failure(
- reason=f"Model may not be supported by vLLM: {llm_family.model_name}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported model: {llm_family.model_name}",
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Model may not be fully supported by vLLM: {llm_family.model_name}"
)
- # Check model abilities with flexible logic
- # vLLM can handle models that have text generation capabilities
- # Models with 'chat' ability usually also support 'generate'
- has_text_capability = (
- "generate" in llm_family.model_ability
- or "chat" in llm_family.model_ability
- or "reasoning" in llm_family.model_ability
- or "tools" in llm_family.model_ability
- )
-
- if not has_text_capability:
- return MatchResult.failure(
- reason=f"vLLM requires text generation capabilities, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # Additional check: ensure model doesn't have conflicting abilities
- conflicting_abilities = ["embedding", "rerank"]
- has_conflicting = any(
- ability in llm_family.model_ability for ability in conflicting_abilities
- )
- if has_conflicting:
- return MatchResult.failure(
- reason=f"Model has conflicting abilities for vLLM: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Conflicting abilities detected: {[a for a in llm_family.model_ability if a in conflicting_abilities]}",
- )
-
# All checks passed
- return MatchResult.success()
+ return True
@staticmethod
def _convert_request_output_to_completion_chunk(
@@ -1459,48 +1389,26 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
- # Use base class validation first
- base_result = super().match_with_reason(llm_family, llm_spec, quantization)
- if not base_result.is_match:
+ ) -> Union[bool, str]:
+ # First run base class checks
+ base_result = super().match_json(llm_family, llm_spec, quantization)
+ if base_result != True:
return base_result
# Chat-specific format support (includes GGUFv2 for newer vLLM)
supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb", "ggufv2"]
if llm_spec.model_format not in supported_formats:
- return MatchResult.failure(
- reason=f"vLLM Chat does not support model format: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Chat model unsupported format: {llm_spec.model_format}",
- )
+ return f"vLLM Chat does not support model format: {llm_spec.model_format}"
# GGUFv2 requires newer vLLM version
if llm_spec.model_format == "ggufv2":
if not (VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.2")):
- return MatchResult.failure(
- reason="vLLM GGUF support requires version >= 0.8.2",
- error_type=ErrorType.VERSION_REQUIREMENT,
- technical_details=f"Current vLLM: {VLLM_VERSION}, required: >=0.8.2",
- )
+ return f"vLLM GGUF support requires version >= 0.8.2, current: {VLLM_VERSION}"
# AWQ chat models support more quantization levels
if llm_spec.model_format == "awq":
if not any(q in quantization for q in ("4", "8")):
- return MatchResult.failure(
- reason=f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"Chat AWQ + {quantization} not supported",
- )
+ return f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}"
# Check chat model compatibility with flexible matching
def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool:
@@ -1554,46 +1462,18 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool:
if not llm_family.model_family or not is_chat_model_supported(
llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS
):
- return MatchResult.failure(
- reason=f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Custom chat family: {llm_family.model_family}",
- )
+ return f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}"
else:
if not is_chat_model_supported(
llm_family.model_name.lower(),
[s.lower() for s in VLLM_SUPPORTED_CHAT_MODELS],
):
- return MatchResult.failure(
- reason=f"Chat model may not be supported by vLLM: {llm_family.model_name}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported chat model: {llm_family.model_name}",
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Chat model may not be fully supported by vLLM: {llm_family.model_name}"
)
- # Check chat ability with flexible logic
- # vLLM Chat should work with models that have conversation capabilities
- has_chat_capability = (
- "chat" in llm_family.model_ability
- or "generate" in llm_family.model_ability
- or "reasoning" in llm_family.model_ability
- )
-
- if not has_chat_capability:
- return MatchResult.failure(
- reason=f"vLLM Chat requires conversation capabilities, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- # Additional check: ensure model is not purely a tool model without conversation
- if set(llm_family.model_ability) == {"tools"}:
- return MatchResult.failure(
- reason=f"Model only has 'tools' capability without conversation support: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Tool-only model detected",
- )
-
- return MatchResult.success()
+ return True
def _sanitize_chat_config(
self,
@@ -1737,47 +1617,26 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin):
@classmethod
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> bool:
-
- result = cls.match_with_reason(llm_family, llm_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
- ) -> "MatchResult":
-
- # Use base class validation first
- base_result = super().match_with_reason(llm_family, llm_spec, quantization)
- if not base_result.is_match:
+ ) -> Union[bool, str]:
+ # First run base class checks
+ base_result = super().match_json(llm_family, llm_spec, quantization)
+ if base_result != True:
return base_result
# Vision models have the same format restrictions as base VLLM
supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"]
if llm_spec.model_format not in supported_formats:
- return MatchResult.failure(
- reason=f"vLLM Vision does not support model format: {llm_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Vision model unsupported format: {llm_spec.model_format}",
- )
+ return f"vLLM Vision does not support model format: {llm_spec.model_format}"
# Vision models typically work with specific quantization settings
if llm_spec.model_format == "pytorch":
if quantization != "none" and quantization is not None:
- return MatchResult.failure(
- reason=f"vLLM Vision pytorch format does not support quantization: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"Vision pytorch + {quantization} not supported",
- )
+ return f"vLLM Vision pytorch format does not support quantization: {quantization}"
# AWQ vision models support more quantization levels than base
if llm_spec.model_format == "awq":
if not any(q in quantization for q in ("4", "8")):
- return MatchResult.failure(
- reason=f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}",
- error_type=ErrorType.QUANTIZATION,
- technical_details=f"Vision AWQ + {quantization} not supported",
- )
+ return f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}"
# Check vision model compatibility with flexible matching
def is_vision_model_supported(
@@ -1815,30 +1674,17 @@ def is_vision_model_supported(
if not llm_family.model_family or not is_vision_model_supported(
llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST
):
- return MatchResult.failure(
- reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Custom vision family: {llm_family.model_family}",
- )
+ return f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}"
else:
if not llm_family.model_name or not is_vision_model_supported(
llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST
):
- return MatchResult.failure(
- reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported vision model: {llm_family.model_name}",
+ # Instead of hard rejection, give a warning but allow usage
+ logger.warning(
+ f"Vision model may not be fully supported by vLLM: {llm_family.model_name}"
)
- # Check vision ability
- if "vision" not in llm_family.model_ability:
- return MatchResult.failure(
- reason=f"vLLM Vision requires 'vision' ability, model has: {llm_family.model_ability}",
- error_type=ErrorType.ABILITY_MISMATCH,
- technical_details=f"Model abilities: {llm_family.model_ability}",
- )
-
- return MatchResult.success()
+ return True
def _sanitize_model_config(
self, model_config: Optional[VLLMModelConfig]
diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py
index 2d3edde1c2..f844825d6c 100644
--- a/xinference/model/rerank/core.py
+++ b/xinference/model/rerank/core.py
@@ -15,13 +15,12 @@
import os
from abc import abstractmethod
from collections import defaultdict
-from typing import Dict, List, Literal, Optional
+from typing import Dict, List, Literal, Optional, Union
from ..._compat import BaseModel
from ...types import Rerank
from ..core import VirtualEnvSettings
from ..utils import ModelInstanceInfoMixin
-from .match_result import MatchResult
from .rerank_family import check_engine_by_model_name_and_engine, match_rerank
logger = logging.getLogger(__name__)
@@ -119,7 +118,7 @@ def __init__(
@classmethod
@abstractmethod
- def check_lib(cls) -> bool:
+ def check_lib(cls) -> Union[bool, str]:
pass
@classmethod
@@ -129,62 +128,24 @@ def match_json(
model_family: RerankModelFamilyV2,
model_spec: RerankSpecV1,
quantization: str,
- ) -> bool:
+ ) -> Union[bool, str]:
pass
- @classmethod
- def match_with_reason(
- cls,
- model_family: RerankModelFamilyV2,
- model_spec: RerankSpecV1,
- quantization: str,
- ) -> "MatchResult":
- """
- Check if the engine can handle the given rerank model with detailed error information.
-
- This method provides detailed failure reasons and suggestions when an engine
- cannot handle a specific model configuration. The default implementation
- falls back to the boolean match_json method for backward compatibility.
-
- Args:
- model_family: The rerank model family information
- model_spec: The model specification
- quantization: The quantization method
-
- Returns:
- MatchResult: Detailed match result with reasons and suggestions
- """
- from .match_result import ErrorType, MatchResult
-
- # Default implementation for backward compatibility
- if cls.match_json(model_family, model_spec, quantization):
- return MatchResult.success()
- else:
- # Get basic reason based on common failure patterns
- if not cls.check_lib():
- return MatchResult.failure(
- reason=f"Required library for {cls.__name__} is not available",
- error_type=ErrorType.DEPENDENCY_MISSING,
- )
- else:
- return MatchResult.failure(
- reason=f"Rerank model configuration is not compatible with {cls.__name__}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- )
-
@classmethod
def match(
cls,
model_family: RerankModelFamilyV2,
model_spec: RerankSpecV1,
quantization: str,
- ):
+ ) -> bool:
"""
Return if the model_spec can be matched.
"""
- if not cls.check_lib():
+ lib_result = cls.check_lib()
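+        # check_lib() and match_json() may return an error string instead of a
+        # bool; anything other than True is treated as "no match" here.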
+ if lib_result != True:
return False
- return cls.match_json(model_family, model_spec, quantization)
+ match_result = cls.match_json(model_family, model_spec, quantization)
+ return match_result == True
@staticmethod
def _get_tokenizer(model_path):
diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py
index 42332bc477..eddc58ac06 100644
--- a/xinference/model/rerank/sentence_transformers/core.py
+++ b/xinference/model/rerank/sentence_transformers/core.py
@@ -16,7 +16,7 @@
import logging
import threading
import uuid
-from typing import List, Optional, Sequence
+from typing import List, Optional, Sequence, Union
import numpy as np
import torch
@@ -31,7 +31,6 @@
RerankModelFamilyV2,
RerankSpecV1,
)
-from ..match_result import MatchResult
from ..utils import preprocess_sentence
logger = logging.getLogger(__name__)
@@ -332,8 +331,12 @@ def format_instruction(instruction, query, doc):
return Rerank(id=str(uuid.uuid1()), results=docs, meta=metadata)
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("sentence_transformers") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("sentence_transformers") is not None
+ else "sentence_transformers library is not installed"
+ )
@classmethod
def match_json(
@@ -341,44 +344,19 @@ def match_json(
model_family: RerankModelFamilyV2,
model_spec: RerankSpecV1,
quantization: str,
- ) -> bool:
- pass
-
- result = cls.match_with_reason(model_family, model_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls,
- model_family: RerankModelFamilyV2,
- model_spec: RerankSpecV1,
- quantization: str,
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability
- if not cls.check_lib():
- return MatchResult.failure(
- reason="Sentence Transformers library is not installed for reranking",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="sentence_transformers package not found in Python environment",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility
if model_spec.model_format not in ["pytorch"]:
- return MatchResult.failure(
- reason=f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch",
- )
+ return f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}"
# Check rerank-specific requirements
if not hasattr(model_family, "model_name"):
- return MatchResult.failure(
- reason="Rerank model family requires model name specification",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details="Missing model_name in rerank model family",
- )
+ return "Rerank model family requires model name specification"
# Check model type compatibility
if model_family.type and model_family.type not in [
@@ -389,27 +367,15 @@ def match_with_reason(
"LLM-based",
"LLM-based layerwise",
]:
- return MatchResult.failure(
- reason=f"Model type '{model_family.type}' may not be compatible with reranking engines",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Model type: {model_family.type}",
- )
+ return f"Model type '{model_family.type}' may not be compatible with reranking engines"
# Check max tokens limit for reranking performance
max_tokens = model_family.max_tokens
if max_tokens and max_tokens > 8192: # High token limits for reranking
- return MatchResult.failure(
- reason=f"High max_tokens limit for reranking model: {max_tokens}",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details=f"High max_tokens for reranking: {max_tokens}",
- )
+ return f"High max_tokens limit for reranking model: {max_tokens}, may cause performance issues"
# Check language compatibility
if not model_family.language or len(model_family.language) == 0:
- return MatchResult.failure(
- reason="Rerank model language information is missing",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details="Missing language information in rerank model",
- )
+ return "Rerank model language information is missing"
- return MatchResult.success()
+ return True
diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index c2ee75cfef..4f63c0136c 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -1,11 +1,10 @@
import importlib.util
import uuid
-from typing import List, Optional
+from typing import List, Optional, Union
from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
from ...utils import cache_clean
from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
-from ..match_result import MatchResult
SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
@@ -140,8 +139,12 @@ def rerank(
return Rerank(id=str(uuid.uuid4()), results=reranked_docs, meta=metadata)
@classmethod
- def check_lib(cls) -> bool:
- return importlib.util.find_spec("vllm") is not None
+ def check_lib(cls) -> Union[bool, str]:
+ return (
+ True
+ if importlib.util.find_spec("vllm") is not None
+ else "vllm library is not installed"
+ )
@classmethod
def match_json(
@@ -149,35 +152,15 @@ def match_json(
model_family: RerankModelFamilyV2,
model_spec: RerankSpecV1,
quantization: str,
- ) -> bool:
-
- result = cls.match_with_reason(model_family, model_spec, quantization)
- return result.is_match
-
- @classmethod
- def match_with_reason(
- cls,
- model_family: RerankModelFamilyV2,
- model_spec: RerankSpecV1,
- quantization: str,
- ) -> "MatchResult":
- from ..match_result import ErrorType, MatchResult
-
+ ) -> Union[bool, str]:
# Check library availability
- if not cls.check_lib():
- return MatchResult.failure(
- reason="vLLM library is not installed for reranking",
- error_type=ErrorType.DEPENDENCY_MISSING,
- technical_details="vllm package not found in Python environment",
- )
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
# Check model format compatibility
if model_spec.model_format not in ["pytorch"]:
- return MatchResult.failure(
- reason=f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}",
- error_type=ErrorType.MODEL_FORMAT,
- technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch",
- )
+ return f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}"
# Check model name prefix matching
if model_spec.model_format == "pytorch":
@@ -187,33 +170,17 @@ def match_with_reason(
if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]:
# Special handling for Qwen3 models
if "qwen3" not in model_family.model_name.lower():
- return MatchResult.failure(
- reason=f"Model family prefix not supported by vLLM reranking: {prefix}",
- error_type=ErrorType.MODEL_COMPATIBILITY,
- technical_details=f"Unsupported prefix: {prefix}",
- )
+ return f"Model family prefix not supported by vLLM reranking: {prefix}"
except (IndexError, AttributeError):
- return MatchResult.failure(
- reason="Unable to parse model family name for vLLM compatibility check",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details=f"Model name parsing failed: {model_family.model_name}",
- )
+ return f"Unable to parse model family name for vLLM compatibility check: {model_family.model_name}"
# Check rerank-specific requirements
if not hasattr(model_family, "model_name"):
- return MatchResult.failure(
- reason="Rerank model family requires model name specification for vLLM",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details="Missing model_name in vLLM rerank model family",
- )
+ return "Rerank model family requires model name specification for vLLM"
# Check max tokens limit for vLLM reranking performance
max_tokens = model_family.max_tokens
if max_tokens and max_tokens > 4096: # vLLM has stricter limits
- return MatchResult.failure(
- reason=f"High max_tokens limit for vLLM reranking model: {max_tokens}",
- error_type=ErrorType.CONFIGURATION_ERROR,
- technical_details=f"High max_tokens for vLLM reranking: {max_tokens}",
- )
+ return f"High max_tokens limit for vLLM reranking model: {max_tokens}, may cause performance issues"
- return MatchResult.success()
+ return True
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index ea7adb309e..3442d38ea1 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -506,33 +506,59 @@ def get_engine_params_by_name(
if model_name in LLM_ENGINES and LLM_ENGINES[model_name]:
# Try to get model family for testing
try:
- from .llm.llm_family import match_llm
+ # Get the full model family instead of a single spec
+ from .llm.llm_family import BUILTIN_LLM_FAMILIES
+
+ llm_family = None
+ for family in BUILTIN_LLM_FAMILIES:
+ if model_name == family.model_name:
+ llm_family = family
+ break
- llm_family = match_llm(model_name, None, None, None, None)
if llm_family and llm_family.model_specs:
- llm_spec = llm_family.model_specs[0]
- quantization = llm_spec.quantization or "none"
# Test each engine class for detailed error info
for engine_class in llm_engine_classes:
try:
- if hasattr(engine_class, "match_with_reason"):
- pass
+ engine_compatible = False
+ error_details = None
- result = engine_class.match_with_reason(
- llm_family, llm_spec, quantization
+ # Try each model spec to find one compatible with this engine
+ for llm_spec in llm_family.model_specs:
+ quantization = (
+ llm_spec.quantization or "none"
)
- if not result.is_match:
- detailed_error = {
- "error": result.reason,
- "error_type": result.error_type,
- "technical_details": result.technical_details,
- }
- break
+
+ if hasattr(engine_class, "match_json"):
+ match_result = engine_class.match_json(
+ llm_family, llm_spec, quantization
+ )
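+                                        # match_json returns True or an error string;
+                                        # error strings are truthy, so compare against
+                                        # True itself.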
+ if match_result == True:
+ engine_compatible = True
+ break # Found compatible spec
+ else:
+ # Save error details, but continue trying other specs
+ error_details = {
+ "error": (
+ match_result
+ if isinstance(
+ match_result, str
+ )
+ else "Engine is not compatible"
+ ),
+ "error_type": "model_compatibility",
+ "technical_details": f"The {engine_class.__name__} engine cannot handle the current model configuration: {llm_spec.model_format} format",
+ }
+
+ if not engine_compatible and error_details:
+ detailed_error = error_details
+ break
except Exception as e:
# Fall back to next engine class with clear error logging
logger.warning(
- f"Engine class {engine_class.__name__} match_with_reason failed: {e}"
+ f"Engine class {engine_class.__name__} match_json failed: {e}"
)
# Continue to try next engine class, but this is expected behavior for fallback
continue
@@ -555,8 +581,15 @@ def get_engine_params_by_name(
for engine_class in llm_engine_classes:
try:
if hasattr(engine_class, "check_lib"):
- lib_available: bool = engine_class.check_lib() # type: ignore[assignment]
- if not lib_available:
+ lib_result = engine_class.check_lib()
+ if lib_result != True:
+ # If check_lib returns a string, it's an error message
+ error_msg = (
+ lib_result
+ if isinstance(lib_result, str)
+ else f"Engine {engine_name} library check failed"
+ )
+ engine_params[engine_name] = error_msg
break
else:
# If no check_lib method, try to use engine's match method for compatibility check
@@ -564,17 +597,49 @@ def get_engine_params_by_name(
try:
# Create a minimal test spec if we don't have real model specs
from .llm.llm_family import (
+ AwqLLMSpecV2,
+ GgmlLLMSpecV2,
+ GptqLLMSpecV2,
LLMFamilyV2,
+ MLXLLMSpecV2,
PytorchLLMSpecV2,
)
- # Create a minimal test case
+ # Create appropriate test spec based on engine class
+ engine_name_lower = (
+ engine_class.__name__.lower()
+ )
+ if "mlx" in engine_name_lower:
+ # MLX engines need MLX format
+ test_spec_class = MLXLLMSpecV2
+ model_format = "mlx"
+ elif (
+ "ggml" in engine_name_lower
+ or "llamacpp" in engine_name_lower
+ ):
+ # GGML/llama.cpp engines need GGML format
+ test_spec_class = GgmlLLMSpecV2
+ model_format = "ggmlv3"
+ elif "gptq" in engine_name_lower:
+ # GPTQ engines need GPTQ format
+ test_spec_class = GptqLLMSpecV2
+ model_format = "gptq"
+ elif "awq" in engine_name_lower:
+ # AWQ engines need AWQ format
+ test_spec_class = AwqLLMSpecV2
+ model_format = "awq"
+ else:
+ # Default to PyTorch format
+ test_spec_class = PytorchLLMSpecV2
+ model_format = "pytorch"
+
+ # Create a minimal test case with appropriate format
test_family = LLMFamilyV2(
model_name="test",
model_family="test",
model_specs=[
- PytorchLLMSpecV2(
- model_format="pytorch",
+ test_spec_class(
+ model_format=model_format,
quantization="none",
)
],
@@ -597,11 +662,21 @@ def get_engine_params_by_name(
break
elif hasattr(engine_class, "match_json"):
# Fallback to simple match method - use test data
- if engine_class.match_json(
+ match_result = engine_class.match_json(
test_family, test_spec, "none"
- ):
- break
+ )
+ if match_result == True:
+ break # Engine is available
else:
+ # Get detailed error information
+ error_message = (
+ match_result
+ if isinstance(match_result, str)
+ else f"Engine {engine_name} is not compatible with current model or environment"
+ )
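+                            # Store the error string under the engine name so
+                            # callers can see why this engine is unavailable.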
+ engine_params[engine_name] = (
+ error_message
+ )
break
else:
# Final fallback: generic import check
@@ -653,9 +728,7 @@ def get_engine_params_by_name(
return engine_params
elif model_type == "embedding":
- from .embedding.embed_family import (
- EMBEDDING_ENGINES,
- )
+ from .embedding.embed_family import EMBEDDING_ENGINES
from .embedding.embed_family import (
SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES,
)
@@ -716,14 +789,23 @@ def get_engine_params_by_name(
)
test_spec = test_family.model_specs[0]
- # Use the engine's match method to check compatibility
- if embedding_engine_class.match(
- test_family, test_spec, "none"
- ):
+ # Use the engine's match_json method to check compatibility and get detailed error
+ match_result = (
+ embedding_engine_class.match_json(
+ test_family, test_spec, "none"
+ )
+ )
+ if match_result == True:
break # Engine is available
else:
+ # Get detailed error information
+ error_message = (
+ match_result
+ if isinstance(match_result, str)
+ else f"Engine {engine_name} is not compatible with current model or environment"
+ )
embedding_error_details = {
- "error": f"Engine {engine_name} is not compatible with current model or environment",
+ "error": error_message,
"error_type": "model_compatibility",
"technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration",
}
@@ -789,9 +871,7 @@ def get_engine_params_by_name(
return engine_params
elif model_type == "rerank":
- from .rerank.rerank_family import (
- RERANK_ENGINES,
- )
+ from .rerank.rerank_family import RERANK_ENGINES
from .rerank.rerank_family import SUPPORTED_ENGINES as RERANK_SUPPORTED_ENGINES
if model_name not in RERANK_ENGINES:
@@ -850,14 +930,21 @@ def get_engine_params_by_name(
)
test_spec = test_family.model_specs[0]
- # Use the engine's match method to check compatibility
- if rerank_engine_class.match(
+ # Use the engine's match_json method to check compatibility and get detailed error
+ match_result = rerank_engine_class.match_json(
test_family, test_spec, "none"
- ):
+ )
+ if match_result == True:
break # Engine is available
else:
+ # Get detailed error information
+ error_message = (
+ match_result
+ if isinstance(match_result, str)
+ else f"Engine {engine_name} is not compatible with current model or environment"
+ )
rerank_error_details = {
- "error": f"Engine {engine_name} is not compatible with current model or environment",
+ "error": error_message,
"error_type": "model_compatibility",
"technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration",
}
From 26ca06f9645f0691cded28dbd2243f27a70912c1 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 29 Oct 2025 14:27:58 +0800
Subject: [PATCH 32/37] pre-commit
---
xinference/model/utils.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 3442d38ea1..12be38ec71 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -728,7 +728,9 @@ def get_engine_params_by_name(
return engine_params
elif model_type == "embedding":
- from .embedding.embed_family import EMBEDDING_ENGINES
+ from .embedding.embed_family import (
+ EMBEDDING_ENGINES,
+ )
from .embedding.embed_family import (
SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES,
)
From 48a272d2bed187982f95bbff0d5f7cc9ce517b19 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 29 Oct 2025 14:47:14 +0800
Subject: [PATCH 33/37] mypy-error
---
xinference/model/llm/sglang/core.py | 10 +++++-----
xinference/model/utils.py | 18 ++++--------------
2 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index ccb44c00bd..7d5d13d229 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -448,7 +448,7 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool:
f"Custom model family may not be fully supported by SGLang: {llm_family.model_family}"
)
else:
- if not is_model_supported(
+ if not llm_family.model_name or not is_model_supported(
llm_family.model_name.lower(),
[s.lower() for s in SGLANG_SUPPORTED_MODELS],
):
@@ -797,7 +797,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool:
return False
if isinstance(llm_family, CustomLLMFamilyV2):
- if not is_chat_model_supported(
+ if not llm_family.model_family or not is_chat_model_supported(
llm_family.model_family.lower(), SGLANG_SUPPORTED_CHAT_MODELS
):
# Instead of hard rejection, give a warning but allow usage
@@ -805,7 +805,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool:
f"Custom chat model may not be fully supported by SGLang: {llm_family.model_family}"
)
else:
- if not is_chat_model_supported(
+ if not llm_family.model_name or not is_chat_model_supported(
llm_family.model_name.lower(),
[s.lower() for s in SGLANG_SUPPORTED_CHAT_MODELS],
):
@@ -944,7 +944,7 @@ def is_vision_model_supported(
return False
if isinstance(llm_family, CustomLLMFamilyV2):
- if not is_vision_model_supported(
+ if not llm_family.model_family or not is_vision_model_supported(
llm_family.model_family.lower(), SGLANG_SUPPORTED_VISION_MODEL_LIST
):
# Instead of hard rejection, give a warning but allow usage
@@ -952,7 +952,7 @@ def is_vision_model_supported(
f"Custom vision model may not be fully supported by SGLang: {llm_family.model_family}"
)
else:
- if not is_vision_model_supported(
+ if not llm_family.model_name or not is_vision_model_supported(
llm_family.model_name.lower(),
[s.lower() for s in SGLANG_SUPPORTED_VISION_MODEL_LIST],
):
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index 12be38ec71..35f5b21fdc 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -597,9 +597,7 @@ def get_engine_params_by_name(
try:
# Create a minimal test spec if we don't have real model specs
from .llm.llm_family import (
- AwqLLMSpecV2,
- GgmlLLMSpecV2,
- GptqLLMSpecV2,
+ LlamaCppLLMSpecV2,
LLMFamilyV2,
MLXLLMSpecV2,
PytorchLLMSpecV2,
@@ -618,18 +616,10 @@ def get_engine_params_by_name(
or "llamacpp" in engine_name_lower
):
# GGML/llama.cpp engines need GGML format
- test_spec_class = GgmlLLMSpecV2
- model_format = "ggmlv3"
- elif "gptq" in engine_name_lower:
- # GPTQ engines need GPTQ format
- test_spec_class = GptqLLMSpecV2
- model_format = "gptq"
- elif "awq" in engine_name_lower:
- # AWQ engines need AWQ format
- test_spec_class = AwqLLMSpecV2
- model_format = "awq"
+ test_spec_class = LlamaCppLLMSpecV2
+ model_format = "ggufv2"
else:
- # Default to PyTorch format
+ # Default to PyTorch format (supports gptq, awq, fp8, bnb)
test_spec_class = PytorchLLMSpecV2
model_format = "pytorch"
From 0acb4711751c2d295cbeb037763407b0735aa229 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 29 Oct 2025 17:54:11 +0800
Subject: [PATCH 34/37] fix mlx CI bug
---
xinference/model/llm/mlx/core.py | 44 +++++++++++++++++++++++++++++---
1 file changed, 40 insertions(+), 4 deletions(-)
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
index ab8f1608db..b391ac97b8 100644
--- a/xinference/model/llm/mlx/core.py
+++ b/xinference/model/llm/mlx/core.py
@@ -423,6 +423,14 @@ def match_json(
if llm_spec.model_format not in ["mlx"]:
return f"MLX engine only supports MLX format, got: {llm_spec.model_format}"
+ # Base MLX model should not handle chat or vision models
+ # Those should be handled by MLXChatModel and MLXVisionModel respectively
+ model_abilities = getattr(llm_family, "model_ability", [])
+ if "chat" in model_abilities:
+ return False # Let MLXChatModel handle this
+ if "vision" in model_abilities:
+ return False # Let MLXVisionModel handle this
+
# Check memory constraints for Apple Silicon
model_size = float(str(llm_spec.model_size_in_billions))
if model_size > 70: # Large models may be problematic
@@ -729,10 +737,28 @@ def _sanitize_generate_config(
def match_json(
cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
) -> Union[bool, str]:
- # First run base class checks
- base_result = super().match_json(llm_family, llm_spec, quantization)
- if base_result != True:
- return base_result
+ # Check library availability first
+ lib_result = cls.check_lib()
+ if lib_result != True:
+ return lib_result
+
+ # Check model format compatibility
+ if llm_spec.model_format not in ["mlx"]:
+ return f"MLX Chat engine only supports MLX format, got: {llm_spec.model_format}"
+
+ # Check that this model has chat ability
+ model_abilities = getattr(llm_family, "model_ability", [])
+ if "chat" not in model_abilities:
+ return False # Not a chat model
+
+ # MLX Chat doesn't support vision
+ if "vision" in model_abilities:
+ return False # Let MLXVisionModel handle this
+
+ # Check memory constraints for Apple Silicon
+ model_size = float(str(llm_spec.model_size_in_billions))
+ if model_size > 70: # Large models may be problematic
+ return f"MLX Chat may have memory limitations with very large models ({model_size}B parameters)"
return True
@@ -801,6 +827,16 @@ def match_json(
if llm_spec.model_format not in ["mlx"]:
return f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}"
+ # Check that this model has vision ability
+ model_abilities = getattr(llm_family, "model_ability", [])
+ if "vision" not in model_abilities:
+ return False # Not a vision model
+
+ # Check memory constraints for Apple Silicon
+ model_size = float(str(llm_spec.model_size_in_billions))
+ if model_size > 70: # Large models may be problematic
+ return f"MLX Vision may have memory limitations with very large models ({model_size}B parameters)"
+
return True
def _load_model(self, **kwargs):
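The ability-based routing that the three match_json changes implement can be summarized in a small sketch; the model-class names come from the comments in the hunks, while the dispatcher function itself is hypothetical:

    # Hedged sketch of how match_json splits MLX models by declared abilities.
    def route_mlx_model(model_ability: list) -> str:
        if "vision" in model_ability:
            return "MLXVisionModel"   # vision takes priority, even for chat-capable models
        if "chat" in model_ability:
            return "MLXChatModel"
        return "MLXModel"             # generate-only models stay on the base class

    assert route_mlx_model(["generate"]) == "MLXModel"
    assert route_mlx_model(["chat"]) == "MLXChatModel"
    assert route_mlx_model(["chat", "vision"]) == "MLXVisionModel"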
From 1b973b41f50de563b97f256318ce47ca839abe3c Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Thu, 30 Oct 2025 16:19:19 +0800
Subject: [PATCH 35/37] fix CI bug
---
xinference/model/embedding/vllm/core.py | 72 +++++++++++++++++--
xinference/model/rerank/vllm/core.py | 62 ++++++++++++++--
.../model/rerank/vllm/tests/test_vllm.py | 1 +
3 files changed, 124 insertions(+), 11 deletions(-)
diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py
index 8fc32ebac8..674eeaa21e 100644
--- a/xinference/model/embedding/vllm/core.py
+++ b/xinference/model/embedding/vllm/core.py
@@ -22,7 +22,7 @@
from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
logger = logging.getLogger(__name__)
-SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
+SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "qwen3"]
class VLLMEmbeddingModel(EmbeddingModel):
@@ -32,16 +32,44 @@ def __init__(self, *args, **kwargs):
def load(self):
try:
+ # Handle vLLM-transformers config conflict by setting environment variable
+ import os
+
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache_vllm"
+
from vllm import LLM
- except ImportError:
+ except ImportError as e:
error_message = "Failed to import module 'vllm'"
installation_guide = [
"Please make sure 'vllm' is installed. ",
"You can install it by `pip install vllm`\n",
]
+ # Check if it's a config conflict error
+ if "aimv2" in str(e):
+ error_message = (
+ "vLLM has a configuration conflict with transformers library"
+ )
+ installation_guide = [
+ "This is a known issue with certain vLLM and transformers versions.",
+ "Try upgrading transformers or using a different vLLM version.\n",
+ ]
+
raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+ except Exception as e:
+ # Handle config registration conflicts
+ if "aimv2" in str(e) and "already used by a Transformers config" in str(e):
+ error_message = (
+ "vLLM has a configuration conflict with transformers library"
+ )
+ installation_guide = [
+ "This is a known issue with certain vLLM and transformers versions.",
+ "Try: pip install --upgrade transformers vllm\n",
+ ]
+ raise RuntimeError(f"{error_message}\n\n{''.join(installation_guide)}")
+ raise
+
if self.model_family.model_name in {
"Qwen3-Embedding-0.6B",
"Qwen3-Embedding-4B",
@@ -168,11 +196,41 @@ def match_json(
if lib_result != True:
return lib_result
- if model_spec.model_format in ["pytorch"]:
- prefix = model_family.model_name.split("-", 1)[0]
- if prefix in SUPPORTED_MODELS_PREFIXES:
- return True
- return f"VLLM Embedding engine only supports pytorch format models with supported prefixes, got format: {model_spec.model_format}, model: {model_family.model_name}"
+ # Check model format compatibility
+ if model_spec.model_format not in ["pytorch"]:
+ return f"VLLM Embedding engine only supports pytorch format models, got format: {model_spec.model_format}"
+
+ # Check model name prefix matching
+ prefix = model_family.model_name.split("-", 1)[0]
+ if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]:
+ return f"VLLM Embedding engine only supports models with prefixes {SUPPORTED_MODELS_PREFIXES}, got model: {model_family.model_name}"
+
+ # Additional runtime compatibility checks for vLLM version
+ try:
+ import vllm
+ from packaging.version import Version
+
+ vllm_version = Version(vllm.__version__)
+
+ # Check for vLLM version compatibility issues
+ if vllm_version >= Version("0.10.0") and vllm_version < Version("0.11.0"):
+ # vLLM 0.10.x has V1 engine issues on CPU
+ import platform
+
+ if platform.system() == "Darwin" and platform.machine() in [
+ "arm64",
+ "arm",
+ ]:
+ # Check if this is likely to run on CPU (most common for testing)
+ return f"vLLM {vllm_version} has compatibility issues with embedding models on Apple Silicon CPUs. Consider using a different platform or vLLM version."
+ elif vllm_version >= Version("0.11.0"):
+ # vLLM 0.11+ should have fixed the config conflict issue
+ pass
+ except Exception:
+ # If version check fails, continue with basic validation
+ pass
+
+ return True
def wait_for_load(self):
# set context length after engine inited
diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index 4f63c0136c..2c6d9dbeed 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -6,22 +6,49 @@
from ...utils import cache_clean
from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
-SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
+SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "qwen3"]
class VLLMRerankModel(RerankModel):
def load(self):
try:
+ # Handle vLLM-transformers config conflict by setting environment variable
+ import os
+
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache_vllm"
+
from vllm import LLM
- except ImportError:
+ except ImportError as e:
error_message = "Failed to import module 'vllm'"
installation_guide = [
"Please make sure 'vllm' is installed. ",
"You can install it by `pip install vllm`\n",
]
+ # Check if it's a config conflict error
+ if "aimv2" in str(e):
+ error_message = (
+ "vLLM has a configuration conflict with transformers library"
+ )
+ installation_guide = [
+ "This is a known issue with certain vLLM and transformers versions.",
+ "Try upgrading transformers or using a different vLLM version.\n",
+ ]
+
raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+ except Exception as e:
+ # Handle config registration conflicts
+ if "aimv2" in str(e) and "already used by a Transformers config" in str(e):
+ error_message = (
+ "vLLM has a configuration conflict with transformers library"
+ )
+ installation_guide = [
+ "This is a known issue with certain vLLM and transformers versions.",
+ "Try: pip install --upgrade transformers vllm\n",
+ ]
+ raise RuntimeError(f"{error_message}\n\n{''.join(installation_guide)}")
+ raise
if self.model_family.model_name in {
"Qwen3-Reranker-0.6B",
@@ -180,7 +207,34 @@ def match_json(
# Check max tokens limit for vLLM reranking performance
max_tokens = model_family.max_tokens
- if max_tokens and max_tokens > 4096: # vLLM has stricter limits
- return f"High max_tokens limit for vLLM reranking model: {max_tokens}, may cause performance issues"
+ if (
+ max_tokens and max_tokens > 32768
+ ): # vLLM has stricter limits, but Qwen3 can handle up to 32k
+ return f"Max tokens limit too high for vLLM reranking model: {max_tokens}, exceeds safe limit"
+
+ # Additional runtime compatibility checks for vLLM version
+ try:
+ import vllm
+ from packaging.version import Version
+
+ vllm_version = Version(vllm.__version__)
+
+ # Check for vLLM version compatibility issues
+ if vllm_version >= Version("0.10.0") and vllm_version < Version("0.11.0"):
+ # vLLM 0.10.x has V1 engine issues on CPU
+ import platform
+
+ if platform.system() == "Darwin" and platform.machine() in [
+ "arm64",
+ "arm",
+ ]:
+ # Check if this is likely to run on CPU (most common for testing)
+ return f"vLLM {vllm_version} has compatibility issues with reranking models on Apple Silicon CPUs. Consider using a different platform or vLLM version."
+ elif vllm_version >= Version("0.11.0"):
+ # vLLM 0.11+ should have fixed the config conflict issue
+ pass
+ except Exception:
+ # If version check fails, continue with basic validation
+ pass
return True
diff --git a/xinference/model/rerank/vllm/tests/test_vllm.py b/xinference/model/rerank/vllm/tests/test_vllm.py
index 37b948ac42..578b62bdd4 100644
--- a/xinference/model/rerank/vllm/tests/test_vllm.py
+++ b/xinference/model/rerank/vllm/tests/test_vllm.py
@@ -61,6 +61,7 @@ def test_qwen3_vllm(setup):
model_name="Qwen3-Reranker-0.6B",
model_type="rerank",
model_engine="vllm",
+ max_num_batched_tokens=81920, # Allow larger batch size for Qwen3
)
model = client.get_model(model_uid)
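A self-contained sketch of the version/platform gate added to both embedding and rerank match_json implementations; the return message is paraphrased and the helper name is an assumption:

    # Hedged sketch: vLLM 0.10.x embedding/rerank paths are flagged on Apple Silicon.
    import platform
    from packaging.version import Version

    def check_vllm_apple_silicon(vllm_version_str: str):
        v = Version(vllm_version_str)
        if Version("0.10.0") <= v < Version("0.11.0"):
            if platform.system() == "Darwin" and platform.machine() in ("arm64", "arm"):
                return f"vLLM {v} has compatibility issues on Apple Silicon CPUs"
        return True  # other versions/platforms pass through

    # On a Linux x86_64 CI runner this returns True for any version:
    print(check_vllm_apple_silicon("0.10.1"))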
From f52824a70484083cd68ef82341d4f4e9b87d8863 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 10 Nov 2025 16:44:45 +0800
Subject: [PATCH 36/37] relax sentence_transformers embedding limit checks
---
xinference/model/embedding/sentence_transformers/core.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py
index 4e1c7b8b73..6521358a3f 100644
--- a/xinference/model/embedding/sentence_transformers/core.py
+++ b/xinference/model/embedding/sentence_transformers/core.py
@@ -449,13 +449,13 @@ def match_json(
# Check model dimensions compatibility
model_dimensions = model_family.dimensions
- if model_dimensions > 1536: # Very large embedding models
- return f"Large embedding model detected ({model_dimensions} dimensions), may have performance issues"
+ if model_dimensions > 8192: # Extremely large embedding models
+ return f"Extremely large embedding model detected ({model_dimensions} dimensions), may have performance issues"
# Check token limits
max_tokens = model_family.max_tokens
- if max_tokens > 8192: # Very high token limits
- return f"High token limit model detected (max_tokens: {max_tokens}), may cause memory issues"
+ if max_tokens > 131072: # Extremely high token limits (128K)
+ return f"Extremely high token limit model detected (max_tokens: {max_tokens}), may cause memory issues"
# Check for special model requirements
model_name = model_family.model_name.lower()
From dd2f141d06d5716b274e41c95ac5dee7bcc64575 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 12 Nov 2025 10:44:08 +0800
Subject: [PATCH 37/37] set vLLM defaults for embedding and rerank models
---
xinference/model/embedding/vllm/core.py | 47 +++++++++++++++++++++++--
xinference/model/rerank/vllm/core.py | 40 +++++++++++++++++++++
2 files changed, 85 insertions(+), 2 deletions(-)
diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py
index 674eeaa21e..c037ce2b53 100644
--- a/xinference/model/embedding/vllm/core.py
+++ b/xinference/model/embedding/vllm/core.py
@@ -89,6 +89,34 @@ def load(self):
is_matryoshka=True,
)
+ # Set appropriate VLLM configuration parameters based on model capabilities
+ model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+ # Set max_model_len based on model family capabilities with reasonable limits
+ max_model_len = min(model_max_tokens, 8192)
+ if "max_model_len" not in self._kwargs:
+ self._kwargs["max_model_len"] = max_model_len
+
+ # Ensure max_num_batched_tokens is sufficient for large models
+ if "max_num_batched_tokens" not in self._kwargs:
+ # max_num_batched_tokens should be at least max_model_len
+ # Set to a reasonable minimum that satisfies the constraint
+ self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+ # Configure other reasonable defaults for embedding models
+ if "gpu_memory_utilization" not in self._kwargs:
+ self._kwargs["gpu_memory_utilization"] = 0.7
+
+ # Use a smaller block size for better compatibility
+ if "block_size" not in self._kwargs:
+ self._kwargs["block_size"] = 16
+
+ logger.debug(
+ f"VLLM configuration for {self.model_family.model_name}: "
+ f"max_model_len={self._kwargs.get('max_model_len')}, "
+ f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+ )
+
self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
self._tokenizer = self._model.get_tokenizer()
@@ -246,6 +274,21 @@ def _set_context_length(self):
self._model.llm_engine.vllm_config.model_config.max_model_len
)
else:
- # v1
- logger.warning("vLLM v1 is not supported, ignore context length setting")
+ # v1 - Get max_model_len from the v1 engine configuration
+ try:
+ # For v1, access the config differently
+ if hasattr(self._model.llm_engine, "vllm_config"):
+ self._context_length = (
+ self._model.llm_engine.vllm_config.model_config.max_model_len
+ )
+ elif hasattr(self._model.llm_engine, "model_config"):
+ self._context_length = (
+ self._model.llm_engine.model_config.max_model_len
+ )
+ else:
+ # Fallback to the configured value
+ self._context_length = self._kwargs.get("max_model_len", 512)
+ except Exception as e:
+ logger.warning(f"Failed to get context length from vLLM v1 engine: {e}")
+ self._context_length = self._kwargs.get("max_model_len", 512)
logger.debug("Model context length: %s", self._context_length)
diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index 2c6d9dbeed..9729a2ccc7 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -1,4 +1,6 @@
import importlib.util
+import json
+import logging
import uuid
from typing import List, Optional, Union
@@ -6,6 +8,8 @@
from ...utils import cache_clean
from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
+logger = logging.getLogger(__name__)
+
SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"]
@@ -67,6 +71,42 @@ def load(self):
classifier_from_token=["no", "yes"],
is_original_qwen3_reranker=True,
)
+ elif isinstance(self._kwargs["hf_overrides"], str):
+ self._kwargs["hf_overrides"] = json.loads(self._kwargs["hf_overrides"])
+ self._kwargs["hf_overrides"].update(
+ architectures=["Qwen3ForSequenceClassification"],
+ classifier_from_token=["no", "yes"],
+ is_original_qwen3_reranker=True,
+ )
+
+ # Set appropriate VLLM configuration parameters based on model capabilities
+ model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+ # Set max_model_len based on model family capabilities with reasonable limits
+ max_model_len = min(model_max_tokens, 8192)
+ if "max_model_len" not in self._kwargs:
+ self._kwargs["max_model_len"] = max_model_len
+
+ # Ensure max_num_batched_tokens is sufficient for large models
+ if "max_num_batched_tokens" not in self._kwargs:
+ # max_num_batched_tokens should be at least max_model_len
+ # Set to a reasonable minimum that satisfies the constraint
+ self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+ # Configure other reasonable defaults for reranking models
+ if "gpu_memory_utilization" not in self._kwargs:
+ self._kwargs["gpu_memory_utilization"] = 0.7
+
+ # Use a smaller block size for better compatibility
+ if "block_size" not in self._kwargs:
+ self._kwargs["block_size"] = 16
+
+ logger.debug(
+ f"VLLM configuration for rerank model {self.model_family.model_name}: "
+ f"max_model_len={self._kwargs.get('max_model_len')}, "
+ f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+ )
+
self._model = LLM(model=self._model_path, task="score", **self._kwargs)
self._tokenizer = self._model.get_tokenizer()
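Taken together, the defaulting logic added to both load() implementations amounts to the following derivation; this is a sketch under the assumption that max_tokens is set on the model family, and kwargs not listed keep their user-provided values:

    # Hedged sketch: derive vLLM kwargs for an embedding/rerank model family.
    def derive_vllm_kwargs(family_max_tokens: int, user_kwargs: dict) -> dict:
        kwargs = dict(user_kwargs)
        max_model_len = min(family_max_tokens, 8192)
        kwargs.setdefault("max_model_len", max_model_len)
        # max_num_batched_tokens must be at least max_model_len
        kwargs.setdefault("max_num_batched_tokens", max(4096, max_model_len))
        kwargs.setdefault("gpu_memory_utilization", 0.7)
        kwargs.setdefault("block_size", 16)
        return kwargs

    # Example: a Qwen3 reranker family with a 32k token limit is capped at 8192.
    print(derive_vllm_kwargs(32768, {}))
    # {'max_model_len': 8192, 'max_num_batched_tokens': 8192, 'gpu_memory_utilization': 0.7, 'block_size': 16}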