From daa305adeab4a1b7b1332256257c036280bcef37 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 13 Oct 2025 17:39:17 +0800 Subject: [PATCH 01/37] FEAT: add engine ability display --- xinference/model/llm/vllm/core.py | 25 ++- xinference/model/utils.py | 255 ++++++++++++++++++++++++++++-- 2 files changed, 263 insertions(+), 17 deletions(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 4da42ed48b..58b0a523aa 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -851,7 +851,30 @@ def _sanitize_generate_config( @classmethod def check_lib(cls) -> bool: - return importlib.util.find_spec("vllm") is not None + if importlib.util.find_spec("vllm") is None: + return False + + try: + import vllm + + if not getattr(vllm, "__version__", None): + return False + + # Check version + from packaging import version + + if version.parse(vllm.__version__) < version.parse("0.3.0"): + return False + + # Check CUDA + import torch + + if not torch.cuda.is_available(): + return False + + return True + except Exception: + return False @classmethod def match_json( diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ea5dec74d5..0d8e471bb0 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -14,6 +14,7 @@ import asyncio import functools +import importlib.util import json import logging import os @@ -472,44 +473,266 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str -) -> Optional[Dict[str, List[dict]]]: +) -> Optional[Dict[str, Union[List[dict], str]]]: if model_type == "LLM": - from .llm.llm_family import LLM_ENGINES + from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES if model_name not in LLM_ENGINES: return None - # filter llm_class - engine_params = deepcopy(LLM_ENGINES[model_name]) - for engine, params in engine_params.items(): + # Get all supported engines, not just currently available ones + all_supported_engines = list(SUPPORTED_ENGINES.keys()) + engine_params = {} + + # First add currently available engine parameters + available_engines = deepcopy(LLM_ENGINES[model_name]) + for engine, params in available_engines.items(): for param in params: - del param["llm_class"] + # Remove previous available attribute as available engines don't need this flag + if "available" in param: + del param["available"] + engine_params[engine] = params + + # Check unavailable engines + for engine_name in all_supported_engines: + if engine_name not in engine_params: # Engine not in available list + try: + engine_classes = SUPPORTED_ENGINES[engine_name] + error_msg = None + + # Try to find specific error reasons + for engine_class in engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available = engine_class.check_lib() + if not lib_available: + error_msg = ( + f"Engine {engine_name} library is not available" + ) + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name == "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + + importlib.import_module(module_name) + break + except ImportError as e: + error_msg = f"Engine {engine_name} library is 
not installed: {str(e)}" + except Exception as e: + error_msg = ( + f"Engine {engine_name} is not available: {str(e)}" + ) + + if error_msg is None: + error_msg = f"Engine {engine_name} is not compatible with current model or environment" + + # For unavailable engines, directly return error message string + engine_params[engine_name] = error_msg + + except Exception as e: + # If exception occurs during checking, return error message string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) + + # Filter out llm_class field + for engine, params in engine_params.items(): + if isinstance( + params, list + ): # Only process parameter lists of available engines + for param in params: + if "llm_class" in param: + del param["llm_class"] return engine_params elif model_type == "embedding": - from .embedding.embed_family import EMBEDDING_ENGINES + from .embedding.embed_family import ( + EMBEDDING_ENGINES, + ) + from .embedding.embed_family import ( + SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES, + ) if model_name not in EMBEDDING_ENGINES: return None - # filter embedding_class - engine_params = deepcopy(EMBEDDING_ENGINES[model_name]) - for engine, params in engine_params.items(): + # Get all supported engines, not just currently available ones + all_supported_engines = list(EMBEDDING_SUPPORTED_ENGINES.keys()) + engine_params = {} + + # First add currently available engine parameters + available_engines = deepcopy(EMBEDDING_ENGINES[model_name]) + for engine, params in available_engines.items(): for param in params: - del param["embedding_class"] + # Remove previous available attribute as available engines don't need this flag + if "available" in param: + del param["available"] + engine_params[engine] = params + + # Check unavailable engines + for engine_name in all_supported_engines: + if engine_name not in engine_params: # Engine not in available list + try: + engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] + error_msg = None + + # Try to find specific error reasons + for engine_class in engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available = engine_class.check_lib() + if not lib_available: + error_msg = ( + f"Engine {engine_name} library is not available" + ) + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name == "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + elif engine_name == "SentenceTransformers": + module_name = "sentence_transformers" + + importlib.import_module(module_name) + break + except ImportError as e: + error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + error_msg = ( + f"Engine {engine_name} is not available: {str(e)}" + ) + + if error_msg is None: + error_msg = f"Engine {engine_name} is not compatible with current model or environment" + + # For unavailable engines, directly return error message string + engine_params[engine_name] = error_msg + + except Exception as e: + # If exception occurs during checking, return error message string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) + + # Filter out embedding_class field + for engine, params in 
engine_params.items(): + if isinstance( + params, list + ): # Only process parameter lists of available engines + for param in params: + if "embedding_class" in param: + del param["embedding_class"] return engine_params elif model_type == "rerank": - from .rerank.rerank_family import RERANK_ENGINES + from .rerank.rerank_family import ( + RERANK_ENGINES, + ) + from .rerank.rerank_family import SUPPORTED_ENGINES as RERANK_SUPPORTED_ENGINES if model_name not in RERANK_ENGINES: return None - # filter rerank_class - engine_params = deepcopy(RERANK_ENGINES[model_name]) - for engine, params in engine_params.items(): + # Get all supported engines, not just currently available ones + all_supported_engines = list(RERANK_SUPPORTED_ENGINES.keys()) + engine_params = {} + + # First add currently available engine parameters + available_engines = deepcopy(RERANK_ENGINES[model_name]) + for engine, params in available_engines.items(): for param in params: - del param["rerank_class"] + # Remove previous available attribute as available engines don't need this flag + if "available" in param: + del param["available"] + engine_params[engine] = params + + # Check unavailable engines + for engine_name in all_supported_engines: + if engine_name not in engine_params: # Engine not in available list + try: + engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] + error_msg = None + + # Try to find specific error reasons + for engine_class in engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available = engine_class.check_lib() + if not lib_available: + error_msg = ( + f"Engine {engine_name} library is not available" + ) + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name == "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + elif engine_name == "SentenceTransformers": + module_name = "sentence_transformers" + + importlib.import_module(module_name) + break + except ImportError as e: + error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + error_msg = ( + f"Engine {engine_name} is not available: {str(e)}" + ) + + if error_msg is None: + error_msg = f"Engine {engine_name} is not compatible with current model or environment" + + # For unavailable engines, directly return error message string + engine_params[engine_name] = error_msg + + except Exception as e: + # If exception occurs during checking, return error message string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) + + # Filter out rerank_class field + for engine, params in engine_params.items(): + if isinstance( + params, list + ): # Only process parameter lists of available engines + for param in params: + if "rerank_class" in param: + del param["rerank_class"] return engine_params else: From 5347c4be930b4125382555c1328b78b4fd8a1fce Mon Sep 17 00:00:00 2001 From: yiboyasss <3359595624@qq.com> Date: Mon, 13 Oct 2025 18:16:12 +0800 Subject: [PATCH 02/37] feat: frontend supports engine ability display --- .../components/launchModelDrawer.js | 69 ++++++------------- .../launch_model/components/selectField.js | 42 +++++++++++ 2 files changed, 64 insertions(+), 47 deletions(-) create mode 100644 
xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js diff --git a/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js b/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js index 1169f06269..ccff202111 100644 --- a/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js +++ b/xinference/ui/web/ui/src/scenes/launch_model/components/launchModelDrawer.js @@ -13,15 +13,11 @@ import { CircularProgress, Collapse, Drawer, - FormControl, FormControlLabel, - InputLabel, ListItemButton, ListItemText, - MenuItem, Radio, RadioGroup, - Select, Switch, TextField, Tooltip, @@ -39,45 +35,11 @@ import DynamicFieldList from './dynamicFieldList' import getModelFormConfig from './modelFormConfig' import PasteDialog from './pasteDialog' import Progress from './progress' +import SelectField from './selectField' const enginesWithNWorker = ['SGLang', 'vLLM', 'MLX'] const modelEngineType = ['LLM', 'embedding', 'rerank'] -const SelectField = ({ - label, - labelId, - name, - value, - onChange, - options = [], - disabled = false, - required = false, -}) => ( - - {label} - - -) - const LaunchModelDrawer = ({ modelData, modelType, @@ -549,19 +511,32 @@ const LaunchModelDrawer = ({ const engineItems = useMemo(() => { return engineOptions.map((engine) => { - const modelFormats = Array.from( - new Set(enginesObj[engine]?.map((item) => item.model_format)) - ) + const engineData = enginesObj[engine] + let modelFormats = [] + let label = engine + let disabled = false + + if (Array.isArray(engineData)) { + modelFormats = Array.from( + new Set(engineData.map((item) => item.model_format)) + ) - const relevantSpecs = modelData.model_specs.filter((spec) => - modelFormats.includes(spec.model_format) - ) + const relevantSpecs = modelData.model_specs.filter((spec) => + modelFormats.includes(spec.model_format) + ) + + const cached = relevantSpecs.some((spec) => isCached(spec)) - const cached = relevantSpecs.some((spec) => isCached(spec)) + label = cached ? `${engine} ${t('launchModel.cached')}` : engine + } else if (typeof engineData === 'string') { + label = `${engine} (${engineData})` + disabled = true + } return { value: engine, - label: cached ? 
`${engine} ${t('launchModel.cached')}` : engine, + label, + disabled, } }) }, [engineOptions, enginesObj, modelData]) diff --git a/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js b/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js new file mode 100644 index 0000000000..7e9a4af8ce --- /dev/null +++ b/xinference/ui/web/ui/src/scenes/launch_model/components/selectField.js @@ -0,0 +1,42 @@ +import { FormControl, InputLabel, MenuItem, Select } from '@mui/material' + +const SelectField = ({ + label, + labelId, + name, + value, + onChange, + options = [], + disabled = false, + required = false, +}) => ( + + {label} + + +) + +export default SelectField From 2466777ddf2a3431f35b7770b9003a78242cdbe3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 09:52:09 +0800 Subject: [PATCH 03/37] FEAT: add engine ability display --- xinference/model/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 0d8e471bb0..ea1c18eec8 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,6 +474,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[dict], str]]]: + engine_params: Optional[Dict[str, Union[List[dict], str]]] = None + if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES From 8e1fa20df8db50443bd75271424a0f2fba834a41 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 10:01:29 +0800 Subject: [PATCH 04/37] FEAT: add engine ability display --- xinference/model/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ea1c18eec8..7763b6fba5 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -506,7 +506,7 @@ def get_engine_params_by_name( for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -587,14 +587,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] + engine_classes: Any = EMBEDDING_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -675,14 +675,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] + engine_classes: Any = RERANK_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" From da58bf468322393589b63b788e7c5b78c32a6568 Mon Sep 17 00:00:00 2001 From: OliverBryant 
<2713999266@qq.com> Date: Tue, 14 Oct 2025 10:48:18 +0800 Subject: [PATCH 05/37] FEAT: add engine ability display --- xinference/model/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 7763b6fba5..d1bd6f072f 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -473,8 +473,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str -) -> Optional[Dict[str, Union[List[dict], str]]]: - engine_params: Optional[Dict[str, Union[List[dict], str]]] = None +) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: + engine_params: Optional[Dict[str, Union[List[Dict[str, Any]], str]]] = None if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES @@ -506,7 +506,7 @@ def get_engine_params_by_name( for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() + lib_available = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -587,14 +587,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes: Any = EMBEDDING_SUPPORTED_ENGINES[engine_name] + engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() + lib_available = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -675,14 +675,14 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes: Any = RERANK_SUPPORTED_ENGINES[engine_name] + engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons for engine_class in engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() + lib_available = engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" From 38aad40977460da0b3548005d545b2eb03d50bf6 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 10:52:46 +0800 Subject: [PATCH 06/37] FEAT: add engine ability display --- xinference/model/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index d1bd6f072f..42f1e5913d 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -499,11 +499,11 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = SUPPORTED_ENGINES[engine_name] + llm_engine_classes = SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons - for engine_class in engine_classes: + for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available = engine_class.check_lib() @@ -587,11 +587,11 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = 
EMBEDDING_SUPPORTED_ENGINES[engine_name] + embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons - for engine_class in engine_classes: + for engine_class in embedding_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available = engine_class.check_lib() @@ -675,11 +675,11 @@ def get_engine_params_by_name( for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: - engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] + rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] error_msg = None # Try to find specific error reasons - for engine_class in engine_classes: + for engine_class in rerank_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available = engine_class.check_lib() From a679c3b2be223097099b400f36775a4fd109ac68 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:02:01 +0800 Subject: [PATCH 07/37] FEAT: add engine ability display --- xinference/model/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 42f1e5913d..373a7d24d9 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: - engine_params: Optional[Dict[str, Union[List[Dict[str, Any]], str]]] = None + engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {} if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES @@ -484,7 +484,6 @@ def get_engine_params_by_name( # Get all supported engines, not just currently available ones all_supported_engines = list(SUPPORTED_ENGINES.keys()) - engine_params = {} # First add currently available engine parameters available_engines = deepcopy(LLM_ENGINES[model_name]) @@ -572,7 +571,6 @@ def get_engine_params_by_name( # Get all supported engines, not just currently available ones all_supported_engines = list(EMBEDDING_SUPPORTED_ENGINES.keys()) - engine_params = {} # First add currently available engine parameters available_engines = deepcopy(EMBEDDING_ENGINES[model_name]) @@ -660,7 +658,6 @@ def get_engine_params_by_name( # Get all supported engines, not just currently available ones all_supported_engines = list(RERANK_SUPPORTED_ENGINES.keys()) - engine_params = {} # First add currently available engine parameters available_engines = deepcopy(RERANK_ENGINES[model_name]) From 340ff708d41410062c0386e14ffeb505b2a6fbe9 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:11:30 +0800 Subject: [PATCH 08/37] FEAT: add engine ability display --- xinference/model/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 373a7d24d9..5f2d437219 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -589,10 +589,10 @@ def get_engine_params_by_name( error_msg = None # Try to find specific error reasons - for engine_class in embedding_engine_classes: + for embedding_engine_class in embedding_engine_classes: try: - if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + if hasattr(embedding_engine_class, "check_lib"): + lib_available = embedding_engine_class.check_lib() if not lib_available: error_msg = ( 
f"Engine {engine_name} library is not available" @@ -676,10 +676,10 @@ def get_engine_params_by_name( error_msg = None # Try to find specific error reasons - for engine_class in rerank_engine_classes: + for rerank_engine_class in rerank_engine_classes: try: - if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + if hasattr(rerank_engine_class, "check_lib"): + lib_available = rerank_engine_class.check_lib() if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" From 19e1e2a1fdea15472a18be13073784a11901c70e Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:26:09 +0800 Subject: [PATCH 09/37] FEAT: add engine ability display --- xinference/model/utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 5f2d437219..b073cc879b 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -505,7 +505,7 @@ def get_engine_params_by_name( for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available = engine_class.check_lib() + lib_available: bool = engine_class.check_lib() # type: ignore[assignment] if not lib_available: error_msg = ( f"Engine {engine_name} library is not available" @@ -540,11 +540,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = error_msg # type: ignore[arg-type] except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( + engine_params[engine_name] = ( # type: ignore[arg-type] f"Error checking engine {engine_name}: {str(e)}" ) @@ -592,8 +592,8 @@ def get_engine_params_by_name( for embedding_engine_class in embedding_engine_classes: try: if hasattr(embedding_engine_class, "check_lib"): - lib_available = embedding_engine_class.check_lib() - if not lib_available: + embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment] + if not embedding_lib_available: error_msg = ( f"Engine {engine_name} library is not available" ) @@ -629,11 +629,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = error_msg # type: ignore[arg-type] except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( + engine_params[engine_name] = ( # type: ignore[arg-type] f"Error checking engine {engine_name}: {str(e)}" ) @@ -679,8 +679,8 @@ def get_engine_params_by_name( for rerank_engine_class in rerank_engine_classes: try: if hasattr(rerank_engine_class, "check_lib"): - lib_available = rerank_engine_class.check_lib() - if not lib_available: + rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment] + if not rerank_lib_available: error_msg = ( f"Engine {engine_name} library is not available" ) @@ -716,11 +716,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = 
error_msg # type: ignore[arg-type] except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( + engine_params[engine_name] = ( # type: ignore[arg-type] f"Error checking engine {engine_name}: {str(e)}" ) From cc84a84bc2817c28268f965d9d161def9a458f2c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:48:54 +0800 Subject: [PATCH 10/37] FEAT: add engine ability display --- xinference/model/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index b073cc879b..783ceba2e4 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -540,11 +540,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg # type: ignore[arg-type] + engine_params[engine_name] = error_msg except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( # type: ignore[arg-type] + engine_params[engine_name] = ( f"Error checking engine {engine_name}: {str(e)}" ) @@ -629,11 +629,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg # type: ignore[arg-type] + engine_params[engine_name] = error_msg except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( # type: ignore[arg-type] + engine_params[engine_name] = ( f"Error checking engine {engine_name}: {str(e)}" ) @@ -716,11 +716,11 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg # type: ignore[arg-type] + engine_params[engine_name] = error_msg except Exception as e: # If exception occurs during checking, return error message string - engine_params[engine_name] = ( # type: ignore[arg-type] + engine_params[engine_name] = ( f"Error checking engine {engine_name}: {str(e)}" ) From d9b3a434c09a4f2b552aedec487258a6b432ca3c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 14 Oct 2025 11:57:11 +0800 Subject: [PATCH 11/37] FEAT: add engine ability display --- xinference/model/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 783ceba2e4..18de3c26e4 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: - engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {} + engine_params: Dict[str, Any] = {} if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES From d9d313699613323e94b83b4ad0ff141986d2f209 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 11:22:30 +0800 Subject: [PATCH 12/37] modify accomplishment measure --- xinference/model/embedding/core.py | 40 ++ xinference/model/embedding/llama_cpp/core.py | 62 ++- .../embedding/sentence_transformers/core.py | 
77 ++- xinference/model/llm/core.py | 38 ++ xinference/model/llm/llama_cpp/core.py | 59 ++- xinference/model/llm/lmdeploy/core.py | 64 ++- xinference/model/llm/mlx/core.py | 158 +++++- xinference/model/llm/sglang/core.py | 229 +++++++-- xinference/model/llm/transformers/core.py | 70 ++- xinference/model/llm/vllm/core.py | 461 +++++++++++++++--- xinference/model/rerank/core.py | 40 ++ .../rerank/sentence_transformers/core.py | 75 ++- xinference/model/rerank/vllm/core.py | 73 ++- xinference/model/utils.py | 140 ++++-- 14 files changed, 1390 insertions(+), 196 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index fffbc7633c..299ec4c5d1 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -171,6 +171,46 @@ def match_json( ) -> bool: pass + @classmethod + def match_json_with_reason( + cls, + model_family: EmbeddingModelFamilyV2, + model_spec: EmbeddingSpecV1, + quantization: str, + ) -> "MatchResult": + """ + Check if the engine can handle the given embedding model with detailed error information. + + This method provides detailed failure reasons and suggestions when an engine + cannot handle a specific model configuration. The default implementation + falls back to the boolean match_json method for backward compatibility. + + Args: + model_family: The embedding model family information + model_spec: The model specification + quantization: The quantization method + + Returns: + MatchResult: Detailed match result with reasons and suggestions + """ + from .match_result import ErrorType, MatchResult + + # Default implementation for backward compatibility + if cls.match_json(model_family, model_spec, quantization): + return MatchResult.success() + else: + # Get basic reason based on common failure patterns + if not cls.check_lib(): + return MatchResult.failure( + reason=f"Required library for {cls.__name__} is not available", + error_type=ErrorType.DEPENDENCY_MISSING, + ) + else: + return MatchResult.failure( + reason=f"Embedding model configuration is not compatible with {cls.__name__}", + error_type=ErrorType.MODEL_COMPATIBILITY, + ) + @classmethod def match( cls, diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index fb8c4e45ca..6e2908ffdd 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -235,6 +235,64 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: EmbeddingModelFamilyV2, + model_spec: EmbeddingSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="llama.cpp library (xllamacpp) is not installed for embedding", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="xllamacpp package not found in Python environment", + ) + + # Check model format compatibility if model_spec.model_format not in ["ggufv2"]: - return False - return True + return MatchResult.failure( + reason=f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, required: ggufv2", + ) + + # Check 
embedding-specific requirements + if not hasattr(model_spec, "model_file_name_template"): + return MatchResult.failure( + reason="GGUF embedding model requires proper file configuration", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing model_file_name_template for GGUF embedding", + ) + + # Check model dimensions for llama.cpp compatibility + model_dimensions = model_family.dimensions + if model_dimensions > 4096: # llama.cpp may have limitations + return MatchResult.failure( + reason=f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large embedding dimensions: {model_dimensions}", + ) + + # Check platform-specific considerations + import platform + + current_platform = platform.system() + + # llama.cpp works across platforms but may have performance differences + if current_platform == "Windows": + return MatchResult.failure( + reason="llama.cpp embedding may have limited performance on Windows", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Windows platform: {current_platform}", + ) + + return MatchResult.success() diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 05f7753e8e..843d68ea37 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -434,5 +434,78 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: - # As default embedding engine, sentence-transformer support all models - return model_spec.model_format in ["pytorch"] + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: EmbeddingModelFamilyV2, + model_spec: EmbeddingSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="Sentence Transformers library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="sentence_transformers package not found in Python environment", + ) + + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return MatchResult.failure( + reason=f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", + ) + + # Check model dimensions compatibility + model_dimensions = model_family.dimensions + if model_dimensions > 1536: # Very large embedding models + return MatchResult.failure( + reason=f"Large embedding model detected ({model_dimensions} dimensions)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large embedding dimensions: {model_dimensions}", + ) + + # Check token limits + max_tokens = model_family.max_tokens + if max_tokens > 8192: # Very high token limits + return MatchResult.failure( + reason=f"High token limit model detected (max_tokens: {max_tokens})", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"High max_tokens: {max_tokens}", + ) + + # Check for special model requirements + model_name = model_family.model_name.lower() + + # Check Qwen2 GTE models + if "gte" in model_name and "qwen2" in model_name: + # 
These models have specific requirements + if not hasattr(cls, "_check_qwen_gte_requirements"): + return MatchResult.failure( + reason="Qwen2 GTE models require special handling", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details="Qwen2 GTE model special requirements", + ) + + # Check Qwen3 models + if "qwen3" in model_name: + # Qwen3 has flash attention requirements + try: + # This would be checked during actual loading + pass + except Exception: + return MatchResult.failure( + reason="Qwen3 embedding model may have compatibility issues", + error_type=ErrorType.VERSION_REQUIREMENT, + technical_details="Qwen3 model compatibility check", + ) + + return MatchResult.success() diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index 8abc8f04a6..ee446d024a 100644 --- a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: from .llm_family import LLMFamilyV2, LLMSpecV1 + from .match_result import ErrorType, MatchResult logger = logging.getLogger(__name__) @@ -159,6 +160,43 @@ def match_json( ) -> bool: raise NotImplementedError + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + """ + Check if the engine can handle the given model with detailed error information. + + This method provides detailed failure reasons and suggestions when an engine + cannot handle a specific model configuration. The default implementation + falls back to the boolean match_json method for backward compatibility. + + Args: + llm_family: The model family information + llm_spec: The model specification + quantization: The quantization method + + Returns: + MatchResult: Detailed match result with reasons and suggestions + """ + from .match_result import ErrorType, MatchResult + + # Default implementation for backward compatibility + if cls.match_json(llm_family, llm_spec, quantization): + return MatchResult.success() + else: + # Get basic reason based on common failure patterns + if not cls.check_lib(): + return MatchResult.failure( + reason=f"Required library for {cls.__name__} is not available", + error_type=ErrorType.DEPENDENCY_MISSING, + ) + else: + return MatchResult.failure( + reason=f"Model configuration is not compatible with {cls.__name__}", + error_type=ErrorType.MODEL_COMPATIBILITY, + ) + def prepare_parse_reasoning_content( self, reasoning_content: bool, enable_thinking: bool = True ): diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index d009378dbe..f35fae9f6e 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -86,14 +86,67 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="llama.cpp library (xllamacpp) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="xllamacpp package not found in Python environment", + ) + + # Check model format compatibility if llm_spec.model_format not in ["ggufv2"]: - return False + return 
MatchResult.failure( + reason=f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}, required: ggufv2", + ) + + # Check model abilities - llama.cpp supports both chat and generation if ( "chat" not in llm_family.model_ability and "generate" not in llm_family.model_ability ): - return False - return True + return MatchResult.failure( + reason=f"llama.cpp requires 'chat' or 'generate' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Check platform-specific issues + import platform + + current_platform = platform.system() + + # Check for ARM64 specific issues + if current_platform == "Darwin" and platform.machine() == "arm64": + # Apple Silicon specific checks could go here + pass + elif current_platform == "Windows": + # Windows specific checks could go here + pass + + # Check memory requirements (basic heuristic) + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Very large models + return MatchResult.failure( + reason=f"llama.cpp may struggle with very large models ({model_size}B parameters)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large model size: {model_size}B parameters", + ) + + return MatchResult.success() def load(self): try: diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 0144a6f734..cd0aa892cf 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -121,7 +121,22 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + return MatchResult.failure( + reason="LMDeploy base model does not support direct inference", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details="LMDeploy base model class is not intended for direct use", + ) def generate( self, @@ -174,13 +189,52 @@ def load(self): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first + if not LMDEPLOY_INSTALLED: + return MatchResult.failure( + reason="LMDeploy library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="lmdeploy package not found in Python environment", + ) + + # Check model format compatibility and quantization if llm_spec.model_format == "awq": - # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits. 
+ # LMDeploy has specific AWQ quantization requirements if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"AWQ + {quantization} not supported by LMDeploy", + ) + + # Check model compatibility if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS: - return False - return LMDEPLOY_INSTALLED + return MatchResult.failure( + reason=f"Chat model not supported by LMDeploy: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported chat model: {llm_family.model_name}", + ) + + # Check model abilities - LMDeploy primarily supports chat models + if "chat" not in llm_family.model_ability: + return MatchResult.failure( + reason=f"LMDeploy Chat requires 'chat' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + return MatchResult.success() async def async_chat( self, diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 80b9c4be2f..cf24d31fdf 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -411,17 +411,67 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["mlx"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX library (mlx_lm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_lm package not found in Python environment", + ) + + # Check platform compatibility - MLX only works on Apple Silicon if sys.platform != "darwin" or platform.processor() != "arm": - # only work for Mac M chips - return False + return MatchResult.failure( + reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", + ) + + # Check model format compatibility + if llm_spec.model_format not in ["mlx"]: + return MatchResult.failure( + reason=f"MLX engine only supports MLX format, got: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", + ) + + # Check model abilities - MLX supports generation but not chat/vision in this base class if "generate" not in llm_family.model_ability: - return False + return MatchResult.failure( + reason=f"MLX engine requires 'generate' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # MLX base model doesn't support chat or vision if "chat" in llm_family.model_ability or "vision" in llm_family.model_ability: - # do not process chat or vision - return False - return True + return MatchResult.failure( + reason="MLX base model does not support chat or vision abilities", + 
error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Unsupported abilities for base MLX: {[a for a in llm_family.model_ability if a in ['chat', 'vision']]}", + ) + + # Check memory constraints for Apple Silicon + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Large models may be problematic + return MatchResult.failure( + reason=f"MLX may have memory limitations with very large models ({model_size}B parameters)", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Large model size: {model_size}B on Apple Silicon", + ) + + return MatchResult.success() def _get_prompt_cache( self, prompt, lora_name: Optional[str] = None, model: Any = None @@ -722,17 +772,39 @@ def _sanitize_generate_config( def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["mlx"]: - return False - if sys.platform != "darwin" or platform.processor() != "arm": - # only work for Mac M chips - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Check chat ability if "chat" not in llm_family.model_ability: - return False + return MatchResult.failure( + reason=f"MLX Chat requires 'chat' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # MLX Chat doesn't support vision if "vision" in llm_family.model_ability: - # do not process vision - return False - return True + return MatchResult.failure( + reason="MLX Chat model does not support vision abilities", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Vision ability not supported in MLXChatModel", + ) + + return MatchResult.success() def chat( self, @@ -786,14 +858,54 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["mlx"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first - MLX Vision uses mlx_vlm + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX Vision library (mlx_vlm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_vlm package not found in Python environment", + ) + + # Check platform compatibility if sys.platform != "darwin" or platform.processor() != "arm": - # only work for Mac M chips - return False + return MatchResult.failure( + reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", + ) + + # Check model format compatibility + if 
llm_spec.model_format not in ["mlx"]: + return MatchResult.failure( + reason=f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", + ) + + # Check vision ability if "vision" not in llm_family.model_ability: - return False - return True + return MatchResult.failure( + reason=f"MLX Vision requires 'vision' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Check for distributed inference limitations + # MLX Vision models don't support distributed inference + # This could be checked here if needed + + return MatchResult.success() def _load_model(self, **kwargs): try: diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index d3bbfc1570..f3658b5ed7 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -15,6 +15,7 @@ import json import logging import multiprocessing +import platform import sys import threading import time @@ -341,24 +342,104 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first + if not SGLANG_INSTALLED: + return MatchResult.failure( + reason="SGLang library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="sglang package not found in Python environment", + ) + + # Check hardware requirements - SGLang requires CUDA if not cls._has_cuda_device(): - return False + return MatchResult.failure( + reason="SGLang requires CUDA GPU support", + error_type=ErrorType.HARDWARE_REQUIREMENT, + technical_details="No CUDA devices detected", + ) + + # Check OS requirements if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + return MatchResult.failure( + reason="SGLang only supports Linux operating system", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current OS: {platform.system()}, required: Linux", + ) + + # Check model format compatibility + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"SGLang does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}", + ) + + # Check quantization compatibility with format if llm_spec.model_format == "pytorch": - if quantization != "none" and not (quantization is None): - return False + if quantization != "none" and quantization is not None: + return MatchResult.failure( + reason=f"SGLang pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"pytorch + {quantization} combination not supported", + ) + + # Check model compatibility if isinstance(llm_family, CustomLLMFamilyV2): if llm_family.model_family not in SGLANG_SUPPORTED_MODELS: - return False + return MatchResult.failure( + 
reason=f"Custom model family not supported by SGLang: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom family: {llm_family.model_family}", + ) else: if llm_family.model_name not in SGLANG_SUPPORTED_MODELS: - return False - if "generate" not in llm_family.model_ability: - return False - return SGLANG_INSTALLED + return MatchResult.failure( + reason=f"Model not supported by SGLang: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported model: {llm_family.model_name}", + ) + + # Check model abilities with flexible logic + # SGLang can handle models with various text generation capabilities + has_text_capability = ( + "generate" in llm_family.model_ability + or "chat" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + or "tools" in llm_family.model_ability + ) + + if not has_text_capability: + return MatchResult.failure( + reason=f"SGLang requires text generation capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # SGLang is primarily designed for text models, not specialized models + specialized_abilities = ["embedding", "rerank", "audio", "vision"] + has_specialized = any( + ability in llm_family.model_ability for ability in specialized_abilities + ) + if has_specialized: + return MatchResult.failure( + reason=f"SGLang is designed for text models, this model has specialized abilities: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Specialized abilities: {[a for a in llm_family.model_ability if a in specialized_abilities]}", + ) + + return MatchResult.success() @staticmethod def _convert_state_to_completion_chunk( @@ -647,20 +728,65 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Check model format compatibility (same as base) + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"SGLang Chat does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Chat model unsupported format: {llm_spec.model_format}", + ) + + # Check quantization compatibility with format if llm_spec.model_format == "pytorch": - if quantization != "none" and not (quantization is None): - return False + if quantization != "none" and quantization is not None: + return MatchResult.failure( + reason=f"SGLang Chat pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Chat pytorch + {quantization} not supported", + ) + + # Check chat model compatibility if isinstance(llm_family, CustomLLMFamilyV2): if 
llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS: - return False + return MatchResult.failure( + reason=f"Custom chat model not supported by SGLang: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom chat family: {llm_family.model_family}", + ) else: if llm_family.model_name not in SGLANG_SUPPORTED_CHAT_MODELS: - return False + return MatchResult.failure( + reason=f"Chat model not supported by SGLang: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported chat model: {llm_family.model_name}", + ) + + # Check chat ability if "chat" not in llm_family.model_ability: - return False - return SGLANG_INSTALLED + return MatchResult.failure( + reason=f"SGLang Chat requires 'chat' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + return MatchResult.success() def _sanitize_chat_config( self, @@ -734,24 +860,65 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if not cls._has_cuda_device(): - return False - if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Vision models have the same format restrictions as base SGLANG + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"SGLang Vision does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Vision model unsupported format: {llm_spec.model_format}", + ) + + # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": - if quantization != "none" and not (quantization is None): - return False + if quantization != "none" and quantization is not None: + return MatchResult.failure( + reason=f"SGLang Vision pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Vision pytorch + {quantization} not supported", + ) + + # Check vision model compatibility if isinstance(llm_family, CustomLLMFamilyV2): if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return False + return MatchResult.failure( + reason=f"Custom vision model not supported by SGLang: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom vision family: {llm_family.model_family}", + ) else: if llm_family.model_name not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return False + return MatchResult.failure( + reason=f"Vision model not supported by SGLang: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported vision model: {llm_family.model_name}", + ) + + # Check vision ability if "vision" not in 
llm_family.model_ability: - return False - return SGLANG_INSTALLED + return MatchResult.failure( + reason=f"SGLang Vision requires 'vision' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + return MatchResult.success() def _sanitize_chat_config( self, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 6ad98c38e8..89a966136d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -500,14 +500,72 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="Transformers library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="transformers or torch package not found", + ) + + # Check model format compatibility + supported_formats = ["pytorch", "gptq", "awq", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"Transformers does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Transformers unsupported format: {llm_spec.model_format}", + ) + + # Check for models that shouldn't use Transformers by default model_family = llm_family.model_family or llm_family.model_name if model_family in NON_DEFAULT_MODEL_LIST: - return False - if "generate" not in llm_family.model_ability: - return False - return True + return MatchResult.failure( + reason=f"Model {model_family} is not recommended for Transformers engine", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Model in NON_DEFAULT_MODEL_LIST: {model_family}", + ) + + # Check model abilities with flexible logic + # Transformers can handle models with various text processing capabilities + has_text_capability = ( + "generate" in llm_family.model_ability + or "chat" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + or "tools" in llm_family.model_ability + ) + + if not has_text_capability: + return MatchResult.failure( + reason=f"Transformers engine requires text processing capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Check for highly specialized models that might not work well with generic Transformers engine + specialized_abilities = ["embedding", "rerank", "audio", "vision"] + has_specialized = any( + ability in llm_family.model_ability for ability in specialized_abilities + ) + if has_specialized and not has_text_capability: + return MatchResult.failure( + reason=f"Model requires specialized engine for its abilities: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Specialized abilities detected: {[a for a in llm_family.model_ability if a in specialized_abilities]}", + ) + + return MatchResult.success() def 
build_prefill_attention_mask( self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 58b0a523aa..9d76d5685e 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -19,6 +19,7 @@ import logging import multiprocessing import os +import platform import sys import threading import time @@ -880,35 +881,178 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability first + if not VLLM_INSTALLED: + return MatchResult.failure( + reason="vLLM library is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="vllm package not found in Python environment", + ) + + # Check hardware requirements if not cls._has_cuda_device() and not cls._has_mlu_device(): - return False + return MatchResult.failure( + reason="vLLM requires CUDA or MLU accelerator support", + error_type=ErrorType.HARDWARE_REQUIREMENT, + technical_details="No CUDA or MLU devices detected", + ) + + # Check OS requirements if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + return MatchResult.failure( + reason="vLLM only supports Linux operating system", + error_type=ErrorType.OS_REQUIREMENT, + technical_details=f"Current OS: {platform.system()}, required: Linux", + ) + + # Check model format + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"vLLM does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {llm_spec.model_format}", + ) + + # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return False + return MatchResult.failure( + reason=f"vLLM pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"pytorch + {quantization} combination not supported", + ) + if llm_spec.model_format == "awq": - # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits. 
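+            # Substring check: any quantization label containing "4"
+            # (e.g. "Int4", "4-bit") is treated as 4-bit AWQ weights.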
if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"vLLM AWQ format requires 4-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"AWQ + {quantization} not supported, only 4-bit", + ) + if llm_spec.model_format == "gptq": if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): if not any(q in quantization for q in ("3", "4", "8")): - return False + return MatchResult.failure( + reason=f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"GPTQ + {quantization} not supported with vLLM >= 0.3.3", + ) else: if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"Older vLLM version only supports 4-bit GPTQ, got: {quantization}", + error_type=ErrorType.VERSION_REQUIREMENT, + technical_details=f"GPTQ + {quantization} requires vLLM >= 0.3.3", + ) + + # Check model compatibility with more flexible matching + def is_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants (e.g., qwen3 variants) + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common patterns + model_lower = model_name.lower() + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + ] + ): + # Check if there's a corresponding supported model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + ] + ): + return True + + return False + if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in VLLM_SUPPORTED_MODELS: - return False + if not is_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS + ): + return MatchResult.failure( + reason=f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom family: {llm_family.model_family}", + ) else: - if llm_family.model_name not in VLLM_SUPPORTED_MODELS: - return False - if "generate" not in llm_family.model_ability: - return False - return VLLM_INSTALLED + if not is_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in VLLM_SUPPORTED_MODELS], + ): + return MatchResult.failure( + reason=f"Model may not be supported by vLLM: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported model: {llm_family.model_name}", + ) + + # Check model abilities with flexible logic + # vLLM can handle models that have text generation capabilities + # Models with 'chat' ability usually also support 'generate' + has_text_capability = ( + "generate" in llm_family.model_ability + or "chat" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + or "tools" in llm_family.model_ability + ) + + if not has_text_capability: + return MatchResult.failure( + reason=f"vLLM requires text generation capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Additional check: ensure model doesn't 
have conflicting abilities + conflicting_abilities = ["embedding", "rerank"] + has_conflicting = any( + ability in llm_family.model_ability for ability in conflicting_abilities + ) + if has_conflicting: + return MatchResult.failure( + reason=f"Model has conflicting abilities for vLLM: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Conflicting abilities detected: {[a for a in llm_family.model_ability if a in conflicting_abilities]}", + ) + + # All checks passed + return MatchResult.success() @staticmethod def _convert_request_output_to_completion_chunk( @@ -1316,40 +1460,141 @@ class VLLMChatModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if llm_spec.model_format not in [ - "pytorch", - "gptq", - "awq", - "fp8", - "bnb", - "ggufv2", - ]: - return False - if llm_spec.model_format == "pytorch": - if quantization != "none" and quantization is not None: - return False - if llm_spec.model_format == "awq": - if not any(q in quantization for q in ("4", "8")): - return False - if llm_spec.model_format == "gptq": - if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): - if not any(q in quantization for q in ("3", "4", "8")): - return False - else: - if "4" not in quantization: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Chat-specific format support (includes GGUFv2 for newer vLLM) + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb", "ggufv2"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + reason=f"vLLM Chat does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Chat model unsupported format: {llm_spec.model_format}", + ) + + # GGUFv2 requires newer vLLM version if llm_spec.model_format == "ggufv2": if not (VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.2")): - return False + return MatchResult.failure( + reason="vLLM GGUF support requires version >= 0.8.2", + error_type=ErrorType.VERSION_REQUIREMENT, + technical_details=f"Current vLLM: {VLLM_VERSION}, required: >=0.8.2", + ) + + # AWQ chat models support more quantization levels + if llm_spec.model_format == "awq": + if not any(q in quantization for q in ("4", "8")): + return MatchResult.failure( + reason=f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Chat AWQ + {quantization} not supported", + ) + + # Check chat model compatibility with flexible matching + def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if chat model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common chat model patterns + model_lower = 
model_name.lower() + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + "glm", + "chatglm", + ] + ): + # Check if there's a corresponding supported chat model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in [ + "qwen3", + "llama", + "mistral", + "gemma", + "baichuan", + "deepseek", + "glm", + "chatglm", + ] + ): + return True + + return False + if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS: - return False + if not is_chat_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS + ): + return MatchResult.failure( + reason=f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom chat family: {llm_family.model_family}", + ) else: - if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS: - return False - if "chat" not in llm_family.model_ability: - return False - return VLLM_INSTALLED + if not is_chat_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in VLLM_SUPPORTED_CHAT_MODELS], + ): + return MatchResult.failure( + reason=f"Chat model may not be supported by vLLM: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported chat model: {llm_family.model_name}", + ) + + # Check chat ability with flexible logic + # vLLM Chat should work with models that have conversation capabilities + has_chat_capability = ( + "chat" in llm_family.model_ability + or "generate" in llm_family.model_ability + or "reasoning" in llm_family.model_ability + ) + + if not has_chat_capability: + return MatchResult.failure( + reason=f"vLLM Chat requires conversation capabilities, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: {llm_family.model_ability}", + ) + + # Additional check: ensure model is not purely a tool model without conversation + if set(llm_family.model_ability) == {"tools"}: + return MatchResult.failure( + reason=f"Model only has 'tools' capability without conversation support: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Tool-only model detected", + ) + + return MatchResult.success() def _sanitize_chat_config( self, @@ -1494,38 +1739,110 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if not cls._has_cuda_device() and not cls._has_mlu_device(): - return False - if not cls._is_linux(): - return False - if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]: - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Use base class validation first + base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + if not base_result.is_match: + return base_result + + # Vision models have the same format restrictions as base VLLM + supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] + if llm_spec.model_format not in supported_formats: + return MatchResult.failure( + 
reason=f"vLLM Vision does not support model format: {llm_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Vision model unsupported format: {llm_spec.model_format}", + ) + + # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return False + return MatchResult.failure( + reason=f"vLLM Vision pytorch format does not support quantization: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Vision pytorch + {quantization} not supported", + ) + + # AWQ vision models support more quantization levels than base if llm_spec.model_format == "awq": if not any(q in quantization for q in ("4", "8")): - return False - if llm_spec.model_format == "gptq": - if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): - if not any(q in quantization for q in ("3", "4", "8")): - return False - else: - if "4" not in quantization: - return False + return MatchResult.failure( + reason=f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}", + error_type=ErrorType.QUANTIZATION, + technical_details=f"Vision AWQ + {quantization} not supported", + ) + + # Check vision model compatibility with flexible matching + def is_vision_model_supported( + model_name: str, supported_list: List[str] + ) -> bool: + """Check if vision model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common vision model patterns + model_lower = model_name.lower() + if any( + family in model_lower + for family in ["llama", "qwen", "internvl", "glm", "phi"] + ): + # Check if there's a corresponding supported vision model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in ["llama", "qwen", "internvl", "glm", "phi"] + ): + return True + + return False + if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in VLLM_SUPPORTED_MULTI_MODEL_LIST: - return False + if not is_vision_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_VISION_MODEL_LIST + ): + return MatchResult.failure( + reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Custom vision family: {llm_family.model_family}", + ) else: - if llm_family.model_name not in VLLM_SUPPORTED_MULTI_MODEL_LIST: - return False - if ( - "vision" not in llm_family.model_ability - and "audio" not in llm_family.model_ability - and "omni" not in llm_family.model_ability - ): - return False - return VLLM_INSTALLED + if not is_vision_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in VLLM_SUPPORTED_VISION_MODEL_LIST], + ): + return MatchResult.failure( + reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported vision model: {llm_family.model_name}", + ) + + # Check vision ability + if "vision" not in llm_family.model_ability: + return MatchResult.failure( + reason=f"vLLM Vision requires 'vision' ability, model has: {llm_family.model_ability}", + error_type=ErrorType.ABILITY_MISMATCH, + technical_details=f"Model abilities: 
{llm_family.model_ability}", + ) + + return MatchResult.success() def _sanitize_model_config( self, model_config: Optional[VLLMModelConfig] diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index ae27e7e85e..929522f23e 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -131,6 +131,46 @@ def match_json( ) -> bool: pass + @classmethod + def match_json_with_reason( + cls, + model_family: RerankModelFamilyV2, + model_spec: RerankSpecV1, + quantization: str, + ) -> "MatchResult": + """ + Check if the engine can handle the given rerank model with detailed error information. + + This method provides detailed failure reasons and suggestions when an engine + cannot handle a specific model configuration. The default implementation + falls back to the boolean match_json method for backward compatibility. + + Args: + model_family: The rerank model family information + model_spec: The model specification + quantization: The quantization method + + Returns: + MatchResult: Detailed match result with reasons and suggestions + """ + from .match_result import ErrorType, MatchResult + + # Default implementation for backward compatibility + if cls.match_json(model_family, model_spec, quantization): + return MatchResult.success() + else: + # Get basic reason based on common failure patterns + if not cls.check_lib(): + return MatchResult.failure( + reason=f"Required library for {cls.__name__} is not available", + error_type=ErrorType.DEPENDENCY_MISSING, + ) + else: + return MatchResult.failure( + reason=f"Rerank model configuration is not compatible with {cls.__name__}", + error_type=ErrorType.MODEL_COMPATIBILITY, + ) + @classmethod def match( cls, diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index fabbb6e593..ee81a9adac 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -191,7 +191,7 @@ def compute_logits(inputs, **kwargs): from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker else: raise RuntimeError( - f"Unsupported Rank model type: {self.model_family.type}" + f"Unsupported Rerank model type: {self.model_family.type}" ) except ImportError: error_message = "Failed to import module 'FlagEmbedding'" @@ -341,5 +341,74 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - # As default embedding engine, sentence-transformer support all models - return model_spec.model_format in ["pytorch"] + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: RerankModelFamilyV2, + model_spec: RerankSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="Sentence Transformers library is not installed for reranking", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="sentence_transformers package not found in Python environment", + ) + + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return MatchResult.failure( + reason=f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, 
required: pytorch", + ) + + # Check rerank-specific requirements + if not hasattr(model_family, "model_name"): + return MatchResult.failure( + reason="Rerank model family requires model name specification", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing model_name in rerank model family", + ) + + # Check model type compatibility + if model_family.type and model_family.type not in [ + "rerank", + "unknown", + "cross-encoder", + "normal", + "LLM-based", + "LLM-based layerwise", + ]: + return MatchResult.failure( + reason=f"Model type '{model_family.type}' may not be compatible with reranking engines", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Model type: {model_family.type}", + ) + + # Check max tokens limit for reranking performance + max_tokens = model_family.max_tokens + if max_tokens and max_tokens > 8192: # High token limits for reranking + return MatchResult.failure( + reason=f"High max_tokens limit for reranking model: {max_tokens}", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"High max_tokens for reranking: {max_tokens}", + ) + + # Check language compatibility + if not model_family.language or len(model_family.language) == 0: + return MatchResult.failure( + reason="Rerank model language information is missing", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing language information in rerank model", + ) + + return MatchResult.success() diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index eac173b40c..f9763b567a 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -149,8 +149,71 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - if model_spec.model_format in ["pytorch"]: - prefix = model_family.model_name.split("-", 1)[0] - if prefix in SUPPORTED_MODELS_PREFIXES: - return True - return False + from ..match_result import MatchResult + + result = cls.match_json_with_reason(model_family, model_spec, quantization) + return result.is_match + + @classmethod + def match_json_with_reason( + cls, + model_family: RerankModelFamilyV2, + model_spec: RerankSpecV1, + quantization: str, + ) -> "MatchResult": + from ..match_result import ErrorType, MatchResult + + # Check library availability + if not cls.check_lib(): + return MatchResult.failure( + reason="vLLM library is not installed for reranking", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="vllm package not found in Python environment", + ) + + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return MatchResult.failure( + reason=f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}", + error_type=ErrorType.MODEL_FORMAT, + technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", + ) + + # Check model name prefix matching + if model_spec.model_format == "pytorch": + try: + prefix = model_family.model_name.split("-", 1)[0].lower() + # Support both prefix matching and special cases + if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]: + # Special handling for Qwen3 models + if "qwen3" not in model_family.model_name.lower(): + return MatchResult.failure( + reason=f"Model family prefix not supported by vLLM reranking: {prefix}", + error_type=ErrorType.MODEL_COMPATIBILITY, + technical_details=f"Unsupported prefix: {prefix}", + ) + except (IndexError, AttributeError): + return MatchResult.failure( + reason="Unable to parse 
model family name for vLLM compatibility check", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"Model name parsing failed: {model_family.model_name}", + ) + + # Check rerank-specific requirements + if not hasattr(model_family, "model_name"): + return MatchResult.failure( + reason="Rerank model family requires model name specification for vLLM", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details="Missing model_name in vLLM rerank model family", + ) + + # Check max tokens limit for vLLM reranking performance + max_tokens = model_family.max_tokens + if max_tokens and max_tokens > 4096: # vLLM has stricter limits + return MatchResult.failure( + reason=f"High max_tokens limit for vLLM reranking model: {max_tokens}", + error_type=ErrorType.CONFIGURATION_ERROR, + technical_details=f"High max_tokens for vLLM reranking: {max_tokens}", + ) + + return MatchResult.success() diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 18de3c26e4..ad0dabbf35 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -494,59 +494,111 @@ def get_engine_params_by_name( del param["available"] engine_params[engine] = params - # Check unavailable engines + # Check unavailable engines with detailed error information for engine_name in all_supported_engines: if engine_name not in engine_params: # Engine not in available list try: llm_engine_classes = SUPPORTED_ENGINES[engine_name] - error_msg = None - # Try to find specific error reasons - for engine_class in llm_engine_classes: + # Try to get detailed error information from engine's match_json_with_reason + detailed_error = None + + # We need a sample model to test against, use the first available spec + if model_name in LLM_ENGINES and LLM_ENGINES[model_name]: + # Try to get model family for testing try: - if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() # type: ignore[assignment] - if not lib_available: - error_msg = ( - f"Engine {engine_name} library is not available" - ) + from .llm.llm_family import match_llm + + llm_family = match_llm(model_name, None, None, None, None) + if llm_family and llm_family.model_specs: + llm_spec = llm_family.model_specs[0] + quantization = llm_spec.quantization or "none" + + # Test each engine class for detailed error info + for engine_class in llm_engine_classes: + try: + if hasattr( + engine_class, "match_json_with_reason" + ): + from .llm.match_result import MatchResult + + result = ( + engine_class.match_json_with_reason( + llm_family, llm_spec, quantization + ) + ) + if not result.is_match: + detailed_error = { + "error": result.reason, + "error_type": result.error_type, + "technical_details": result.technical_details, + } + break + except Exception: + # Fall back to next engine class + continue + except Exception: + # If we can't get model family, continue with basic checking + pass + + if detailed_error: + engine_params[engine_name] = detailed_error + else: + # Fallback to basic error checking for backward compatibility + error_msg = None + for engine_class in llm_engine_classes: + try: + if hasattr(engine_class, "check_lib"): + lib_available: bool = engine_class.check_lib() # type: ignore[assignment] + if not lib_available: + error_msg = { + "error": f"Engine {engine_name} library is not available", + "error_type": "dependency_missing", + } + break + else: + # If no check_lib method, try import check + module_name = engine_name.lower().replace(".", "") + if engine_name == "vLLM": + module_name = "vllm" + elif engine_name 
== "SGLang": + module_name = "sglang" + elif engine_name == "llama.cpp": + module_name = "llama_cpp" + elif engine_name == "MLX": + module_name = "mlx" + elif engine_name == "LMDEPLOY": + module_name = "lmdeploy" + elif engine_name == "Transformers": + module_name = "transformers" + + importlib.import_module(module_name) break - else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = "transformers" - - importlib.import_module(module_name) - break - except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)}" - except Exception as e: - error_msg = ( - f"Engine {engine_name} is not available: {str(e)}" - ) - - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment" - - # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + except ImportError as e: + error_msg = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + } + except Exception as e: + error_msg = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + } + + if error_msg is None: + error_msg = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + } + + engine_params[engine_name] = error_msg except Exception as e: - # If exception occurs during checking, return error message string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)}" - ) + # If exception occurs during checking, return structured error + engine_params[engine_name] = { + "error": f"Error checking engine {engine_name}: {str(e)}", + "error_type": "configuration_error", + } # Filter out llm_class field for engine, params in engine_params.items(): From 08450ac0c283f29a84ca46ac816dba7d05434eb6 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 11:31:00 +0800 Subject: [PATCH 13/37] modify accomplishment measure --- xinference/model/llm/mlx/core.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index cf24d31fdf..d2d4b25697 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -422,15 +422,7 @@ def match_json_with_reason( ) -> "MatchResult": from ..match_result import ErrorType, MatchResult - # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX library (mlx_lm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_lm package not found in Python environment", - ) - - # Check platform compatibility - MLX only works on Apple Silicon + # Check platform compatibility first - MLX only works on Apple Silicon if sys.platform != "darwin" or platform.processor() != "arm": return MatchResult.failure( reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)", @@ -438,6 +430,14 @@ def match_json_with_reason( technical_details=f"Current platform: {sys.platform}, processor: 
{platform.processor()}, required: darwin + arm", ) + # Check library availability (only if platform is compatible) + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX library (mlx_lm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_lm package not found in Python environment", + ) + # Check model format compatibility if llm_spec.model_format not in ["mlx"]: return MatchResult.failure( @@ -869,15 +869,7 @@ def match_json_with_reason( ) -> "MatchResult": from ..match_result import ErrorType, MatchResult - # Check library availability first - MLX Vision uses mlx_vlm - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX Vision library (mlx_vlm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_vlm package not found in Python environment", - ) - - # Check platform compatibility + # Check platform compatibility first - MLX only works on Apple Silicon if sys.platform != "darwin" or platform.processor() != "arm": return MatchResult.failure( reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)", @@ -885,6 +877,14 @@ def match_json_with_reason( technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", ) + # Check library availability (only if platform is compatible) - MLX Vision uses mlx_vlm + if not cls.check_lib(): + return MatchResult.failure( + reason="MLX Vision library (mlx_vlm) is not installed", + error_type=ErrorType.DEPENDENCY_MISSING, + technical_details="mlx_vlm package not found in Python environment", + ) + # Check model format compatibility if llm_spec.model_format not in ["mlx"]: return MatchResult.failure( From e793cd4d1ed470971b03dd93ba0a47705ace27af Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:23:00 +0800 Subject: [PATCH 14/37] modify accomplishment measure --- xinference/model/embedding/match_result.py | 76 +++++++++++++++++++++ xinference/model/llm/match_result.py | 76 +++++++++++++++++++++ xinference/model/rerank/match_result.py | 77 ++++++++++++++++++++++ 3 files changed, 229 insertions(+) create mode 100644 xinference/model/embedding/match_result.py create mode 100644 xinference/model/llm/match_result.py create mode 100644 xinference/model/rerank/match_result.py diff --git a/xinference/model/embedding/match_result.py b/xinference/model/embedding/match_result.py new file mode 100644 index 0000000000..47775f20f9 --- /dev/null +++ b/xinference/model/embedding/match_result.py @@ -0,0 +1,76 @@ +""" +Error handling result structures for embedding model engine matching. + +This module provides structured error handling for engine matching operations, +allowing engines to provide detailed failure reasons and suggestions. +""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class MatchResult: + """ + Result of engine matching operation with detailed error information. + + This class provides structured information about whether an engine can handle + a specific model configuration, and if not, why and what alternatives exist. 
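+
+    Illustrative usage (the reason shown is a hypothetical example)::
+
+        result = MatchResult.failure(
+            reason="sentence-transformers is not installed",
+            error_type=ErrorType.DEPENDENCY_MISSING,
+        )
+        result.is_match           # False
+        result.to_error_string()  # "sentence-transformers is not installed"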
+ """ + + is_match: bool + reason: Optional[str] = None + error_type: Optional[str] = None + technical_details: Optional[str] = None + + @classmethod + def success(cls) -> "MatchResult": + """Create a successful match result.""" + return cls(is_match=True) + + @classmethod + def failure( + cls, + reason: str, + error_type: Optional[str] = None, + technical_details: Optional[str] = None, + ) -> "MatchResult": + """Create a failed match result with optional details.""" + return cls( + is_match=False, + reason=reason, + error_type=error_type, + technical_details=technical_details, + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for API responses.""" + result = {"is_match": self.is_match} + if not self.is_match: + if self.reason: + result["reason"] = self.reason + if self.error_type: + result["error_type"] = self.error_type + if self.technical_details: + result["technical_details"] = self.technical_details + return result + + def to_error_string(self) -> str: + """Convert to error string for backward compatibility.""" + if self.is_match: + return "Available" + error_msg = self.reason or "Unknown error" + return error_msg + + +# Error type constants for better categorization +class ErrorType: + HARDWARE_REQUIREMENT = "hardware_requirement" + OS_REQUIREMENT = "os_requirement" + MODEL_FORMAT = "model_format" + DEPENDENCY_MISSING = "dependency_missing" + MODEL_COMPATIBILITY = "model_compatibility" + DIMENSION_MISMATCH = "dimension_mismatch" + VERSION_REQUIREMENT = "version_requirement" + CONFIGURATION_ERROR = "configuration_error" + ENGINE_UNAVAILABLE = "engine_unavailable" diff --git a/xinference/model/llm/match_result.py b/xinference/model/llm/match_result.py new file mode 100644 index 0000000000..eeff2461f2 --- /dev/null +++ b/xinference/model/llm/match_result.py @@ -0,0 +1,76 @@ +""" +Error handling result structures for engine matching. + +This module provides structured error handling for engine matching operations, +allowing engines to provide detailed failure reasons and suggestions. +""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class MatchResult: + """ + Result of engine matching operation with detailed error information. + + This class provides structured information about whether an engine can handle + a specific model configuration, and if not, why and what alternatives exist. 
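+
+    Illustrative usage (values are hypothetical)::
+
+        ok = MatchResult.success()
+        ok.to_dict()                 # {"is_match": True}
+
+        bad = MatchResult.failure(
+            reason="vLLM requires CUDA or MLU accelerator support",
+            error_type=ErrorType.HARDWARE_REQUIREMENT,
+            technical_details="No CUDA or MLU devices detected",
+        )
+        bad.to_dict()["error_type"]  # "hardware_requirement"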
+ """ + + is_match: bool + reason: Optional[str] = None + error_type: Optional[str] = None + technical_details: Optional[str] = None + + @classmethod + def success(cls) -> "MatchResult": + """Create a successful match result.""" + return cls(is_match=True) + + @classmethod + def failure( + cls, + reason: str, + error_type: Optional[str] = None, + technical_details: Optional[str] = None, + ) -> "MatchResult": + """Create a failed match result with optional details.""" + return cls( + is_match=False, + reason=reason, + error_type=error_type, + technical_details=technical_details, + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for API responses.""" + result = {"is_match": self.is_match} + if not self.is_match: + if self.reason: + result["reason"] = self.reason + if self.error_type: + result["error_type"] = self.error_type + if self.technical_details: + result["technical_details"] = self.technical_details + return result + + def to_error_string(self) -> str: + """Convert to error string for backward compatibility.""" + if self.is_match: + return "Available" + error_msg = self.reason or "Unknown error" + return error_msg + + +# Error type constants for better categorization +class ErrorType: + HARDWARE_REQUIREMENT = "hardware_requirement" + OS_REQUIREMENT = "os_requirement" + MODEL_FORMAT = "model_format" + QUANTIZATION = "quantization" + DEPENDENCY_MISSING = "dependency_missing" + MODEL_COMPATIBILITY = "model_compatibility" + ABILITY_MISMATCH = "ability_mismatch" + VERSION_REQUIREMENT = "version_requirement" + CONFIGURATION_ERROR = "configuration_error" diff --git a/xinference/model/rerank/match_result.py b/xinference/model/rerank/match_result.py new file mode 100644 index 0000000000..125e791afd --- /dev/null +++ b/xinference/model/rerank/match_result.py @@ -0,0 +1,77 @@ +""" +Error handling result structures for rerank model engine matching. + +This module provides structured error handling for engine matching operations, +allowing engines to provide detailed failure reasons and suggestions. +""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class MatchResult: + """ + Result of engine matching operation with detailed error information. + + This class provides structured information about whether an engine can handle + a specific model configuration, and if not, why and what alternatives exist. 
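+
+    Illustrative usage (engine and format are hypothetical)::
+
+        result = MatchResult.failure(
+            reason="vLLM reranking only supports pytorch format, got: ggufv2",
+            error_type=ErrorType.MODEL_FORMAT,
+        )
+        if not result.is_match:
+            # e.g. surfaced to the caller as the reason this engine is unavailable
+            message = result.to_error_string()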
+ """ + + is_match: bool + reason: Optional[str] = None + error_type: Optional[str] = None + technical_details: Optional[str] = None + + @classmethod + def success(cls) -> "MatchResult": + """Create a successful match result.""" + return cls(is_match=True) + + @classmethod + def failure( + cls, + reason: str, + error_type: Optional[str] = None, + technical_details: Optional[str] = None, + ) -> "MatchResult": + """Create a failed match result with optional details.""" + return cls( + is_match=False, + reason=reason, + error_type=error_type, + technical_details=technical_details, + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for API responses.""" + result = {"is_match": self.is_match} + if not self.is_match: + if self.reason: + result["reason"] = self.reason + if self.error_type: + result["error_type"] = self.error_type + if self.technical_details: + result["technical_details"] = self.technical_details + return result + + def to_error_string(self) -> str: + """Convert to error string for backward compatibility.""" + if self.is_match: + return "Available" + error_msg = self.reason or "Unknown error" + return error_msg + + +# Error type constants for better categorization +class ErrorType: + HARDWARE_REQUIREMENT = "hardware_requirement" + OS_REQUIREMENT = "os_requirement" + MODEL_FORMAT = "model_format" + DEPENDENCY_MISSING = "dependency_missing" + MODEL_COMPATIBILITY = "model_compatibility" + DIMENSION_MISMATCH = "dimension_mismatch" + VERSION_REQUIREMENT = "version_requirement" + CONFIGURATION_ERROR = "configuration_error" + ENGINE_UNAVAILABLE = "engine_unavailable" + RERANK_SPECIFIC = "rerank_specific" From 27ea341e43e2c15e96276f1a770104f9bb346691 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:40:08 +0800 Subject: [PATCH 15/37] modify accomplishment measure --- xinference/model/embedding/core.py | 1 + xinference/model/embedding/llama_cpp/core.py | 2 +- .../model/embedding/sentence_transformers/core.py | 2 +- xinference/model/llm/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 2 +- xinference/model/llm/lmdeploy/core.py | 3 +-- xinference/model/llm/mlx/core.py | 5 +---- xinference/model/llm/sglang/core.py | 4 +--- xinference/model/llm/transformers/core.py | 2 +- xinference/model/llm/transformers/multimodal/core.py | 6 ------ xinference/model/llm/vllm/core.py | 10 +++------- xinference/model/rerank/core.py | 1 + xinference/model/rerank/sentence_transformers/core.py | 3 ++- xinference/model/rerank/vllm/core.py | 2 +- xinference/model/utils.py | 2 +- 15 files changed, 17 insertions(+), 30 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index 299ec4c5d1..c7f5ddb554 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -20,6 +20,7 @@ from collections import defaultdict from typing import Annotated, Dict, List, Literal, Optional, Union +from .match_result import MatchResult from ..._compat import ROOT_KEY, BaseModel, ErrorWrapper, Field, ValidationError from ...device_utils import empty_cache from ..core import VirtualEnvSettings diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index 6e2908ffdd..932df57f16 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -24,6 +24,7 @@ from packaging import version +from ..match_result import MatchResult from ....types import Embedding from ..core import EmbeddingModel, 
EmbeddingModelFamilyV2, EmbeddingSpecV1 @@ -235,7 +236,6 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 843d68ea37..6cb66f7ca2 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -19,6 +19,7 @@ import numpy as np import torch +from ..match_result import MatchResult from ....types import Embedding, EmbeddingData, EmbeddingUsage from ...utils import is_flash_attn_available from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 @@ -434,7 +435,6 @@ def match_json( model_spec: EmbeddingSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index ee446d024a..2626060579 100644 --- a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from .llm_family import LLMFamilyV2, LLMSpecV1 - from .match_result import ErrorType, MatchResult + from .match_result import MatchResult logger = logging.getLogger(__name__) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index f35fae9f6e..5790c3a3ca 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -21,6 +21,7 @@ from packaging import version +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk from ..core import LLM, chat_context_var @@ -86,7 +87,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index cd0aa892cf..134e668d7a 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -18,6 +18,7 @@ import torch +from ..match_result import MatchResult from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV2, LLMSpecV1 @@ -121,7 +122,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -189,7 +189,6 @@ def load(self): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index d2d4b25697..7f53112ab3 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -39,6 +39,7 @@ import xoscar as xo +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....fields 
import max_tokens_field from ....types import ( @@ -411,7 +412,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -772,7 +772,6 @@ def _sanitize_generate_config( def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -858,8 +857,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index f3658b5ed7..9365f2833b 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -24,6 +24,7 @@ from xoscar.utils import get_next_port +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -342,7 +343,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -728,7 +728,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -860,7 +859,6 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 89a966136d..bc828d65b3 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -20,6 +20,7 @@ import torch +from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....device_utils import ( get_device_preferred_dtype, @@ -500,7 +501,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match diff --git a/xinference/model/llm/transformers/multimodal/core.py b/xinference/model/llm/transformers/multimodal/core.py index ae67e102b5..4d6451f42e 100644 --- a/xinference/model/llm/transformers/multimodal/core.py +++ b/xinference/model/llm/transformers/multimodal/core.py @@ -39,21 +39,18 @@ def decide_device(self): """ Update self._device """ - pass @abstractmethod def load_processor(self): """ Load self._processor and self._tokenizer """ - pass @abstractmethod def load_multimodal_model(self): """ Load self._model """ - pass def load(self): self.decide_device() @@ -71,7 +68,6 @@ def build_inputs_from_messages( actual parameters needed for inference, 
e.g. input_ids, attention_masks, etc. """ - pass @abstractmethod def build_generate_kwargs( @@ -82,7 +78,6 @@ def build_generate_kwargs( Hyperparameters needed for generation, e.g. temperature, max_new_tokens, etc. """ - pass @abstractmethod def build_streaming_iter( @@ -95,7 +90,6 @@ def build_streaming_iter( The length of prompt token usually comes from the input_ids. In this interface you need to call the `build_inputs_from_messages` and `build_generate_kwargs`. """ - pass def get_stop_strs(self) -> List[str]: return [] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 9d76d5685e..7e9d6d3865 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -42,6 +42,7 @@ from packaging import version from typing_extensions import NotRequired +from ..match_result import MatchResult, ErrorType from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -881,7 +882,6 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -1460,7 +1460,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -1739,7 +1738,6 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin): def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(llm_family, llm_spec, quantization) return result.is_match @@ -1748,7 +1746,6 @@ def match_json( def match_json_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult # Use base class validation first base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) @@ -1816,7 +1813,7 @@ def is_vision_model_supported( if isinstance(llm_family, CustomLLMFamilyV2): if not is_vision_model_supported( - llm_family.model_family.lower(), VLLM_SUPPORTED_VISION_MODEL_LIST + llm_family.model_family.lower() ): return MatchResult.failure( reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", @@ -1825,8 +1822,7 @@ def is_vision_model_supported( ) else: if not is_vision_model_supported( - llm_family.model_name.lower(), - [s.lower() for s in VLLM_SUPPORTED_VISION_MODEL_LIST], + llm_family.model_name.lower() ): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index 929522f23e..d3e3b5702c 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -17,6 +17,7 @@ from collections import defaultdict from typing import Dict, List, Literal, Optional +from .match_result import MatchResult from ..._compat import BaseModel from ...types import Rerank from ..core import VirtualEnvSettings diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index ee81a9adac..87efe31b5b 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ 
b/xinference/model/rerank/sentence_transformers/core.py @@ -22,6 +22,7 @@ import torch import torch.nn as nn +from ..match_result import MatchResult from ....device_utils import empty_cache from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import is_flash_attn_available @@ -341,7 +342,7 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult + pass result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index f9763b567a..114eef5907 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -2,6 +2,7 @@ import uuid from typing import List, Optional +from ..match_result import MatchResult from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 @@ -149,7 +150,6 @@ def match_json( model_spec: RerankSpecV1, quantization: str, ) -> bool: - from ..match_result import MatchResult result = cls.match_json_with_reason(model_family, model_spec, quantization) return result.is_match diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ad0dabbf35..383f188382 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -520,7 +520,7 @@ def get_engine_params_by_name( if hasattr( engine_class, "match_json_with_reason" ): - from .llm.match_result import MatchResult + pass result = ( engine_class.match_json_with_reason( From 114ec633ea524f493a3b509fab49b94d1ea444b3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:41:38 +0800 Subject: [PATCH 16/37] modify accomplishment measure --- xinference/model/embedding/core.py | 2 +- xinference/model/embedding/llama_cpp/core.py | 2 +- .../model/embedding/sentence_transformers/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 2 +- xinference/model/llm/lmdeploy/core.py | 2 +- xinference/model/llm/mlx/core.py | 2 +- xinference/model/llm/sglang/core.py | 2 +- xinference/model/llm/transformers/core.py | 2 +- xinference/model/llm/vllm/core.py | 10 +++------- xinference/model/rerank/core.py | 2 +- xinference/model/rerank/sentence_transformers/core.py | 2 +- xinference/model/rerank/vllm/core.py | 2 +- 12 files changed, 14 insertions(+), 18 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index c7f5ddb554..6f934b6e5f 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -20,12 +20,12 @@ from collections import defaultdict from typing import Annotated, Dict, List, Literal, Optional, Union -from .match_result import MatchResult from ..._compat import ROOT_KEY, BaseModel, ErrorWrapper, Field, ValidationError from ...device_utils import empty_cache from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin from .embed_family import match_embedding +from .match_result import MatchResult logger = logging.getLogger(__name__) diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index 932df57f16..4b3d6ed125 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -24,9 +24,9 @@ from packaging import version -from ..match_result import MatchResult from ....types import Embedding from ..core import EmbeddingModel, 
EmbeddingModelFamilyV2, EmbeddingSpecV1 +from ..match_result import MatchResult logger = logging.getLogger(__name__) diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 6cb66f7ca2..29bcb66a33 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -19,10 +19,10 @@ import numpy as np import torch -from ..match_result import MatchResult from ....types import Embedding, EmbeddingData, EmbeddingUsage from ...utils import is_flash_attn_available from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 +from ..match_result import MatchResult logger = logging.getLogger(__name__) SENTENCE_TRANSFORMER_MODEL_LIST: List[str] = [] diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 5790c3a3ca..386f8eb662 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -21,11 +21,11 @@ from packaging import version -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ChatModelMixin logger = logging.getLogger(__name__) diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 134e668d7a..f1c2605a24 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -18,10 +18,10 @@ import torch -from ..match_result import MatchResult from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk logger = logging.getLogger(__name__) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 7f53112ab3..943dddd7c4 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -39,7 +39,6 @@ import xoscar as xo -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....fields import max_tokens_field from ....types import ( @@ -52,6 +51,7 @@ ) from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 9365f2833b..7095289a5d 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -24,7 +24,6 @@ from xoscar.utils import get_next_port -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -38,6 +37,7 @@ from .. 
import LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2 +from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index bc828d65b3..8fae36576d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -20,7 +20,6 @@ import torch -from ..match_result import MatchResult from ....constants import XINFERENCE_MAX_TOKENS from ....device_utils import ( get_device_preferred_dtype, @@ -41,6 +40,7 @@ from ...utils import select_device from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 +from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, LLAMA3_TOOL_CALL_FAMILY, diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 7e9d6d3865..7bb0664354 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -42,7 +42,6 @@ from packaging import version from typing_extensions import NotRequired -from ..match_result import MatchResult, ErrorType from ....constants import XINFERENCE_MAX_TOKENS from ....types import ( ChatCompletion, @@ -57,6 +56,7 @@ from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2, cache_model_tokenizer_and_config +from ..match_result import ErrorType, MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -1812,18 +1812,14 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_vision_model_supported( - llm_family.model_family.lower() - ): + if not is_vision_model_supported(llm_family.model_family.lower()): return MatchResult.failure( reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", error_type=ErrorType.MODEL_COMPATIBILITY, technical_details=f"Custom vision family: {llm_family.model_family}", ) else: - if not is_vision_model_supported( - llm_family.model_name.lower() - ): + if not is_vision_model_supported(llm_family.model_name.lower()): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", error_type=ErrorType.MODEL_COMPATIBILITY, diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index d3e3b5702c..c02b230abd 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -17,11 +17,11 @@ from collections import defaultdict from typing import Dict, List, Literal, Optional -from .match_result import MatchResult from ..._compat import BaseModel from ...types import Rerank from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin +from .match_result import MatchResult from .rerank_family import check_engine_by_model_name_and_engine, match_rerank logger = logging.getLogger(__name__) diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index 87efe31b5b..a21d4f106a 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -22,7 +22,6 @@ import torch import torch.nn as nn -from ..match_result import MatchResult from ....device_utils import empty_cache from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from 
...utils import is_flash_attn_available @@ -32,6 +31,7 @@ RerankModelFamilyV2, RerankSpecV1, ) +from ..match_result import MatchResult from ..utils import preprocess_sentence logger = logging.getLogger(__name__) diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index 114eef5907..339106f408 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -2,10 +2,10 @@ import uuid from typing import List, Optional -from ..match_result import MatchResult from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 +from ..match_result import MatchResult SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] From c17b78e521c4b686b74ace48c95a3e7025542a79 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:47:39 +0800 Subject: [PATCH 17/37] mypy test --- xinference/model/embedding/match_result.py | 2 +- xinference/model/llm/match_result.py | 2 +- xinference/model/llm/vllm/core.py | 8 ++++---- xinference/model/rerank/match_result.py | 2 +- xinference/model/utils.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xinference/model/embedding/match_result.py b/xinference/model/embedding/match_result.py index 47775f20f9..3e33c268d4 100644 --- a/xinference/model/embedding/match_result.py +++ b/xinference/model/embedding/match_result.py @@ -45,7 +45,7 @@ def failure( def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for API responses.""" - result = {"is_match": self.is_match} + result: Dict[str, Any] = {"is_match": self.is_match} if not self.is_match: if self.reason: result["reason"] = self.reason diff --git a/xinference/model/llm/match_result.py b/xinference/model/llm/match_result.py index eeff2461f2..3ab90d2c37 100644 --- a/xinference/model/llm/match_result.py +++ b/xinference/model/llm/match_result.py @@ -45,7 +45,7 @@ def failure( def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for API responses.""" - result = {"is_match": self.is_match} + result: Dict[str, Any] = {"is_match": self.is_match} if not self.is_match: if self.reason: result["reason"] = self.reason diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 7bb0664354..4aeccc0f21 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -1003,7 +1003,7 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool: return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_model_supported( + if not llm_family.model_family or not is_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS ): return MatchResult.failure( @@ -1551,7 +1551,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_chat_model_supported( + if not llm_family.model_family or not is_chat_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS ): return MatchResult.failure( @@ -1812,14 +1812,14 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_vision_model_supported(llm_family.model_family.lower()): + if not llm_family.model_family or not is_vision_model_supported(llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): return MatchResult.failure( reason=f"Custom vision model may not be fully 
supported by vLLM: {llm_family.model_family}", error_type=ErrorType.MODEL_COMPATIBILITY, technical_details=f"Custom vision family: {llm_family.model_family}", ) else: - if not is_vision_model_supported(llm_family.model_name.lower()): + if not llm_family.model_name or not is_vision_model_supported(llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", error_type=ErrorType.MODEL_COMPATIBILITY, diff --git a/xinference/model/rerank/match_result.py b/xinference/model/rerank/match_result.py index 125e791afd..1cd278aa5d 100644 --- a/xinference/model/rerank/match_result.py +++ b/xinference/model/rerank/match_result.py @@ -45,7 +45,7 @@ def failure( def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for API responses.""" - result = {"is_match": self.is_match} + result: Dict[str, Any] = {"is_match": self.is_match} if not self.is_match: if self.reason: result["reason"] = self.reason diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 383f188382..158fd316c7 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -474,7 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def get_engine_params_by_name( model_type: Optional[str], model_name: str ) -> Optional[Dict[str, Union[List[Dict[str, Any]], str]]]: - engine_params: Dict[str, Any] = {} + engine_params: Dict[str, Union[List[Dict[str, Any]], str]] = {} if model_type == "LLM": from .llm.llm_family import LLM_ENGINES, SUPPORTED_ENGINES From b19475109dc12aa0e5266a293db01591ffa69318 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:48:43 +0800 Subject: [PATCH 18/37] mypy test --- xinference/model/llm/vllm/core.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 4aeccc0f21..bf9f07b813 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -1812,14 +1812,18 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not llm_family.model_family or not is_vision_model_supported(llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): + if not llm_family.model_family or not is_vision_model_supported( + llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST + ): return MatchResult.failure( reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", error_type=ErrorType.MODEL_COMPATIBILITY, technical_details=f"Custom vision family: {llm_family.model_family}", ) else: - if not llm_family.model_name or not is_vision_model_supported(llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST): + if not llm_family.model_name or not is_vision_model_supported( + llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST + ): return MatchResult.failure( reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", error_type=ErrorType.MODEL_COMPATIBILITY, From 2aa43d7439da5146906c40a767c3ba03a03f10cb Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 12:55:19 +0800 Subject: [PATCH 19/37] mypy test --- xinference/model/utils.py | 75 ++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 158fd316c7..f6db71ee8a 100644 --- a/xinference/model/utils.py +++ 
b/xinference/model/utils.py @@ -542,19 +542,24 @@ def get_engine_params_by_name( pass if detailed_error: - engine_params[engine_name] = detailed_error + # Convert error dict to string format for consistency + error_parts = [detailed_error.get("error", "Unknown error")] + if detailed_error.get("error_type"): + error_parts.append(f"Type: {detailed_error['error_type']}") + if detailed_error.get("technical_details"): + error_parts.append( + f"Details: {detailed_error['technical_details']}" + ) + engine_params[engine_name] = " | ".join(error_parts) else: # Fallback to basic error checking for backward compatibility - error_msg = None + error_msg: Optional[str] = None for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available: bool = engine_class.check_lib() # type: ignore[assignment] if not lib_available: - error_msg = { - "error": f"Engine {engine_name} library is not available", - "error_type": "dependency_missing", - } + error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)" break else: # If no check_lib method, try import check @@ -575,30 +580,20 @@ def get_engine_params_by_name( importlib.import_module(module_name) break except ImportError as e: - error_msg = { - "error": f"Engine {engine_name} library is not installed: {str(e)}", - "error_type": "dependency_missing", - } + error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" except Exception as e: - error_msg = { - "error": f"Engine {engine_name} is not available: {str(e)}", - "error_type": "configuration_error", - } + error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" if error_msg is None: - error_msg = { - "error": f"Engine {engine_name} is not compatible with current model or environment", - "error_type": "model_compatibility", - } + error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" engine_params[engine_name] = error_msg except Exception as e: - # If exception occurs during checking, return structured error - engine_params[engine_name] = { - "error": f"Error checking engine {engine_name}: {str(e)}", - "error_type": "configuration_error", - } + # If exception occurs during checking, return structured error as string + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)} (Type: configuration_error)" + ) # Filter out llm_class field for engine, params in engine_params.items(): @@ -606,7 +601,7 @@ def get_engine_params_by_name( params, list ): # Only process parameter lists of available engines for param in params: - if "llm_class" in param: + if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] return engine_params @@ -638,7 +633,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] - error_msg = None + embedding_error_msg: Optional[str] = None # Try to find specific error reasons for embedding_engine_class in embedding_engine_classes: @@ -646,7 +641,7 @@ def get_engine_params_by_name( if hasattr(embedding_engine_class, "check_lib"): embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment] if not embedding_lib_available: - error_msg = ( + embedding_error_msg = ( f"Engine {engine_name} library is not available" ) break @@ -671,17 +666,17 @@ def get_engine_params_by_name( importlib.import_module(module_name) break 
except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" except Exception as e: - error_msg = ( + embedding_error_msg = ( f"Engine {engine_name} is not available: {str(e)}" ) - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment" + if embedding_error_msg is None: + embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = embedding_error_msg except Exception as e: # If exception occurs during checking, return error message string @@ -695,7 +690,7 @@ def get_engine_params_by_name( params, list ): # Only process parameter lists of available engines for param in params: - if "embedding_class" in param: + if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] return engine_params @@ -725,7 +720,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] - error_msg = None + rerank_error_msg: Optional[str] = None # Try to find specific error reasons for rerank_engine_class in rerank_engine_classes: @@ -733,7 +728,7 @@ def get_engine_params_by_name( if hasattr(rerank_engine_class, "check_lib"): rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment] if not rerank_lib_available: - error_msg = ( + rerank_error_msg = ( f"Engine {engine_name} library is not available" ) break @@ -758,17 +753,17 @@ def get_engine_params_by_name( importlib.import_module(module_name) break except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" except Exception as e: - error_msg = ( + rerank_error_msg = ( f"Engine {engine_name} is not available: {str(e)}" ) - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment" + if rerank_error_msg is None: + rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment" # For unavailable engines, directly return error message string - engine_params[engine_name] = error_msg + engine_params[engine_name] = rerank_error_msg except Exception as e: # If exception occurs during checking, return error message string @@ -782,7 +777,7 @@ def get_engine_params_by_name( params, list ): # Only process parameter lists of available engines for param in params: - if "rerank_class" in param: + if isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] return engine_params From 173e49410bdd6806a59ef6292e7d9d9b71b0f15d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 14:38:53 +0800 Subject: [PATCH 20/37] mypy test --- xinference/model/utils.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index f6db71ee8a..c34e03ef46 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -35,6 +35,7 @@ Tuple, Type, Union, + cast, ) import huggingface_hub @@ -543,14 +544,16 @@ def get_engine_params_by_name( if detailed_error: # Convert error dict to string format for consistency - error_parts = 
[detailed_error.get("error", "Unknown error")] - if detailed_error.get("error_type"): - error_parts.append(f"Type: {detailed_error['error_type']}") - if detailed_error.get("technical_details"): - error_parts.append( - f"Details: {detailed_error['technical_details']}" - ) - engine_params[engine_name] = " | ".join(error_parts) + error_parts = [detailed_error.get("error") or "Unknown error"] + error_type = detailed_error.get("error_type") + if error_type: + error_parts.append(f"Type: {error_type}") + technical_details = detailed_error.get("technical_details") + if technical_details: + error_parts.append(f"Details: {technical_details}") + # Filter out None values and join + error_parts_filtered = [part for part in error_parts if part is not None] + engine_params[engine_name] = " | ".join(error_parts_filtered) else: # Fallback to basic error checking for backward compatibility error_msg: Optional[str] = None @@ -600,7 +603,8 @@ def get_engine_params_by_name( if isinstance( params, list ): # Only process parameter lists of available engines - for param in params: + assert isinstance(params, list) + for param in params: # type: ignore if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] @@ -689,7 +693,8 @@ def get_engine_params_by_name( if isinstance( params, list ): # Only process parameter lists of available engines - for param in params: + assert isinstance(params, list) + for param in params: # type: ignore if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] @@ -776,7 +781,8 @@ def get_engine_params_by_name( if isinstance( params, list ): # Only process parameter lists of available engines - for param in params: + assert isinstance(params, list) + for param in params: # type: ignore if isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] From bc41700758bf5f10cbf7897a3d5c1c3ca7142dd9 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 14:40:43 +0800 Subject: [PATCH 21/37] mypy test --- xinference/model/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index c34e03ef46..3bd7cdb3c3 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -35,7 +35,6 @@ Tuple, Type, Union, - cast, ) import huggingface_hub From fc9b422eeaa3752c8bf07b0974558c2305986b80 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 14:41:54 +0800 Subject: [PATCH 22/37] mypy test --- xinference/model/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 3bd7cdb3c3..6e4a47dda0 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -551,7 +551,9 @@ def get_engine_params_by_name( if technical_details: error_parts.append(f"Details: {technical_details}") # Filter out None values and join - error_parts_filtered = [part for part in error_parts if part is not None] + error_parts_filtered = [ + part for part in error_parts if part is not None + ] engine_params[engine_name] = " | ".join(error_parts_filtered) else: # Fallback to basic error checking for backward compatibility From 5030b261cc9e57a4debd0ebb93339d7ec6421d29 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 16:44:04 +0800 Subject: [PATCH 23/37] mypy fix --- xinference/model/utils.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/xinference/model/utils.py 
b/xinference/model/utils.py index 6e4a47dda0..780602dec2 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -600,12 +600,9 @@ def get_engine_params_by_name( ) # Filter out llm_class field - for engine, params in engine_params.items(): - if isinstance( - params, list - ): # Only process parameter lists of available engines - assert isinstance(params, list) - for param in params: # type: ignore + for engine in engine_params.keys(): + if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] @@ -690,12 +687,9 @@ def get_engine_params_by_name( ) # Filter out embedding_class field - for engine, params in engine_params.items(): - if isinstance( - params, list - ): # Only process parameter lists of available engines - assert isinstance(params, list) - for param in params: # type: ignore + for engine in engine_params.keys(): + if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] @@ -778,12 +772,9 @@ def get_engine_params_by_name( ) # Filter out rerank_class field - for engine, params in engine_params.items(): - if isinstance( - params, list - ): # Only process parameter lists of available engines - assert isinstance(params, list) - for param in params: # type: ignore + for engine in engine_params.keys(): + if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] From cf517326630651f59e5873e1fa501a3a67dc2908 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 16:47:59 +0800 Subject: [PATCH 24/37] mypy fix --- xinference/model/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 780602dec2..c0c5233128 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -601,7 +601,9 @@ def get_engine_params_by_name( # Filter out llm_class field for engine in engine_params.keys(): - if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + if isinstance( + engine_params[engine], list + ): # Only process parameter lists of available engines for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "llm_class" in param: del param["llm_class"] @@ -688,7 +690,9 @@ def get_engine_params_by_name( # Filter out embedding_class field for engine in engine_params.keys(): - if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + if isinstance( + engine_params[engine], list + ): # Only process parameter lists of available engines for param in engine_params[engine]: # type: ignore if isinstance(param, dict) and "embedding_class" in param: del param["embedding_class"] @@ -773,7 +777,9 @@ def get_engine_params_by_name( # Filter out rerank_class field for engine in engine_params.keys(): - if isinstance(engine_params[engine], list): # Only process parameter lists of available engines + if isinstance( + engine_params[engine], list + ): # Only process parameter lists of available engines for param in engine_params[engine]: # type: ignore if 
isinstance(param, dict) and "rerank_class" in param: del param["rerank_class"] From 0660aaba3e420a332b7f3934e3a70a321f3452c6 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 17:54:22 +0800 Subject: [PATCH 25/37] mypy fix --- xinference/model/utils.py | 190 +++++++++++++++++++++++++++----------- 1 file changed, 137 insertions(+), 53 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index c0c5233128..96beec9618 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -14,7 +14,6 @@ import asyncio import functools -import importlib.util import json import logging import os @@ -566,22 +565,65 @@ def get_engine_params_by_name( error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)" break else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = "transformers" - - importlib.import_module(module_name) + # If no check_lib method, try to use engine's match method for compatibility check + # This provides more detailed and accurate error information + try: + # Create a minimal test spec if we don't have real model specs + from .llm.llm_family import ( + LLMFamilyV2, + PytorchLLMSpecV2, + ) + + # Create a minimal test case + test_family = LLMFamilyV2( + model_name="test", + model_family="test", + model_specs=[ + PytorchLLMSpecV2( + model_format="pytorch", + quantization="none", + ) + ], + ) + test_spec = test_family.model_specs[0] + + # Use the engine's match method if available + if hasattr( + engine_class, "match_json_with_reason" + ): + result = ( + engine_class.match_json_with_reason( + test_family, test_spec, "none" + ) + ) + if result.is_match: + break # Engine is available + else: + error_msg = f"Engine {engine_name}: {result.reason}" + if result.error_type: + error_msg += ( + f" (Type: {result.error_type})" + ) + break + elif hasattr(engine_class, "match_json"): + # Fallback to simple match method - use test data + if engine_class.match_json( + test_family, test_spec, "none" + ): + break + else: + error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" + break + else: + # Final fallback: generic import check + raise ImportError( + "No compatibility check method available" + ) + + except ImportError as e: + error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" + except Exception as e: + error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" break except ImportError as e: error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" @@ -650,24 +692,45 @@ def get_engine_params_by_name( ) break else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = 
"transformers" - elif engine_name == "SentenceTransformers": - module_name = "sentence_transformers" - - importlib.import_module(module_name) + # If no check_lib method, try to use engine's match method for compatibility check + try: + from .embedding.core import ( + EmbeddingModelFamilyV2, + TransformersEmbeddingSpecV1, + ) + + # Use the engine's match method if available + if hasattr(embedding_engine_class, "match"): + # Create a minimal test case + test_family = EmbeddingModelFamilyV2( + model_name="test", + model_specs=[ + TransformersEmbeddingSpecV1( + model_format="pytorch", + quantization="none", + ) + ], + ) + test_spec = test_family.model_specs[0] + + # Use the engine's match method to check compatibility + if embedding_engine_class.match( + test_family, test_spec, "none" + ): + break # Engine is available + else: + embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" + break + else: + # Final fallback: generic import check + raise ImportError( + "No compatibility check method available" + ) + + except ImportError as e: + embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + embedding_error_msg = f"Engine {engine_name} is not available: {str(e)}" break except ImportError as e: embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" @@ -737,24 +800,45 @@ def get_engine_params_by_name( ) break else: - # If no check_lib method, try import check - module_name = engine_name.lower().replace(".", "") - if engine_name == "vLLM": - module_name = "vllm" - elif engine_name == "SGLang": - module_name = "sglang" - elif engine_name == "llama.cpp": - module_name = "llama_cpp" - elif engine_name == "MLX": - module_name = "mlx" - elif engine_name == "LMDEPLOY": - module_name = "lmdeploy" - elif engine_name == "Transformers": - module_name = "transformers" - elif engine_name == "SentenceTransformers": - module_name = "sentence_transformers" - - importlib.import_module(module_name) + # If no check_lib method, try to use engine's match method for compatibility check + try: + from .rerank.core import ( + RerankModelFamilyV2, + RerankSpecV1, + ) + + # Use the engine's match method if available + if hasattr(rerank_engine_class, "match"): + # Create a minimal test case + test_family = RerankModelFamilyV2( + model_name="test", + model_specs=[ + RerankSpecV1( + model_format="pytorch", + quantization="none", + ) + ], + ) + test_spec = test_family.model_specs[0] + + # Use the engine's match method to check compatibility + if rerank_engine_class.match( + test_family, test_spec, "none" + ): + break # Engine is available + else: + rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment" + break + else: + # Final fallback: generic import check + raise ImportError( + "No compatibility check method available" + ) + + except ImportError as e: + rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + except Exception as e: + rerank_error_msg = f"Engine {engine_name} is not available: {str(e)}" break except ImportError as e: rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" From 996f3cdc0040312c6f8d6587dffeaa74c925f656 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 21 Oct 2025 18:17:59 +0800 Subject: [PATCH 26/37] mypy fix --- xinference/model/utils.py | 225 +++++++++++++++++++++++++------------- 1 file changed, 150 insertions(+), 75 deletions(-) diff --git 
a/xinference/model/utils.py b/xinference/model/utils.py index 96beec9618..0ed516085d 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -541,28 +541,19 @@ def get_engine_params_by_name( pass if detailed_error: - # Convert error dict to string format for consistency - error_parts = [detailed_error.get("error") or "Unknown error"] - error_type = detailed_error.get("error_type") - if error_type: - error_parts.append(f"Type: {error_type}") - technical_details = detailed_error.get("technical_details") - if technical_details: - error_parts.append(f"Details: {technical_details}") - # Filter out None values and join - error_parts_filtered = [ - part for part in error_parts if part is not None + # Convert error dict to array format with error, type, details fields + engine_params[engine_name] = [ + f"error: {detailed_error.get('error') or 'Unknown error'}", + f"type: {detailed_error.get('error_type') or 'unknown'}", + f"details: {detailed_error.get('technical_details') or 'No additional details available'}", ] - engine_params[engine_name] = " | ".join(error_parts_filtered) else: # Fallback to basic error checking for backward compatibility - error_msg: Optional[str] = None for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): lib_available: bool = engine_class.check_lib() # type: ignore[assignment] if not lib_available: - error_msg = f"Engine {engine_name} library is not available (Type: dependency_missing)" break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -599,11 +590,12 @@ def get_engine_params_by_name( if result.is_match: break # Engine is available else: - error_msg = f"Engine {engine_name}: {result.reason}" - if result.error_type: - error_msg += ( - f" (Type: {result.error_type})" - ) + # Create array format for match method errors + engine_params[engine_name] = [ + f"error: Engine {engine_name}: {result.reason}", + f"type: {result.error_type or 'model_compatibility'}", + f"details: Engine {engine_name} compatibility check failed: {result.reason}", + ] break elif hasattr(engine_class, "match_json"): # Fallback to simple match method - use test data @@ -612,7 +604,6 @@ def get_engine_params_by_name( ): break else: - error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" break else: # Final fallback: generic import check @@ -621,25 +612,49 @@ def get_engine_params_by_name( ) except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" + engine_params[engine_name] = [ + f"error: Engine {engine_name} library is not installed: {str(e)}", + f"type: dependency_missing", + f"details: Missing required dependency for {engine_name} engine: {str(e)}", + ] + break except Exception as e: - error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" - break + engine_params[engine_name] = [ + f"error: Engine {engine_name} is not available: {str(e)}", + f"type: configuration_error", + f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + ] + break except ImportError as e: - error_msg = f"Engine {engine_name} library is not installed: {str(e)} (Type: dependency_missing)" + engine_params[engine_name] = [ + f"error: Engine {engine_name} library is not installed: {str(e)}", + f"type: dependency_missing", + f"details: Missing required dependency for {engine_name} engine: {str(e)}", + ] + break except Exception 
as e: - error_msg = f"Engine {engine_name} is not available: {str(e)} (Type: configuration_error)" - - if error_msg is None: - error_msg = f"Engine {engine_name} is not compatible with current model or environment (Type: model_compatibility)" + engine_params[engine_name] = [ + f"error: Engine {engine_name} is not available: {str(e)}", + f"type: configuration_error", + f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + ] + break - engine_params[engine_name] = error_msg + # Only set default error if not already set by one of the exception handlers + if engine_name not in engine_params: + engine_params[engine_name] = [ + f"error: Engine {engine_name} is not compatible with current model or environment", + f"type: model_compatibility", + f"details: The {engine_name} engine cannot handle the current model configuration", + ] except Exception as e: - # If exception occurs during checking, return structured error as string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)} (Type: configuration_error)" - ) + # If exception occurs during checking, return structured error as array + engine_params[engine_name] = [ + f"error: Error checking engine {engine_name}: {str(e)}", + f"type: configuration_error", + f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", + ] # Filter out llm_class field for engine in engine_params.keys(): @@ -679,7 +694,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: embedding_engine_classes = EMBEDDING_SUPPORTED_ENGINES[engine_name] - embedding_error_msg: Optional[str] = None + embedding_error_details: Optional[Dict[str, str]] = None # Try to find specific error reasons for embedding_engine_class in embedding_engine_classes: @@ -687,9 +702,11 @@ def get_engine_params_by_name( if hasattr(embedding_engine_class, "check_lib"): embedding_lib_available: bool = embedding_engine_class.check_lib() # type: ignore[assignment] if not embedding_lib_available: - embedding_error_msg = ( - f"Engine {engine_name} library is not available" - ) + embedding_error_details = { + "error": f"Engine {engine_name} library is not available", + "error_type": "dependency_missing", + "technical_details": f"The required library for {engine_name} engine is not installed or not accessible", + } break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -719,7 +736,11 @@ def get_engine_params_by_name( ): break # Engine is available else: - embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" + embedding_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", + } break else: # Final fallback: generic import check @@ -728,28 +749,52 @@ def get_engine_params_by_name( ) except ImportError as e: - embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + embedding_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - embedding_error_msg = f"Engine {engine_name} is not available: {str(e)}" + embedding_error_details = { + "error": f"Engine 
{engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } break except ImportError as e: - embedding_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + embedding_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - embedding_error_msg = ( - f"Engine {engine_name} is not available: {str(e)}" - ) - - if embedding_error_msg is None: - embedding_error_msg = f"Engine {engine_name} is not compatible with current model or environment" - - # For unavailable engines, directly return error message string - engine_params[engine_name] = embedding_error_msg + embedding_error_details = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } + + if embedding_error_details is None: + embedding_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", + } + + # For unavailable engines, format error message as array like LLM + engine_params[engine_name] = [ + f"error: {embedding_error_details.get('error') or 'Unknown error'}", + f"type: {embedding_error_details.get('error_type') or 'unknown'}", + f"details: {embedding_error_details.get('technical_details') or 'No additional details available'}", + ] except Exception as e: - # If exception occurs during checking, return error message string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)}" - ) + # If exception occurs during checking, return structured error as array like LLM + engine_params[engine_name] = [ + f"error: Error checking engine {engine_name}: {str(e)}", + f"type: configuration_error", + f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", + ] # Filter out embedding_class field for engine in engine_params.keys(): @@ -787,7 +832,7 @@ def get_engine_params_by_name( if engine_name not in engine_params: # Engine not in available list try: rerank_engine_classes = RERANK_SUPPORTED_ENGINES[engine_name] - rerank_error_msg: Optional[str] = None + rerank_error_details: Optional[Dict[str, str]] = None # Try to find specific error reasons for rerank_engine_class in rerank_engine_classes: @@ -795,9 +840,11 @@ def get_engine_params_by_name( if hasattr(rerank_engine_class, "check_lib"): rerank_lib_available: bool = rerank_engine_class.check_lib() # type: ignore[assignment] if not rerank_lib_available: - rerank_error_msg = ( - f"Engine {engine_name} library is not available" - ) + rerank_error_details = { + "error": f"Engine {engine_name} library is not available", + "error_type": "dependency_missing", + "technical_details": f"The required library for {engine_name} engine is not installed or not accessible", + } break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -827,7 +874,11 @@ def get_engine_params_by_name( ): break # Engine is available else: - rerank_error_msg = f"Engine {engine_name} is not compatible with current model 
or environment" + rerank_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", + } break else: # Final fallback: generic import check @@ -836,28 +887,52 @@ def get_engine_params_by_name( ) except ImportError as e: - rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + rerank_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - rerank_error_msg = f"Engine {engine_name} is not available: {str(e)}" + rerank_error_details = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } break except ImportError as e: - rerank_error_msg = f"Engine {engine_name} library is not installed: {str(e)}" + rerank_error_details = { + "error": f"Engine {engine_name} library is not installed: {str(e)}", + "error_type": "dependency_missing", + "technical_details": f"Missing required dependency for {engine_name} engine: {str(e)}", + } except Exception as e: - rerank_error_msg = ( - f"Engine {engine_name} is not available: {str(e)}" - ) - - if rerank_error_msg is None: - rerank_error_msg = f"Engine {engine_name} is not compatible with current model or environment" - - # For unavailable engines, directly return error message string - engine_params[engine_name] = rerank_error_msg + rerank_error_details = { + "error": f"Engine {engine_name} is not available: {str(e)}", + "error_type": "configuration_error", + "technical_details": f"Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", + } + + if rerank_error_details is None: + rerank_error_details = { + "error": f"Engine {engine_name} is not compatible with current model or environment", + "error_type": "model_compatibility", + "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", + } + + # For unavailable engines, format error message as array like LLM + engine_params[engine_name] = [ + f"error: {rerank_error_details.get('error') or 'Unknown error'}", + f"type: {rerank_error_details.get('error_type') or 'unknown'}", + f"details: {rerank_error_details.get('technical_details') or 'No additional details available'}", + ] except Exception as e: - # If exception occurs during checking, return error message string - engine_params[engine_name] = ( - f"Error checking engine {engine_name}: {str(e)}" - ) + # If exception occurs during checking, return structured error as array like LLM + engine_params[engine_name] = [ + f"error: Error checking engine {engine_name}: {str(e)}", + f"type: configuration_error", + f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", + ] # Filter out rerank_class field for engine in engine_params.keys(): From 41b0735eec9c293dabea4d8c6965c8c736b51e09 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 10:02:34 +0800 Subject: [PATCH 27/37] mypy fix --- xinference/model/utils.py | 111 +++++++++++++++----------------------- 1 file changed, 44 insertions(+), 67 deletions(-) diff --git a/xinference/model/utils.py 
b/xinference/model/utils.py index 0ed516085d..146f145513 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -541,12 +541,10 @@ def get_engine_params_by_name( pass if detailed_error: - # Convert error dict to array format with error, type, details fields - engine_params[engine_name] = [ - f"error: {detailed_error.get('error') or 'Unknown error'}", - f"type: {detailed_error.get('error_type') or 'unknown'}", - f"details: {detailed_error.get('technical_details') or 'No additional details available'}", - ] + # Return only the error message without engine_name prefix (key already contains engine name) + engine_params[engine_name] = ( + detailed_error.get("error") or "Unknown error" + ) else: # Fallback to basic error checking for backward compatibility for engine_class in llm_engine_classes: @@ -590,12 +588,11 @@ def get_engine_params_by_name( if result.is_match: break # Engine is available else: - # Create array format for match method errors - engine_params[engine_name] = [ - f"error: Engine {engine_name}: {result.reason}", - f"type: {result.error_type or 'model_compatibility'}", - f"details: Engine {engine_name} compatibility check failed: {result.reason}", - ] + # Return only the error message without engine_name prefix (key already contains engine name) + engine_params[engine_name] = ( + result.reason + or "Unknown compatibility error" + ) break elif hasattr(engine_class, "match_json"): # Fallback to simple match method - use test data @@ -612,49 +609,37 @@ def get_engine_params_by_name( ) except ImportError as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} library is not installed: {str(e)}", - f"type: dependency_missing", - f"details: Missing required dependency for {engine_name} engine: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} library is not installed: {str(e)}" + ) break except Exception as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} is not available: {str(e)}", - f"type: configuration_error", - f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} is not available: {str(e)}" + ) break except ImportError as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} library is not installed: {str(e)}", - f"type: dependency_missing", - f"details: Missing required dependency for {engine_name} engine: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} library is not installed: {str(e)}" + ) break except Exception as e: - engine_params[engine_name] = [ - f"error: Engine {engine_name} is not available: {str(e)}", - f"type: configuration_error", - f"details: Configuration or environment issue preventing {engine_name} engine from working: {str(e)}", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} is not available: {str(e)}" + ) break # Only set default error if not already set by one of the exception handlers if engine_name not in engine_params: - engine_params[engine_name] = [ - f"error: Engine {engine_name} is not compatible with current model or environment", - f"type: model_compatibility", - f"details: The {engine_name} engine cannot handle the current model configuration", - ] + engine_params[engine_name] = ( + f"Engine {engine_name} is not compatible with current model or environment" + ) except Exception as e: - # If exception occurs during checking, return structured error as array - engine_params[engine_name] = [ - f"error: Error 
checking engine {engine_name}: {str(e)}", - f"type: configuration_error", - f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", - ] + # If exception occurs during checking, return simple string format + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) # Filter out llm_class field for engine in engine_params.keys(): @@ -781,20 +766,16 @@ def get_engine_params_by_name( "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", } - # For unavailable engines, format error message as array like LLM - engine_params[engine_name] = [ - f"error: {embedding_error_details.get('error') or 'Unknown error'}", - f"type: {embedding_error_details.get('error_type') or 'unknown'}", - f"details: {embedding_error_details.get('technical_details') or 'No additional details available'}", - ] + # For unavailable engines, return simple string format + engine_params[engine_name] = ( + embedding_error_details.get("error") or "Unknown error" + ) except Exception as e: - # If exception occurs during checking, return structured error as array like LLM - engine_params[engine_name] = [ - f"error: Error checking engine {engine_name}: {str(e)}", - f"type: configuration_error", - f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", - ] + # If exception occurs during checking, return simple string format + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) # Filter out embedding_class field for engine in engine_params.keys(): @@ -919,20 +900,16 @@ def get_engine_params_by_name( "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", } - # For unavailable engines, format error message as array like LLM - engine_params[engine_name] = [ - f"error: {rerank_error_details.get('error') or 'Unknown error'}", - f"type: {rerank_error_details.get('error_type') or 'unknown'}", - f"details: {rerank_error_details.get('technical_details') or 'No additional details available'}", - ] + # For unavailable engines, return simple string format + engine_params[engine_name] = ( + rerank_error_details.get("error") or "Unknown error" + ) except Exception as e: - # If exception occurs during checking, return structured error as array like LLM - engine_params[engine_name] = [ - f"error: Error checking engine {engine_name}: {str(e)}", - f"type: configuration_error", - f"details: An unexpected error occurred while checking {engine_name} engine availability: {str(e)}", - ] + # If exception occurs during checking, return simple string format + engine_params[engine_name] = ( + f"Error checking engine {engine_name}: {str(e)}" + ) # Filter out rerank_class field for engine in engine_params.keys(): From c760a589e971d5db5dbd97582010a5736e633c55 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 16:09:06 +0800 Subject: [PATCH 28/37] Modify class name --- xinference/model/embedding/core.py | 2 +- xinference/model/embedding/llama_cpp/core.py | 4 ++-- .../embedding/sentence_transformers/core.py | 4 ++-- xinference/model/llm/core.py | 2 +- xinference/model/llm/llama_cpp/core.py | 4 ++-- xinference/model/llm/lmdeploy/core.py | 8 ++++---- xinference/model/llm/mlx/core.py | 14 +++++++------- xinference/model/llm/sglang/core.py | 16 ++++++++-------- xinference/model/llm/transformers/core.py | 4 ++-- xinference/model/llm/vllm/core.py | 16 ++++++++-------- 
xinference/model/rerank/core.py | 2 +- .../model/rerank/sentence_transformers/core.py | 4 ++-- xinference/model/rerank/vllm/core.py | 4 ++-- xinference/model/utils.py | 10 +++++----- 14 files changed, 47 insertions(+), 47 deletions(-) diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index 6f934b6e5f..b68e5236ca 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -173,7 +173,7 @@ def match_json( pass @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index 4b3d6ed125..d84434384f 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -237,11 +237,11 @@ def match_json( quantization: str, ) -> bool: - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 29bcb66a33..c1789f9912 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -436,11 +436,11 @@ def match_json( quantization: str, ) -> bool: - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index 2626060579..3020483219 100644 --- a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -161,7 +161,7 @@ def match_json( raise NotImplementedError @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": """ diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 386f8eb662..e8ff96f83b 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -88,11 +88,11 @@ def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index f1c2605a24..90115dec06 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -123,11 +123,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def 
match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -190,11 +190,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index 943dddd7c4..ff6b2e51ea 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -413,11 +413,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -773,17 +773,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result @@ -857,11 +857,11 @@ def check_lib(cls) -> bool: def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index 7095289a5d..d22a157777 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -344,11 +344,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -729,17 +729,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, 
quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result @@ -860,17 +860,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 8fae36576d..5a4a9f557d 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -502,11 +502,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index bf9f07b813..bc0eede4c0 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -883,11 +883,11 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult @@ -1461,17 +1461,17 @@ def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": from ..match_result import ErrorType, MatchResult # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result @@ -1739,16 +1739,16 @@ def match_json( cls, llm_family: "LLMFamilyV2", 
llm_spec: "LLMSpecV1", quantization: str ) -> bool: - result = cls.match_json_with_reason(llm_family, llm_spec, quantization) + result = cls.match_with_reason(llm_family, llm_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> "MatchResult": # Use base class validation first - base_result = super().match_json_with_reason(llm_family, llm_spec, quantization) + base_result = super().match_with_reason(llm_family, llm_spec, quantization) if not base_result.is_match: return base_result diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index c02b230abd..2d3edde1c2 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -133,7 +133,7 @@ def match_json( pass @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index a21d4f106a..42332bc477 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -344,11 +344,11 @@ def match_json( ) -> bool: pass - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index 339106f408..c2ee75cfef 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -151,11 +151,11 @@ def match_json( quantization: str, ) -> bool: - result = cls.match_json_with_reason(model_family, model_spec, quantization) + result = cls.match_with_reason(model_family, model_spec, quantization) return result.is_match @classmethod - def match_json_with_reason( + def match_with_reason( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 146f145513..e27c93d851 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -499,7 +499,7 @@ def get_engine_params_by_name( try: llm_engine_classes = SUPPORTED_ENGINES[engine_name] - # Try to get detailed error information from engine's match_json_with_reason + # Try to get detailed error information from engine's match_with_reason detailed_error = None # We need a sample model to test against, use the first available spec @@ -517,12 +517,12 @@ def get_engine_params_by_name( for engine_class in llm_engine_classes: try: if hasattr( - engine_class, "match_json_with_reason" + engine_class, "match_with_reason" ): pass result = ( - engine_class.match_json_with_reason( + engine_class.match_with_reason( llm_family, llm_spec, quantization ) ) @@ -578,10 +578,10 @@ def get_engine_params_by_name( # Use the engine's match method if available if hasattr( - engine_class, "match_json_with_reason" + engine_class, "match_with_reason" ): result = ( - engine_class.match_json_with_reason( + engine_class.match_with_reason( test_family, test_spec, "none" ) ) From 6615014c8faae1821c90b0862339672ee215ca9a Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 16:12:15 +0800 Subject: [PATCH 29/37] Modify class name 
--- xinference/model/utils.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index e27c93d851..377259af77 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -516,15 +516,11 @@ def get_engine_params_by_name( # Test each engine class for detailed error info for engine_class in llm_engine_classes: try: - if hasattr( - engine_class, "match_with_reason" - ): + if hasattr(engine_class, "match_with_reason"): pass - result = ( - engine_class.match_with_reason( - llm_family, llm_spec, quantization - ) + result = engine_class.match_with_reason( + llm_family, llm_spec, quantization ) if not result.is_match: detailed_error = { @@ -577,13 +573,9 @@ def get_engine_params_by_name( test_spec = test_family.model_specs[0] # Use the engine's match method if available - if hasattr( - engine_class, "match_with_reason" - ): - result = ( - engine_class.match_with_reason( - test_family, test_spec, "none" - ) + if hasattr(engine_class, "match_with_reason"): + result = engine_class.match_with_reason( + test_family, test_spec, "none" ) if result.is_match: break # Engine is available From 2105c83392399ae5eb800bb6f00a19422e81d25d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 16:44:35 +0800 Subject: [PATCH 30/37] commit --- xinference/model/utils.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 377259af77..ea7adb309e 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -529,12 +529,21 @@ def get_engine_params_by_name( "technical_details": result.technical_details, } break - except Exception: - # Fall back to next engine class + except Exception as e: + # Fall back to next engine class with clear error logging + logger.warning( + f"Engine class {engine_class.__name__} match_with_reason failed: {e}" + ) + # Continue to try next engine class, but this is expected behavior for fallback continue - except Exception: - # If we can't get model family, continue with basic checking - pass + except Exception as e: + # If we can't get model family, fail with clear error + logger.error( + f"Failed to get model family for {model_name} (LLM): {e}" + ) + raise RuntimeError( + f"Unable to process LLM model {model_name}: {e}" + ) if detailed_error: # Return only the error message without engine_name prefix (key already contains engine name) From eb1bb43dc6358228ff7462f002ff1d62348eda56 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 14:21:15 +0800 Subject: [PATCH 31/37] new engine ability display --- xinference/model/embedding/core.py | 53 +-- xinference/model/embedding/flag/core.py | 17 +- xinference/model/embedding/llama_cpp/core.py | 59 +-- .../embedding/sentence_transformers/core.py | 70 +--- xinference/model/embedding/vllm/core.py | 17 +- xinference/model/llm/core.py | 48 +-- xinference/model/llm/llama_cpp/core.py | 68 +-- xinference/model/llm/lmdeploy/core.py | 70 +--- xinference/model/llm/mlx/core.py | 165 ++------ xinference/model/llm/sglang/core.py | 387 ++++++++++-------- xinference/model/llm/transformers/core.py | 74 +--- xinference/model/llm/vllm/core.py | 274 +++---------- xinference/model/rerank/core.py | 55 +-- .../rerank/sentence_transformers/core.py | 68 +-- xinference/model/rerank/vllm/core.py | 67 +-- xinference/model/utils.py | 165 ++++++-- 16 files changed, 591 insertions(+), 1066 deletions(-) diff 
--git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py index b68e5236ca..42f39049f6 100644 --- a/xinference/model/embedding/core.py +++ b/xinference/model/embedding/core.py @@ -25,7 +25,6 @@ from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin from .embed_family import match_embedding -from .match_result import MatchResult logger = logging.getLogger(__name__) @@ -159,7 +158,7 @@ def __init__( @classmethod @abstractmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: pass @classmethod @@ -169,62 +168,24 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: pass - @classmethod - def match_with_reason( - cls, - model_family: EmbeddingModelFamilyV2, - model_spec: EmbeddingSpecV1, - quantization: str, - ) -> "MatchResult": - """ - Check if the engine can handle the given embedding model with detailed error information. - - This method provides detailed failure reasons and suggestions when an engine - cannot handle a specific model configuration. The default implementation - falls back to the boolean match_json method for backward compatibility. - - Args: - model_family: The embedding model family information - model_spec: The model specification - quantization: The quantization method - - Returns: - MatchResult: Detailed match result with reasons and suggestions - """ - from .match_result import ErrorType, MatchResult - - # Default implementation for backward compatibility - if cls.match_json(model_family, model_spec, quantization): - return MatchResult.success() - else: - # Get basic reason based on common failure patterns - if not cls.check_lib(): - return MatchResult.failure( - reason=f"Required library for {cls.__name__} is not available", - error_type=ErrorType.DEPENDENCY_MISSING, - ) - else: - return MatchResult.failure( - reason=f"Embedding model configuration is not compatible with {cls.__name__}", - error_type=ErrorType.MODEL_COMPATIBILITY, - ) - @classmethod def match( cls, model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ): + ) -> bool: """ Return if the model_spec can be matched. 
""" - if not cls.check_lib(): + lib_result = cls.check_lib() + if lib_result != True: return False - return cls.match_json(model_family, model_spec, quantization) + match_result = cls.match_json(model_family, model_spec, quantization) + return match_result == True @abstractmethod def load(self): diff --git a/xinference/model/embedding/flag/core.py b/xinference/model/embedding/flag/core.py index a53036449e..174a860d91 100644 --- a/xinference/model/embedding/flag/core.py +++ b/xinference/model/embedding/flag/core.py @@ -285,8 +285,12 @@ def encode( return result @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("FlagEmbedding") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("FlagEmbedding") is not None + else "FlagEmbedding library is not installed" + ) @classmethod def match_json( @@ -294,10 +298,15 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result + if ( model_spec.model_format in ["pytorch"] and model_family.model_name in FLAG_EMBEDDER_MODEL_LIST ): return True - return False + return f"FlagEmbedding engine only supports pytorch format and models in FLAG_EMBEDDER_MODEL_LIST, got format: {model_spec.model_format}, model: {model_family.model_name}" diff --git a/xinference/model/embedding/llama_cpp/core.py b/xinference/model/embedding/llama_cpp/core.py index d84434384f..a8e68f450b 100644 --- a/xinference/model/embedding/llama_cpp/core.py +++ b/xinference/model/embedding/llama_cpp/core.py @@ -26,7 +26,6 @@ from ....types import Embedding from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 -from ..match_result import MatchResult logger = logging.getLogger(__name__) @@ -226,8 +225,12 @@ def _handle_embedding(): return Embedding(**r) # type: ignore @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("xllamacpp") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("xllamacpp") is not None + else "xllamacpp library is not installed" + ) @classmethod def match_json( @@ -235,52 +238,24 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: EmbeddingModelFamilyV2, - model_spec: EmbeddingSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="llama.cpp library (xllamacpp) is not installed for embedding", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="xllamacpp package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["ggufv2"]: - return MatchResult.failure( - reason=f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {model_spec.model_format}, required: ggufv2", - ) + return f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}" # Check 
embedding-specific requirements if not hasattr(model_spec, "model_file_name_template"): - return MatchResult.failure( - reason="GGUF embedding model requires proper file configuration", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing model_file_name_template for GGUF embedding", - ) + return "GGUF embedding model requires proper file configuration (missing model_file_name_template)" # Check model dimensions for llama.cpp compatibility model_dimensions = model_family.dimensions if model_dimensions > 4096: # llama.cpp may have limitations - return MatchResult.failure( - reason=f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large embedding dimensions: {model_dimensions}", - ) + return f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)" # Check platform-specific considerations import platform @@ -289,10 +264,6 @@ def match_with_reason( # llama.cpp works across platforms but may have performance differences if current_platform == "Windows": - return MatchResult.failure( - reason="llama.cpp embedding may have limited performance on Windows", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Windows platform: {current_platform}", - ) + return "llama.cpp embedding may have limited performance on Windows" - return MatchResult.success() + return True diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index c1789f9912..4e1c7b8b73 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -22,7 +22,6 @@ from ....types import Embedding, EmbeddingData, EmbeddingUsage from ...utils import is_flash_attn_available from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1 -from ..match_result import MatchResult logger = logging.getLogger(__name__) SENTENCE_TRANSFORMER_MODEL_LIST: List[str] = [] @@ -425,8 +424,12 @@ def base64_to_image(base64_str: str) -> Image.Image: return result @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("sentence_transformers") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("sentence_transformers") is not None + else "sentence_transformers library is not installed" + ) @classmethod def match_json( @@ -434,53 +437,25 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: EmbeddingModelFamilyV2, - model_spec: EmbeddingSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="Sentence Transformers library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="sentence_transformers package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["pytorch"]: - return MatchResult.failure( - reason=f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}", - 
error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", - ) + return f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}" # Check model dimensions compatibility model_dimensions = model_family.dimensions if model_dimensions > 1536: # Very large embedding models - return MatchResult.failure( - reason=f"Large embedding model detected ({model_dimensions} dimensions)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large embedding dimensions: {model_dimensions}", - ) + return f"Large embedding model detected ({model_dimensions} dimensions), may have performance issues" # Check token limits max_tokens = model_family.max_tokens if max_tokens > 8192: # Very high token limits - return MatchResult.failure( - reason=f"High token limit model detected (max_tokens: {max_tokens})", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"High max_tokens: {max_tokens}", - ) + return f"High token limit model detected (max_tokens: {max_tokens}), may cause memory issues" # Check for special model requirements model_name = model_family.model_name.lower() @@ -489,23 +464,16 @@ def match_with_reason( if "gte" in model_name and "qwen2" in model_name: # These models have specific requirements if not hasattr(cls, "_check_qwen_gte_requirements"): - return MatchResult.failure( - reason="Qwen2 GTE models require special handling", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details="Qwen2 GTE model special requirements", - ) + return "Qwen2 GTE models require special handling" # Check Qwen3 models if "qwen3" in model_name: - # Qwen3 has flash attention requirements + # Qwen3 has flash attention requirements - basic check try: - # This would be checked during actual loading pass + + # This would be checked during actual loading except Exception: - return MatchResult.failure( - reason="Qwen3 embedding model may have compatibility issues", - error_type=ErrorType.VERSION_REQUIREMENT, - technical_details="Qwen3 model compatibility check", - ) + return "Qwen3 embedding model may have compatibility issues" - return MatchResult.success() + return True diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py index 8905d36297..8fc32ebac8 100644 --- a/xinference/model/embedding/vllm/core.py +++ b/xinference/model/embedding/vllm/core.py @@ -149,8 +149,12 @@ def create_embedding( return result @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("vllm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("vllm") is not None + else "vllm library is not installed" + ) @classmethod def match_json( @@ -158,12 +162,17 @@ def match_json( model_family: EmbeddingModelFamilyV2, model_spec: EmbeddingSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result + if model_spec.model_format in ["pytorch"]: prefix = model_family.model_name.split("-", 1)[0] if prefix in SUPPORTED_MODELS_PREFIXES: return True - return False + return f"VLLM Embedding engine only supports pytorch format models with supported prefixes, got format: {model_spec.model_format}, model: {model_family.model_name}" def wait_for_load(self): # set context length after engine inited diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py index 3020483219..5942a42879 100644 --- 
a/xinference/model/llm/core.py +++ b/xinference/model/llm/core.py @@ -31,7 +31,6 @@ if TYPE_CHECKING: from .llm_family import LLMFamilyV2, LLMSpecV1 - from .match_result import MatchResult logger = logging.getLogger(__name__) @@ -71,7 +70,7 @@ def __init__( @classmethod @abstractmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: raise NotImplementedError @staticmethod @@ -149,54 +148,19 @@ def load(self): def match( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> bool: - if not cls.check_lib(): + lib_result = cls.check_lib() + if lib_result != True: return False - return cls.match_json(llm_family, llm_spec, quantization) + match_result = cls.match_json(llm_family, llm_spec, quantization) + return match_result == True @classmethod @abstractmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: + ) -> Union[bool, str]: raise NotImplementedError - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - """ - Check if the engine can handle the given model with detailed error information. - - This method provides detailed failure reasons and suggestions when an engine - cannot handle a specific model configuration. The default implementation - falls back to the boolean match_json method for backward compatibility. - - Args: - llm_family: The model family information - llm_spec: The model specification - quantization: The quantization method - - Returns: - MatchResult: Detailed match result with reasons and suggestions - """ - from .match_result import ErrorType, MatchResult - - # Default implementation for backward compatibility - if cls.match_json(llm_family, llm_spec, quantization): - return MatchResult.success() - else: - # Get basic reason based on common failure patterns - if not cls.check_lib(): - return MatchResult.failure( - reason=f"Required library for {cls.__name__} is not available", - error_type=ErrorType.DEPENDENCY_MISSING, - ) - else: - return MatchResult.failure( - reason=f"Model configuration is not compatible with {cls.__name__}", - error_type=ErrorType.MODEL_COMPATIBILITY, - ) - def prepare_parse_reasoning_content( self, reasoning_content: bool, enable_thinking: bool = True ): diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index e8ff96f83b..5d379e642d 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -25,7 +25,6 @@ from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ChatModelMixin logger = logging.getLogger(__name__) @@ -80,73 +79,34 @@ def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict: return llamacpp_model_config @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("xllamacpp") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("xllamacpp") is not None + else "xllamacpp library is not installed" + ) @classmethod def match_json( cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, 
quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="llama.cpp library (xllamacpp) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="xllamacpp package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if llm_spec.model_format not in ["ggufv2"]: - return MatchResult.failure( - reason=f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}, required: ggufv2", + return ( + f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}" ) - # Check model abilities - llama.cpp supports both chat and generation - if ( - "chat" not in llm_family.model_ability - and "generate" not in llm_family.model_ability - ): - return MatchResult.failure( - reason=f"llama.cpp requires 'chat' or 'generate' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Check platform-specific issues - import platform - - current_platform = platform.system() - - # Check for ARM64 specific issues - if current_platform == "Darwin" and platform.machine() == "arm64": - # Apple Silicon specific checks could go here - pass - elif current_platform == "Windows": - # Windows specific checks could go here - pass - # Check memory requirements (basic heuristic) model_size = float(str(llm_spec.model_size_in_billions)) if model_size > 70: # Very large models - return MatchResult.failure( - reason=f"llama.cpp may struggle with very large models ({model_size}B parameters)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large model size: {model_size}B parameters", - ) + return f"llama.cpp may struggle with very large models ({model_size}B parameters)" - return MatchResult.success() + return True def load(self): try: diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py index 90115dec06..9689c3ddce 100644 --- a/xinference/model/llm/lmdeploy/core.py +++ b/xinference/model/llm/lmdeploy/core.py @@ -21,7 +21,6 @@ from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA from ..core import LLM from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk logger = logging.getLogger(__name__) @@ -115,28 +114,18 @@ def load(self): raise ValueError("LMDEPLOY engine has not supported generate yet.") @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("lmdeploy") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("lmdeploy") is not None + else "lmdeploy library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - return MatchResult.failure( - reason="LMDeploy base model does not support direct 
inference", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details="LMDeploy base model class is not intended for direct use", - ) + ) -> Union[bool, str]: + return "LMDeploy base model does not support direct inference, use specific LMDeploy model classes" def generate( self, @@ -188,52 +177,23 @@ def load(self): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability first - if not LMDEPLOY_INSTALLED: - return MatchResult.failure( - reason="LMDeploy library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="lmdeploy package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility and quantization if llm_spec.model_format == "awq": # LMDeploy has specific AWQ quantization requirements if "4" not in quantization: - return MatchResult.failure( - reason=f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"AWQ + {quantization} not supported by LMDeploy", - ) + return f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}" # Check model compatibility if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS: - return MatchResult.failure( - reason=f"Chat model not supported by LMDeploy: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported chat model: {llm_family.model_name}", - ) - - # Check model abilities - LMDeploy primarily supports chat models - if "chat" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"LMDeploy Chat requires 'chat' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) + return f"Chat model not supported by LMDeploy: {llm_family.model_name}" - return MatchResult.success() + return True async def async_chat( self, diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index ff6b2e51ea..ab8f1608db 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -18,7 +18,6 @@ import importlib.util import logging import pathlib -import platform import sys import threading import time @@ -51,7 +50,6 @@ ) from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -405,73 +403,32 @@ def wait_for_load(self): self._context_length = get_context_length(config) @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("mlx_lm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("mlx_lm") is not None + else "mlx_lm library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: 
"LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Check platform compatibility first - MLX only works on Apple Silicon - if sys.platform != "darwin" or platform.processor() != "arm": - return MatchResult.failure( - reason="MLX engine only works on Apple Silicon Macs (macOS with ARM processor)", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", - ) - - # Check library availability (only if platform is compatible) - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX library (mlx_lm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_lm package not found in Python environment", - ) + ) -> Union[bool, str]: + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if llm_spec.model_format not in ["mlx"]: - return MatchResult.failure( - reason=f"MLX engine only supports MLX format, got: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", - ) - - # Check model abilities - MLX supports generation but not chat/vision in this base class - if "generate" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"MLX engine requires 'generate' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # MLX base model doesn't support chat or vision - if "chat" in llm_family.model_ability or "vision" in llm_family.model_ability: - return MatchResult.failure( - reason="MLX base model does not support chat or vision abilities", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Unsupported abilities for base MLX: {[a for a in llm_family.model_ability if a in ['chat', 'vision']]}", - ) + return f"MLX engine only supports MLX format, got: {llm_spec.model_format}" # Check memory constraints for Apple Silicon model_size = float(str(llm_spec.model_size_in_billions)) if model_size > 70: # Large models may be problematic - return MatchResult.failure( - reason=f"MLX may have memory limitations with very large models ({model_size}B parameters)", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Large model size: {model_size}B on Apple Silicon", - ) + return f"MLX may have memory limitations with very large models ({model_size}B parameters)" - return MatchResult.success() + return True def _get_prompt_cache( self, prompt, lora_name: Optional[str] = None, model: Any = None @@ -771,39 +728,13 @@ def _sanitize_generate_config( @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result - # Check chat 
ability - if "chat" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"MLX Chat requires 'chat' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # MLX Chat doesn't support vision - if "vision" in llm_family.model_ability: - return MatchResult.failure( - reason="MLX Chat model does not support vision abilities", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Vision ability not supported in MLXChatModel", - ) - - return MatchResult.success() + return True def chat( self, @@ -850,59 +781,27 @@ def chat( class MLXVisionModel(MLXModel, ChatModelMixin): @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("mlx_vlm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("mlx_vlm") is not None + else "mlx_vlm library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Check platform compatibility first - MLX only works on Apple Silicon - if sys.platform != "darwin" or platform.processor() != "arm": - return MatchResult.failure( - reason="MLX Vision engine only works on Apple Silicon Macs (macOS with ARM processor)", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current platform: {sys.platform}, processor: {platform.processor()}, required: darwin + arm", - ) - - # Check library availability (only if platform is compatible) - MLX Vision uses mlx_vlm - if not cls.check_lib(): - return MatchResult.failure( - reason="MLX Vision library (mlx_vlm) is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="mlx_vlm package not found in Python environment", - ) + ) -> Union[bool, str]: + # Check library availability first - MLX Vision uses mlx_vlm + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if llm_spec.model_format not in ["mlx"]: - return MatchResult.failure( - reason=f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}, required: mlx", - ) - - # Check vision ability - if "vision" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"MLX Vision requires 'vision' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Check for distributed inference limitations - # MLX Vision models don't support distributed inference - # This could be checked here if needed + return f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}" - return MatchResult.success() + return True def _load_model(self, **kwargs): try: diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index d22a157777..ccb44c00bd 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -15,7 +15,6 @@ import json import logging import multiprocessing -import platform import sys import threading import time @@ -37,7 
+36,6 @@ from .. import LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2 -from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -336,110 +334,130 @@ def _sanitize_generate_config( return generate_config @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("sglang") is not None + def check_lib(cls) -> Union[bool, str]: + # Check CUDA first - this is the most important requirement + try: + import torch - @classmethod - def match_json( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: + if not torch.cuda.is_available(): + return "SGLang requires CUDA support but no CUDA devices detected" + except ImportError: + return "SGLang requires PyTorch with CUDA support" + + if importlib.util.find_spec("sglang") is None: + return "sglang library is not installed" + + try: + if not getattr(sglang, "__version__", None): + return "SGLang version information is not available" + + # Check version - SGLang requires recent version + from packaging import version - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match + if version.parse(sglang.__version__) < version.parse("0.1.0"): + return f"SGLang version {sglang.__version__} is too old, minimum required is 0.1.0" + + return True + except Exception as e: + return f"Error checking SGLang library: {str(e)}" @classmethod - def match_with_reason( + def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability first - if not SGLANG_INSTALLED: - return MatchResult.failure( - reason="SGLang library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="sglang package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result - # Check hardware requirements - SGLang requires CUDA - if not cls._has_cuda_device(): - return MatchResult.failure( - reason="SGLang requires CUDA GPU support", - error_type=ErrorType.HARDWARE_REQUIREMENT, - technical_details="No CUDA devices detected", - ) + # Check GPU requirements + try: + import torch - # Check OS requirements - if not cls._is_linux(): - return MatchResult.failure( - reason="SGLang only supports Linux operating system", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current OS: {platform.system()}, required: Linux", - ) + if torch.cuda.device_count() == 0: + return "SGLang requires CUDA support but no CUDA devices detected" + except ImportError: + return "SGLang requires PyTorch with CUDA support" # Check model format compatibility supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"SGLang does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}", - ) + return f"SGLang does not support model format: {llm_spec.model_format}, supported formats: {', '.join(supported_formats)}" # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"SGLang pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, 
- technical_details=f"pytorch + {quantization} combination not supported", - ) + return f"SGLang pytorch format does not support quantization: {quantization}" + + # Check model compatibility with more flexible matching + def is_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants (e.g., qwen3 variants) + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common patterns + model_lower = model_name.lower() + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + ] + ): + # Check if there's a corresponding supported model with same family + for supported in supported_list: + if any( + family in supported.lower() + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + ] + ): + return True + + return False - # Check model compatibility if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in SGLANG_SUPPORTED_MODELS: - return MatchResult.failure( - reason=f"Custom model family not supported by SGLang: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom family: {llm_family.model_family}", + if not llm_family.model_family or not is_model_supported( + llm_family.model_family.lower(), SGLANG_SUPPORTED_MODELS + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Custom model family may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if llm_family.model_name not in SGLANG_SUPPORTED_MODELS: - return MatchResult.failure( - reason=f"Model not supported by SGLang: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported model: {llm_family.model_name}", + if not is_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in SGLANG_SUPPORTED_MODELS], + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Model may not be fully supported by SGLang: {llm_family.model_name}" ) - # Check model abilities with flexible logic - # SGLang can handle models with various text generation capabilities - has_text_capability = ( - "generate" in llm_family.model_ability - or "chat" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - or "tools" in llm_family.model_ability - ) - - if not has_text_capability: - return MatchResult.failure( - reason=f"SGLang requires text generation capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # SGLang is primarily designed for text models, not specialized models - specialized_abilities = ["embedding", "rerank", "audio", "vision"] - has_specialized = any( - ability in llm_family.model_ability for ability in specialized_abilities - ) - if has_specialized: - return MatchResult.failure( - reason=f"SGLang is designed for text models, this model has specialized abilities: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Specialized abilities: {[a for a in llm_family.model_ability if a in 
specialized_abilities]}", - ) - - return MatchResult.success() + return True @staticmethod def _convert_state_to_completion_chunk( @@ -727,65 +745,76 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Check model format compatibility (same as base) supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"SGLang Chat does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Chat model unsupported format: {llm_spec.model_format}", - ) + return f"SGLang Chat does not support model format: {llm_spec.model_format}" # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"SGLang Chat pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Chat pytorch + {quantization} not supported", - ) + return f"SGLang Chat pytorch format does not support quantization: {quantization}" + + # Check chat model compatibility with more flexible matching + def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: + """Check if chat model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common chat patterns + model_lower = model_name.lower() + if any(suffix in model_lower for suffix in ["chat", "instruct", "coder"]): + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + ] + ): + return True + + return False - # Check chat model compatibility if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS: - return MatchResult.failure( - reason=f"Custom chat model not supported by SGLang: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom chat family: {llm_family.model_family}", + if not is_chat_model_supported( + llm_family.model_family.lower(), SGLANG_SUPPORTED_CHAT_MODELS + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Custom chat model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if llm_family.model_name not in SGLANG_SUPPORTED_CHAT_MODELS: - return MatchResult.failure( - reason=f"Chat model not supported by SGLang: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - 
technical_details=f"Unsupported chat model: {llm_family.model_name}", + if not is_chat_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in SGLANG_SUPPORTED_CHAT_MODELS], + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Chat model may not be fully supported by SGLang: {llm_family.model_name}" ) - # Check chat ability - if "chat" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"SGLang Chat requires 'chat' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - return MatchResult.success() + return True def _sanitize_chat_config( self, @@ -858,65 +887,81 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Vision models have the same format restrictions as base SGLANG supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"SGLang Vision does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Vision model unsupported format: {llm_spec.model_format}", + return ( + f"SGLang Vision does not support model format: {llm_spec.model_format}" ) # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"SGLang Vision pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Vision pytorch + {quantization} not supported", - ) + return f"SGLang Vision pytorch format does not support quantization: {quantization}" + + # Check vision model compatibility with more flexible matching + def is_vision_model_supported( + model_name: str, supported_list: List[str] + ) -> bool: + """Check if vision model is supported with flexible matching.""" + # Direct match + if model_name in supported_list: + return True + + # Partial matching for models with variants + for supported in supported_list: + if model_name.startswith( + supported.lower() + ) or supported.lower().startswith(model_name): + return True + + # Family-based matching for common vision patterns + model_lower = model_name.lower() + if any(suffix in model_lower for suffix in ["vision", "vl", "multi", "mm"]): + if any( + family in model_lower + for family in [ + "qwen3", + "llama", + "mistral", + "mixtral", + "qwen2", + "qwen2.5", + "deepseek", + "yi", + "baichuan", + "internvl", + ] + ): + return True + + return False - # Check vision model compatibility if isinstance(llm_family, CustomLLMFamilyV2): - if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return 
MatchResult.failure( - reason=f"Custom vision model not supported by SGLang: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom vision family: {llm_family.model_family}", + if not is_vision_model_supported( + llm_family.model_family.lower(), SGLANG_SUPPORTED_VISION_MODEL_LIST + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Custom vision model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if llm_family.model_name not in SGLANG_SUPPORTED_VISION_MODEL_LIST: - return MatchResult.failure( - reason=f"Vision model not supported by SGLang: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported vision model: {llm_family.model_name}", + if not is_vision_model_supported( + llm_family.model_name.lower(), + [s.lower() for s in SGLANG_SUPPORTED_VISION_MODEL_LIST], + ): + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Vision model may not be fully supported by SGLang: {llm_family.model_name}" ) - # Check vision ability - if "vision" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"SGLang Vision requires 'vision' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - return MatchResult.success() + return True def _sanitize_chat_config( self, diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py index 5a4a9f557d..39e963164b 100644 --- a/xinference/model/llm/transformers/core.py +++ b/xinference/model/llm/transformers/core.py @@ -40,7 +40,6 @@ from ...utils import select_device from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..match_result import MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, LLAMA3_TOOL_CALL_FAMILY, @@ -494,78 +493,33 @@ def stop(self): del self._tokenizer @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("transformers") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("transformers") is not None + else "transformers library is not installed" + ) @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="Transformers library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="transformers or torch package not found", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility supported_formats = ["pytorch", "gptq", "awq", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"Transformers does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Transformers unsupported format: {llm_spec.model_format}", - ) + return f"Transformers does not support model format: {llm_spec.model_format}, supported formats: {', 
'.join(supported_formats)}" # Check for models that shouldn't use Transformers by default model_family = llm_family.model_family or llm_family.model_name if model_family in NON_DEFAULT_MODEL_LIST: - return MatchResult.failure( - reason=f"Model {model_family} is not recommended for Transformers engine", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Model in NON_DEFAULT_MODEL_LIST: {model_family}", - ) + return f"Model {model_family} is not recommended for Transformers engine, has specialized engine preference" - # Check model abilities with flexible logic - # Transformers can handle models with various text processing capabilities - has_text_capability = ( - "generate" in llm_family.model_ability - or "chat" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - or "tools" in llm_family.model_ability - ) - - if not has_text_capability: - return MatchResult.failure( - reason=f"Transformers engine requires text processing capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Check for highly specialized models that might not work well with generic Transformers engine - specialized_abilities = ["embedding", "rerank", "audio", "vision"] - has_specialized = any( - ability in llm_family.model_ability for ability in specialized_abilities - ) - if has_specialized and not has_text_capability: - return MatchResult.failure( - reason=f"Model requires specialized engine for its abilities: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Specialized abilities detected: {[a for a in llm_family.model_ability if a in specialized_abilities]}", - ) - - return MatchResult.success() + return True def build_prefill_attention_mask( self, batch_size: int, seq_length: int, reqs: List[InferenceRequest] @@ -1023,8 +977,6 @@ def match_json( model_family = llm_family.model_family or llm_family.model_name if model_family in NON_DEFAULT_MODEL_LIST: return False - if "chat" not in llm_family.model_ability: - return False return True async def chat( diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index bc0eede4c0..7262053a50 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -19,7 +19,6 @@ import logging import multiprocessing import os -import platform import sys import threading import time @@ -56,7 +55,6 @@ from .. 
import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1 from ..core import chat_context_var from ..llm_family import CustomLLMFamilyV2, cache_model_tokenizer_and_config -from ..match_result import ErrorType, MatchResult from ..utils import ( DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, @@ -852,111 +850,77 @@ def _sanitize_generate_config( return sanitized @classmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: + # Check CUDA first - this is the most important requirement + try: + import torch + + if not torch.cuda.is_available(): + return "vLLM requires CUDA support but no CUDA devices detected" + except ImportError: + return "vLLM requires PyTorch with CUDA support" + if importlib.util.find_spec("vllm") is None: - return False + return "vLLM library is not installed" try: import vllm if not getattr(vllm, "__version__", None): - return False + return "vLLM version information is not available" # Check version from packaging import version if version.parse(vllm.__version__) < version.parse("0.3.0"): - return False - - # Check CUDA - import torch - - if not torch.cuda.is_available(): - return False + return f"vLLM version {vllm.__version__} is too old, minimum required is 0.3.0" return True - except Exception: - return False + except Exception as e: + return f"Error checking vLLM library: {str(e)}" @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability first if not VLLM_INSTALLED: - return MatchResult.failure( - reason="vLLM library is not installed", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="vllm package not found in Python environment", - ) + return "vLLM library is not installed" - # Check hardware requirements - if not cls._has_cuda_device() and not cls._has_mlu_device(): - return MatchResult.failure( - reason="vLLM requires CUDA or MLU accelerator support", - error_type=ErrorType.HARDWARE_REQUIREMENT, - technical_details="No CUDA or MLU devices detected", - ) + # Check GPU device count + try: + import torch - # Check OS requirements - if not cls._is_linux(): - return MatchResult.failure( - reason="vLLM only supports Linux operating system", - error_type=ErrorType.OS_REQUIREMENT, - technical_details=f"Current OS: {platform.system()}, required: Linux", - ) + if torch.cuda.device_count() == 0: + return "vLLM requires CUDA support but no CUDA devices detected" + except ImportError: + return "vLLM requires PyTorch with CUDA support" # Check model format supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"vLLM does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {llm_spec.model_format}", - ) + return f"vLLM does not support model format: {llm_spec.model_format}, supported formats: {', '.join(supported_formats)}" # Check quantization compatibility with format if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"vLLM pytorch format does not support quantization: {quantization}", - 
error_type=ErrorType.QUANTIZATION, - technical_details=f"pytorch + {quantization} combination not supported", + return ( + f"vLLM pytorch format does not support quantization: {quantization}" ) if llm_spec.model_format == "awq": if "4" not in quantization: - return MatchResult.failure( - reason=f"vLLM AWQ format requires 4-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"AWQ + {quantization} not supported, only 4-bit", + return ( + f"vLLM AWQ format requires 4-bit quantization, got: {quantization}" ) if llm_spec.model_format == "gptq": if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"): if not any(q in quantization for q in ("3", "4", "8")): - return MatchResult.failure( - reason=f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"GPTQ + {quantization} not supported with vLLM >= 0.3.3", - ) + return f"vLLM GPTQ format requires 3/4/8-bit quantization, got: {quantization}" else: if "4" not in quantization: - return MatchResult.failure( - reason=f"Older vLLM version only supports 4-bit GPTQ, got: {quantization}", - error_type=ErrorType.VERSION_REQUIREMENT, - technical_details=f"GPTQ + {quantization} requires vLLM >= 0.3.3", - ) + return f"Older vLLM version only supports 4-bit GPTQ, got: {quantization} (requires vLLM >= 0.3.3 for 3/8-bit)" # Check model compatibility with more flexible matching def is_model_supported(model_name: str, supported_list: List[str]) -> bool: @@ -1006,53 +970,19 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool: if not llm_family.model_family or not is_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_MODELS ): - return MatchResult.failure( - reason=f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom family: {llm_family.model_family}", - ) + return f"Custom model family may not be fully supported by vLLM: {llm_family.model_family}" else: if not is_model_supported( llm_family.model_name.lower(), [s.lower() for s in VLLM_SUPPORTED_MODELS], ): - return MatchResult.failure( - reason=f"Model may not be supported by vLLM: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported model: {llm_family.model_name}", + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Model may not be fully supported by vLLM: {llm_family.model_name}" ) - # Check model abilities with flexible logic - # vLLM can handle models that have text generation capabilities - # Models with 'chat' ability usually also support 'generate' - has_text_capability = ( - "generate" in llm_family.model_ability - or "chat" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - or "tools" in llm_family.model_ability - ) - - if not has_text_capability: - return MatchResult.failure( - reason=f"vLLM requires text generation capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Additional check: ensure model doesn't have conflicting abilities - conflicting_abilities = ["embedding", "rerank"] - has_conflicting = any( - ability in llm_family.model_ability for ability in conflicting_abilities - ) - if has_conflicting: - return MatchResult.failure( - reason=f"Model has conflicting abilities for vLLM: 
{llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Conflicting abilities detected: {[a for a in llm_family.model_ability if a in conflicting_abilities]}", - ) - # All checks passed - return MatchResult.success() + return True @staticmethod def _convert_request_output_to_completion_chunk( @@ -1459,48 +1389,26 @@ class VLLMChatModel(VLLMModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Chat-specific format support (includes GGUFv2 for newer vLLM) supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb", "ggufv2"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"vLLM Chat does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Chat model unsupported format: {llm_spec.model_format}", - ) + return f"vLLM Chat does not support model format: {llm_spec.model_format}" # GGUFv2 requires newer vLLM version if llm_spec.model_format == "ggufv2": if not (VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.2")): - return MatchResult.failure( - reason="vLLM GGUF support requires version >= 0.8.2", - error_type=ErrorType.VERSION_REQUIREMENT, - technical_details=f"Current vLLM: {VLLM_VERSION}, required: >=0.8.2", - ) + return f"vLLM GGUF support requires version >= 0.8.2, current: {VLLM_VERSION}" # AWQ chat models support more quantization levels if llm_spec.model_format == "awq": if not any(q in quantization for q in ("4", "8")): - return MatchResult.failure( - reason=f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Chat AWQ + {quantization} not supported", - ) + return f"vLLM Chat AWQ requires 4 or 8-bit quantization, got: {quantization}" # Check chat model compatibility with flexible matching def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: @@ -1554,46 +1462,18 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: if not llm_family.model_family or not is_chat_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_CHAT_MODELS ): - return MatchResult.failure( - reason=f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom chat family: {llm_family.model_family}", - ) + return f"Custom chat model may not be fully supported by vLLM: {llm_family.model_family}" else: if not is_chat_model_supported( llm_family.model_name.lower(), [s.lower() for s in VLLM_SUPPORTED_CHAT_MODELS], ): - return MatchResult.failure( - reason=f"Chat model may not be supported by vLLM: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported chat model: {llm_family.model_name}", + # Instead of hard rejection, 
give a warning but allow usage + logger.warning( + f"Chat model may not be fully supported by vLLM: {llm_family.model_name}" ) - # Check chat ability with flexible logic - # vLLM Chat should work with models that have conversation capabilities - has_chat_capability = ( - "chat" in llm_family.model_ability - or "generate" in llm_family.model_ability - or "reasoning" in llm_family.model_ability - ) - - if not has_chat_capability: - return MatchResult.failure( - reason=f"vLLM Chat requires conversation capabilities, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - # Additional check: ensure model is not purely a tool model without conversation - if set(llm_family.model_ability) == {"tools"}: - return MatchResult.failure( - reason=f"Model only has 'tools' capability without conversation support: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Tool-only model detected", - ) - - return MatchResult.success() + return True def _sanitize_chat_config( self, @@ -1737,47 +1617,26 @@ class VLLMMultiModel(VLLMModel, ChatModelMixin): @classmethod def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - - result = cls.match_with_reason(llm_family, llm_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str - ) -> "MatchResult": - - # Use base class validation first - base_result = super().match_with_reason(llm_family, llm_spec, quantization) - if not base_result.is_match: + ) -> Union[bool, str]: + # First run base class checks + base_result = super().match_json(llm_family, llm_spec, quantization) + if base_result != True: return base_result # Vision models have the same format restrictions as base VLLM supported_formats = ["pytorch", "gptq", "awq", "fp8", "bnb"] if llm_spec.model_format not in supported_formats: - return MatchResult.failure( - reason=f"vLLM Vision does not support model format: {llm_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Vision model unsupported format: {llm_spec.model_format}", - ) + return f"vLLM Vision does not support model format: {llm_spec.model_format}" # Vision models typically work with specific quantization settings if llm_spec.model_format == "pytorch": if quantization != "none" and quantization is not None: - return MatchResult.failure( - reason=f"vLLM Vision pytorch format does not support quantization: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Vision pytorch + {quantization} not supported", - ) + return f"vLLM Vision pytorch format does not support quantization: {quantization}" # AWQ vision models support more quantization levels than base if llm_spec.model_format == "awq": if not any(q in quantization for q in ("4", "8")): - return MatchResult.failure( - reason=f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}", - error_type=ErrorType.QUANTIZATION, - technical_details=f"Vision AWQ + {quantization} not supported", - ) + return f"vLLM Vision AWQ requires 4 or 8-bit quantization, got: {quantization}" # Check vision model compatibility with flexible matching def is_vision_model_supported( @@ -1815,30 +1674,17 @@ def is_vision_model_supported( if not llm_family.model_family or not is_vision_model_supported( llm_family.model_family.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST ): - return 
MatchResult.failure( - reason=f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Custom vision family: {llm_family.model_family}", - ) + return f"Custom vision model may not be fully supported by vLLM: {llm_family.model_family}" else: if not llm_family.model_name or not is_vision_model_supported( llm_family.model_name.lower(), VLLM_SUPPORTED_MULTI_MODEL_LIST ): - return MatchResult.failure( - reason=f"Vision model may not be supported by vLLM: {llm_family.model_name}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported vision model: {llm_family.model_name}", + # Instead of hard rejection, give a warning but allow usage + logger.warning( + f"Vision model may not be fully supported by vLLM: {llm_family.model_name}" ) - # Check vision ability - if "vision" not in llm_family.model_ability: - return MatchResult.failure( - reason=f"vLLM Vision requires 'vision' ability, model has: {llm_family.model_ability}", - error_type=ErrorType.ABILITY_MISMATCH, - technical_details=f"Model abilities: {llm_family.model_ability}", - ) - - return MatchResult.success() + return True def _sanitize_model_config( self, model_config: Optional[VLLMModelConfig] diff --git a/xinference/model/rerank/core.py b/xinference/model/rerank/core.py index 2d3edde1c2..f844825d6c 100644 --- a/xinference/model/rerank/core.py +++ b/xinference/model/rerank/core.py @@ -15,13 +15,12 @@ import os from abc import abstractmethod from collections import defaultdict -from typing import Dict, List, Literal, Optional +from typing import Dict, List, Literal, Optional, Union from ..._compat import BaseModel from ...types import Rerank from ..core import VirtualEnvSettings from ..utils import ModelInstanceInfoMixin -from .match_result import MatchResult from .rerank_family import check_engine_by_model_name_and_engine, match_rerank logger = logging.getLogger(__name__) @@ -119,7 +118,7 @@ def __init__( @classmethod @abstractmethod - def check_lib(cls) -> bool: + def check_lib(cls) -> Union[bool, str]: pass @classmethod @@ -129,62 +128,24 @@ def match_json( model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ) -> bool: + ) -> Union[bool, str]: pass - @classmethod - def match_with_reason( - cls, - model_family: RerankModelFamilyV2, - model_spec: RerankSpecV1, - quantization: str, - ) -> "MatchResult": - """ - Check if the engine can handle the given rerank model with detailed error information. - - This method provides detailed failure reasons and suggestions when an engine - cannot handle a specific model configuration. The default implementation - falls back to the boolean match_json method for backward compatibility. 
- - Args: - model_family: The rerank model family information - model_spec: The model specification - quantization: The quantization method - - Returns: - MatchResult: Detailed match result with reasons and suggestions - """ - from .match_result import ErrorType, MatchResult - - # Default implementation for backward compatibility - if cls.match_json(model_family, model_spec, quantization): - return MatchResult.success() - else: - # Get basic reason based on common failure patterns - if not cls.check_lib(): - return MatchResult.failure( - reason=f"Required library for {cls.__name__} is not available", - error_type=ErrorType.DEPENDENCY_MISSING, - ) - else: - return MatchResult.failure( - reason=f"Rerank model configuration is not compatible with {cls.__name__}", - error_type=ErrorType.MODEL_COMPATIBILITY, - ) - @classmethod def match( cls, model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ): + ) -> bool: """ Return if the model_spec can be matched. """ - if not cls.check_lib(): + lib_result = cls.check_lib() + if lib_result != True: return False - return cls.match_json(model_family, model_spec, quantization) + match_result = cls.match_json(model_family, model_spec, quantization) + return match_result == True @staticmethod def _get_tokenizer(model_path): diff --git a/xinference/model/rerank/sentence_transformers/core.py b/xinference/model/rerank/sentence_transformers/core.py index 42332bc477..eddc58ac06 100644 --- a/xinference/model/rerank/sentence_transformers/core.py +++ b/xinference/model/rerank/sentence_transformers/core.py @@ -16,7 +16,7 @@ import logging import threading import uuid -from typing import List, Optional, Sequence +from typing import List, Optional, Sequence, Union import numpy as np import torch @@ -31,7 +31,6 @@ RerankModelFamilyV2, RerankSpecV1, ) -from ..match_result import MatchResult from ..utils import preprocess_sentence logger = logging.getLogger(__name__) @@ -332,8 +331,12 @@ def format_instruction(instruction, query, doc): return Rerank(id=str(uuid.uuid1()), results=docs, meta=metadata) @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("sentence_transformers") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("sentence_transformers") is not None + else "sentence_transformers library is not installed" + ) @classmethod def match_json( @@ -341,44 +344,19 @@ def match_json( model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ) -> bool: - pass - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: RerankModelFamilyV2, - model_spec: RerankSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="Sentence Transformers library is not installed for reranking", - error_type=ErrorType.DEPENDENCY_MISSING, - technical_details="sentence_transformers package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["pytorch"]: - return MatchResult.failure( - reason=f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: 
{model_spec.model_format}, required: pytorch", - ) + return f"Sentence Transformers reranking only supports pytorch format, got: {model_spec.model_format}" # Check rerank-specific requirements if not hasattr(model_family, "model_name"): - return MatchResult.failure( - reason="Rerank model family requires model name specification", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing model_name in rerank model family", - ) + return "Rerank model family requires model name specification" # Check model type compatibility if model_family.type and model_family.type not in [ @@ -389,27 +367,15 @@ def match_with_reason( "LLM-based", "LLM-based layerwise", ]: - return MatchResult.failure( - reason=f"Model type '{model_family.type}' may not be compatible with reranking engines", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Model type: {model_family.type}", - ) + return f"Model type '{model_family.type}' may not be compatible with reranking engines" # Check max tokens limit for reranking performance max_tokens = model_family.max_tokens if max_tokens and max_tokens > 8192: # High token limits for reranking - return MatchResult.failure( - reason=f"High max_tokens limit for reranking model: {max_tokens}", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"High max_tokens for reranking: {max_tokens}", - ) + return f"High max_tokens limit for reranking model: {max_tokens}, may cause performance issues" # Check language compatibility if not model_family.language or len(model_family.language) == 0: - return MatchResult.failure( - reason="Rerank model language information is missing", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing language information in rerank model", - ) + return "Rerank model language information is missing" - return MatchResult.success() + return True diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index c2ee75cfef..4f63c0136c 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -1,11 +1,10 @@ import importlib.util import uuid -from typing import List, Optional +from typing import List, Optional, Union from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 -from ..match_result import MatchResult SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] @@ -140,8 +139,12 @@ def rerank( return Rerank(id=str(uuid.uuid4()), results=reranked_docs, meta=metadata) @classmethod - def check_lib(cls) -> bool: - return importlib.util.find_spec("vllm") is not None + def check_lib(cls) -> Union[bool, str]: + return ( + True + if importlib.util.find_spec("vllm") is not None + else "vllm library is not installed" + ) @classmethod def match_json( @@ -149,35 +152,15 @@ def match_json( model_family: RerankModelFamilyV2, model_spec: RerankSpecV1, quantization: str, - ) -> bool: - - result = cls.match_with_reason(model_family, model_spec, quantization) - return result.is_match - - @classmethod - def match_with_reason( - cls, - model_family: RerankModelFamilyV2, - model_spec: RerankSpecV1, - quantization: str, - ) -> "MatchResult": - from ..match_result import ErrorType, MatchResult - + ) -> Union[bool, str]: # Check library availability - if not cls.check_lib(): - return MatchResult.failure( - reason="vLLM library is not installed for reranking", - error_type=ErrorType.DEPENDENCY_MISSING, - 
technical_details="vllm package not found in Python environment", - ) + lib_result = cls.check_lib() + if lib_result != True: + return lib_result # Check model format compatibility if model_spec.model_format not in ["pytorch"]: - return MatchResult.failure( - reason=f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}", - error_type=ErrorType.MODEL_FORMAT, - technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch", - ) + return f"vLLM reranking only supports pytorch format, got: {model_spec.model_format}" # Check model name prefix matching if model_spec.model_format == "pytorch": @@ -187,33 +170,17 @@ def match_with_reason( if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]: # Special handling for Qwen3 models if "qwen3" not in model_family.model_name.lower(): - return MatchResult.failure( - reason=f"Model family prefix not supported by vLLM reranking: {prefix}", - error_type=ErrorType.MODEL_COMPATIBILITY, - technical_details=f"Unsupported prefix: {prefix}", - ) + return f"Model family prefix not supported by vLLM reranking: {prefix}" except (IndexError, AttributeError): - return MatchResult.failure( - reason="Unable to parse model family name for vLLM compatibility check", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"Model name parsing failed: {model_family.model_name}", - ) + return f"Unable to parse model family name for vLLM compatibility check: {model_family.model_name}" # Check rerank-specific requirements if not hasattr(model_family, "model_name"): - return MatchResult.failure( - reason="Rerank model family requires model name specification for vLLM", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details="Missing model_name in vLLM rerank model family", - ) + return "Rerank model family requires model name specification for vLLM" # Check max tokens limit for vLLM reranking performance max_tokens = model_family.max_tokens if max_tokens and max_tokens > 4096: # vLLM has stricter limits - return MatchResult.failure( - reason=f"High max_tokens limit for vLLM reranking model: {max_tokens}", - error_type=ErrorType.CONFIGURATION_ERROR, - technical_details=f"High max_tokens for vLLM reranking: {max_tokens}", - ) + return f"High max_tokens limit for vLLM reranking model: {max_tokens}, may cause performance issues" - return MatchResult.success() + return True diff --git a/xinference/model/utils.py b/xinference/model/utils.py index ea7adb309e..3442d38ea1 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -506,33 +506,59 @@ def get_engine_params_by_name( if model_name in LLM_ENGINES and LLM_ENGINES[model_name]: # Try to get model family for testing try: - from .llm.llm_family import match_llm + pass + + # Get the full model family instead of a single spec + from .llm.llm_family import BUILTIN_LLM_FAMILIES + + llm_family = None + for family in BUILTIN_LLM_FAMILIES: + if model_name == family.model_name: + llm_family = family + break - llm_family = match_llm(model_name, None, None, None, None) if llm_family and llm_family.model_specs: - llm_spec = llm_family.model_specs[0] - quantization = llm_spec.quantization or "none" # Test each engine class for detailed error info for engine_class in llm_engine_classes: try: - if hasattr(engine_class, "match_with_reason"): - pass + engine_compatible = False + error_details = None - result = engine_class.match_with_reason( - llm_family, llm_spec, quantization + # Try each model spec to find one compatible with this engine + for 
llm_spec in llm_family.model_specs: + quantization = ( + llm_spec.quantization or "none" ) - if not result.is_match: - detailed_error = { - "error": result.reason, - "error_type": result.error_type, - "technical_details": result.technical_details, - } - break + + if hasattr(engine_class, "match_json"): + match_result = engine_class.match_json( + llm_family, llm_spec, quantization + ) + if match_result == True: + engine_compatible = True + break # Found compatible spec + else: + # Save error details, but continue trying other specs + error_details = { + "error": ( + match_result + if isinstance( + match_result, str + ) + else "Engine is not compatible" + ), + "error_type": "model_compatibility", + "technical_details": f"The {engine_class.__name__} engine cannot handle the current model configuration: {llm_spec.model_format} format", + } + + if not engine_compatible and error_details: + detailed_error = error_details + break except Exception as e: # Fall back to next engine class with clear error logging logger.warning( - f"Engine class {engine_class.__name__} match_with_reason failed: {e}" + f"Engine class {engine_class.__name__} match_json failed: {e}" ) # Continue to try next engine class, but this is expected behavior for fallback continue @@ -555,8 +581,15 @@ def get_engine_params_by_name( for engine_class in llm_engine_classes: try: if hasattr(engine_class, "check_lib"): - lib_available: bool = engine_class.check_lib() # type: ignore[assignment] - if not lib_available: + lib_result = engine_class.check_lib() + if lib_result != True: + # If check_lib returns a string, it's an error message + error_msg = ( + lib_result + if isinstance(lib_result, str) + else f"Engine {engine_name} library check failed" + ) + engine_params[engine_name] = error_msg break else: # If no check_lib method, try to use engine's match method for compatibility check @@ -564,17 +597,49 @@ def get_engine_params_by_name( try: # Create a minimal test spec if we don't have real model specs from .llm.llm_family import ( + AwqLLMSpecV2, + GgmlLLMSpecV2, + GptqLLMSpecV2, LLMFamilyV2, + MLXLLMSpecV2, PytorchLLMSpecV2, ) - # Create a minimal test case + # Create appropriate test spec based on engine class + engine_name_lower = ( + engine_class.__name__.lower() + ) + if "mlx" in engine_name_lower: + # MLX engines need MLX format + test_spec_class = MLXLLMSpecV2 + model_format = "mlx" + elif ( + "ggml" in engine_name_lower + or "llamacpp" in engine_name_lower + ): + # GGML/llama.cpp engines need GGML format + test_spec_class = GgmlLLMSpecV2 + model_format = "ggmlv3" + elif "gptq" in engine_name_lower: + # GPTQ engines need GPTQ format + test_spec_class = GptqLLMSpecV2 + model_format = "gptq" + elif "awq" in engine_name_lower: + # AWQ engines need AWQ format + test_spec_class = AwqLLMSpecV2 + model_format = "awq" + else: + # Default to PyTorch format + test_spec_class = PytorchLLMSpecV2 + model_format = "pytorch" + + # Create a minimal test case with appropriate format test_family = LLMFamilyV2( model_name="test", model_family="test", model_specs=[ - PytorchLLMSpecV2( - model_format="pytorch", + test_spec_class( + model_format=model_format, quantization="none", ) ], @@ -597,11 +662,21 @@ def get_engine_params_by_name( break elif hasattr(engine_class, "match_json"): # Fallback to simple match method - use test data - if engine_class.match_json( + match_result = engine_class.match_json( test_family, test_spec, "none" - ): - break + ) + if match_result == True: + break # Engine is available else: + # Get detailed error 
information + error_message = ( + match_result + if isinstance(match_result, str) + else f"Engine {engine_name} is not compatible with current model or environment" + ) + engine_params[engine_name] = ( + error_message + ) break else: # Final fallback: generic import check @@ -653,9 +728,7 @@ def get_engine_params_by_name( return engine_params elif model_type == "embedding": - from .embedding.embed_family import ( - EMBEDDING_ENGINES, - ) + from .embedding.embed_family import EMBEDDING_ENGINES from .embedding.embed_family import ( SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES, ) @@ -716,14 +789,23 @@ def get_engine_params_by_name( ) test_spec = test_family.model_specs[0] - # Use the engine's match method to check compatibility - if embedding_engine_class.match( - test_family, test_spec, "none" - ): + # Use the engine's match_json method to check compatibility and get detailed error + match_result = ( + embedding_engine_class.match_json( + test_family, test_spec, "none" + ) + ) + if match_result == True: break # Engine is available else: + # Get detailed error information + error_message = ( + match_result + if isinstance(match_result, str) + else f"Engine {engine_name} is not compatible with current model or environment" + ) embedding_error_details = { - "error": f"Engine {engine_name} is not compatible with current model or environment", + "error": error_message, "error_type": "model_compatibility", "technical_details": f"The {engine_name} engine cannot handle the current embedding model configuration", } @@ -789,9 +871,7 @@ def get_engine_params_by_name( return engine_params elif model_type == "rerank": - from .rerank.rerank_family import ( - RERANK_ENGINES, - ) + from .rerank.rerank_family import RERANK_ENGINES from .rerank.rerank_family import SUPPORTED_ENGINES as RERANK_SUPPORTED_ENGINES if model_name not in RERANK_ENGINES: @@ -850,14 +930,21 @@ def get_engine_params_by_name( ) test_spec = test_family.model_specs[0] - # Use the engine's match method to check compatibility - if rerank_engine_class.match( + # Use the engine's match_json method to check compatibility and get detailed error + match_result = rerank_engine_class.match_json( test_family, test_spec, "none" - ): + ) + if match_result == True: break # Engine is available else: + # Get detailed error information + error_message = ( + match_result + if isinstance(match_result, str) + else f"Engine {engine_name} is not compatible with current model or environment" + ) rerank_error_details = { - "error": f"Engine {engine_name} is not compatible with current model or environment", + "error": error_message, "error_type": "model_compatibility", "technical_details": f"The {engine_name} engine cannot handle the current rerank model configuration", } From 26ca06f9645f0691cded28dbd2243f27a70912c1 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 14:27:58 +0800 Subject: [PATCH 32/37] pre-commit --- xinference/model/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 3442d38ea1..12be38ec71 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -728,7 +728,9 @@ def get_engine_params_by_name( return engine_params elif model_type == "embedding": - from .embedding.embed_family import EMBEDDING_ENGINES + from .embedding.embed_family import ( + EMBEDDING_ENGINES, + ) from .embedding.embed_family import ( SUPPORTED_ENGINES as EMBEDDING_SUPPORTED_ENGINES, ) From 48a272d2bed187982f95bbff0d5f7cc9ce517b19 Mon 
Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 14:47:14 +0800 Subject: [PATCH 33/37] mypy-error --- xinference/model/llm/sglang/core.py | 10 +++++----- xinference/model/utils.py | 18 ++++-------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py index ccb44c00bd..7d5d13d229 100644 --- a/xinference/model/llm/sglang/core.py +++ b/xinference/model/llm/sglang/core.py @@ -448,7 +448,7 @@ def is_model_supported(model_name: str, supported_list: List[str]) -> bool: f"Custom model family may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if not is_model_supported( + if not llm_family.model_name or not is_model_supported( llm_family.model_name.lower(), [s.lower() for s in SGLANG_SUPPORTED_MODELS], ): @@ -797,7 +797,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_chat_model_supported( + if not llm_family.model_family or not is_chat_model_supported( llm_family.model_family.lower(), SGLANG_SUPPORTED_CHAT_MODELS ): # Instead of hard rejection, give a warning but allow usage @@ -805,7 +805,7 @@ def is_chat_model_supported(model_name: str, supported_list: List[str]) -> bool: f"Custom chat model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if not is_chat_model_supported( + if not llm_family.model_name or not is_chat_model_supported( llm_family.model_name.lower(), [s.lower() for s in SGLANG_SUPPORTED_CHAT_MODELS], ): @@ -944,7 +944,7 @@ def is_vision_model_supported( return False if isinstance(llm_family, CustomLLMFamilyV2): - if not is_vision_model_supported( + if not llm_family.model_family or not is_vision_model_supported( llm_family.model_family.lower(), SGLANG_SUPPORTED_VISION_MODEL_LIST ): # Instead of hard rejection, give a warning but allow usage @@ -952,7 +952,7 @@ def is_vision_model_supported( f"Custom vision model may not be fully supported by SGLang: {llm_family.model_family}" ) else: - if not is_vision_model_supported( + if not llm_family.model_name or not is_vision_model_supported( llm_family.model_name.lower(), [s.lower() for s in SGLANG_SUPPORTED_VISION_MODEL_LIST], ): diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 12be38ec71..35f5b21fdc 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -597,9 +597,7 @@ def get_engine_params_by_name( try: # Create a minimal test spec if we don't have real model specs from .llm.llm_family import ( - AwqLLMSpecV2, - GgmlLLMSpecV2, - GptqLLMSpecV2, + LlamaCppLLMSpecV2, LLMFamilyV2, MLXLLMSpecV2, PytorchLLMSpecV2, @@ -618,18 +616,10 @@ def get_engine_params_by_name( or "llamacpp" in engine_name_lower ): # GGML/llama.cpp engines need GGML format - test_spec_class = GgmlLLMSpecV2 - model_format = "ggmlv3" - elif "gptq" in engine_name_lower: - # GPTQ engines need GPTQ format - test_spec_class = GptqLLMSpecV2 - model_format = "gptq" - elif "awq" in engine_name_lower: - # AWQ engines need AWQ format - test_spec_class = AwqLLMSpecV2 - model_format = "awq" + test_spec_class = LlamaCppLLMSpecV2 + model_format = "ggufv2" else: - # Default to PyTorch format + # Default to PyTorch format (supports gptq, awq, fp8, bnb) test_spec_class = PytorchLLMSpecV2 model_format = "pytorch" From 0acb4711751c2d295cbeb037763407b0735aa229 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 29 Oct 2025 17:54:11 +0800 Subject: 
[PATCH 34/37] fix mlx CI bug --- xinference/model/llm/mlx/core.py | 44 +++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py index ab8f1608db..b391ac97b8 100644 --- a/xinference/model/llm/mlx/core.py +++ b/xinference/model/llm/mlx/core.py @@ -423,6 +423,14 @@ def match_json( if llm_spec.model_format not in ["mlx"]: return f"MLX engine only supports MLX format, got: {llm_spec.model_format}" + # Base MLX model should not handle chat or vision models + # Those should be handled by MLXChatModel and MLXVisionModel respectively + model_abilities = getattr(llm_family, "model_ability", []) + if "chat" in model_abilities: + return False # Let MLXChatModel handle this + if "vision" in model_abilities: + return False # Let MLXVisionModel handle this + # Check memory constraints for Apple Silicon model_size = float(str(llm_spec.model_size_in_billions)) if model_size > 70: # Large models may be problematic @@ -729,10 +737,28 @@ def _sanitize_generate_config( def match_json( cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str ) -> Union[bool, str]: - # First run base class checks - base_result = super().match_json(llm_family, llm_spec, quantization) - if base_result != True: - return base_result + # Check library availability first + lib_result = cls.check_lib() + if lib_result != True: + return lib_result + + # Check model format compatibility + if llm_spec.model_format not in ["mlx"]: + return f"MLX Chat engine only supports MLX format, got: {llm_spec.model_format}" + + # Check that this model has chat ability + model_abilities = getattr(llm_family, "model_ability", []) + if "chat" not in model_abilities: + return False # Not a chat model + + # MLX Chat doesn't support vision + if "vision" in model_abilities: + return False # Let MLXVisionModel handle this + + # Check memory constraints for Apple Silicon + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Large models may be problematic + return f"MLX Chat may have memory limitations with very large models ({model_size}B parameters)" return True @@ -801,6 +827,16 @@ def match_json( if llm_spec.model_format not in ["mlx"]: return f"MLX Vision engine only supports MLX format, got: {llm_spec.model_format}" + # Check that this model has vision ability + model_abilities = getattr(llm_family, "model_ability", []) + if "vision" not in model_abilities: + return False # Not a vision model + + # Check memory constraints for Apple Silicon + model_size = float(str(llm_spec.model_size_in_billions)) + if model_size > 70: # Large models may be problematic + return f"MLX Vision may have memory limitations with very large models ({model_size}B parameters)" + return True def _load_model(self, **kwargs): From 1b973b41f50de563b97f256318ce47ca839abe3c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 30 Oct 2025 16:19:19 +0800 Subject: [PATCH 35/37] fix CI bug --- xinference/model/embedding/vllm/core.py | 72 +++++++++++++++++-- xinference/model/rerank/vllm/core.py | 62 ++++++++++++++-- .../model/rerank/vllm/tests/test_vllm.py | 1 + 3 files changed, 124 insertions(+), 11 deletions(-) diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py index 8fc32ebac8..674eeaa21e 100644 --- a/xinference/model/embedding/vllm/core.py +++ b/xinference/model/embedding/vllm/core.py @@ -22,7 +22,7 @@ from ..core import EmbeddingModel, EmbeddingModelFamilyV2, 
EmbeddingSpecV1 logger = logging.getLogger(__name__) -SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] +SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"] class VLLMEmbeddingModel(EmbeddingModel): @@ -32,16 +32,44 @@ def __init__(self, *args, **kwargs): def load(self): try: + # Handle vLLM-transformers config conflict by setting environment variable + import os + + os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache_vllm" + from vllm import LLM - except ImportError: + except ImportError as e: error_message = "Failed to import module 'vllm'" installation_guide = [ "Please make sure 'vllm' is installed. ", "You can install it by `pip install vllm`\n", ] + # Check if it's a config conflict error + if "aimv2" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try upgrading transformers or using a different vLLM version.\n", + ] + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + except Exception as e: + # Handle config registration conflicts + if "aimv2" in str(e) and "already used by a Transformers config" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try: pip install --upgrade transformers vllm\n", + ] + raise RuntimeError(f"{error_message}\n\n{''.join(installation_guide)}") + raise + if self.model_family.model_name in { "Qwen3-Embedding-0.6B", "Qwen3-Embedding-4B", @@ -168,11 +196,41 @@ def match_json( if lib_result != True: return lib_result - if model_spec.model_format in ["pytorch"]: - prefix = model_family.model_name.split("-", 1)[0] - if prefix in SUPPORTED_MODELS_PREFIXES: - return True - return f"VLLM Embedding engine only supports pytorch format models with supported prefixes, got format: {model_spec.model_format}, model: {model_family.model_name}" + # Check model format compatibility + if model_spec.model_format not in ["pytorch"]: + return f"VLLM Embedding engine only supports pytorch format models, got format: {model_spec.model_format}" + + # Check model name prefix matching + prefix = model_family.model_name.split("-", 1)[0] + if prefix.lower() not in [p.lower() for p in SUPPORTED_MODELS_PREFIXES]: + return f"VLLM Embedding engine only supports models with prefixes {SUPPORTED_MODELS_PREFIXES}, got model: {model_family.model_name}" + + # Additional runtime compatibility checks for vLLM version + try: + import vllm + from packaging.version import Version + + vllm_version = Version(vllm.__version__) + + # Check for vLLM version compatibility issues + if vllm_version >= Version("0.10.0") and vllm_version < Version("0.11.0"): + # vLLM 0.10.x has V1 engine issues on CPU + import platform + + if platform.system() == "Darwin" and platform.machine() in [ + "arm64", + "arm", + ]: + # Check if this is likely to run on CPU (most common for testing) + return f"vLLM {vllm_version} has compatibility issues with embedding models on Apple Silicon CPUs. Consider using a different platform or vLLM version." 
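+ # Only the Darwin + arm64/arm case inside this 0.10.x branch rejects the
+ # engine (the V1-engine-on-CPU issue noted above); on other platforms the
+ # 0.10.x check falls through and the remaining checks below still run.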
+ elif vllm_version >= Version("0.11.0"): + # vLLM 0.11+ should have fixed the config conflict issue + pass + except Exception: + # If version check fails, continue with basic validation + pass + + return True def wait_for_load(self): # set context length after engine inited diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py index 4f63c0136c..2c6d9dbeed 100644 --- a/xinference/model/rerank/vllm/core.py +++ b/xinference/model/rerank/vllm/core.py @@ -6,22 +6,49 @@ from ...utils import cache_clean from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1 -SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"] +SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"] class VLLMRerankModel(RerankModel): def load(self): try: + # Handle vLLM-transformers config conflict by setting environment variable + import os + + os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache_vllm" + from vllm import LLM - except ImportError: + except ImportError as e: error_message = "Failed to import module 'vllm'" installation_guide = [ "Please make sure 'vllm' is installed. ", "You can install it by `pip install vllm`\n", ] + # Check if it's a config conflict error + if "aimv2" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try upgrading transformers or using a different vLLM version.\n", + ] + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + except Exception as e: + # Handle config registration conflicts + if "aimv2" in str(e) and "already used by a Transformers config" in str(e): + error_message = ( + "vLLM has a configuration conflict with transformers library" + ) + installation_guide = [ + "This is a known issue with certain vLLM and transformers versions.", + "Try: pip install --upgrade transformers vllm\n", + ] + raise RuntimeError(f"{error_message}\n\n{''.join(installation_guide)}") + raise if self.model_family.model_name in { "Qwen3-Reranker-0.6B", @@ -180,7 +207,34 @@ def match_json( # Check max tokens limit for vLLM reranking performance max_tokens = model_family.max_tokens - if max_tokens and max_tokens > 4096: # vLLM has stricter limits - return f"High max_tokens limit for vLLM reranking model: {max_tokens}, may cause performance issues" + if ( + max_tokens and max_tokens > 32768 + ): # vLLM has stricter limits, but Qwen3 can handle up to 32k + return f"Max tokens limit too high for vLLM reranking model: {max_tokens}, exceeds safe limit" + + # Additional runtime compatibility checks for vLLM version + try: + import vllm + from packaging.version import Version + + vllm_version = Version(vllm.__version__) + + # Check for vLLM version compatibility issues + if vllm_version >= Version("0.10.0") and vllm_version < Version("0.11.0"): + # vLLM 0.10.x has V1 engine issues on CPU + import platform + + if platform.system() == "Darwin" and platform.machine() in [ + "arm64", + "arm", + ]: + # Check if this is likely to run on CPU (most common for testing) + return f"vLLM {vllm_version} has compatibility issues with reranking models on Apple Silicon CPUs. Consider using a different platform or vLLM version." 
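+ # The reranker mirrors the embedding engine's Apple Silicon guard; since
+ # the whole probe sits in a try/except, a failed version lookup is simply
+ # ignored and the earlier format/prefix/max_tokens checks decide the result.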
+ elif vllm_version >= Version("0.11.0"): + # vLLM 0.11+ should have fixed the config conflict issue + pass + except Exception: + # If version check fails, continue with basic validation + pass return True diff --git a/xinference/model/rerank/vllm/tests/test_vllm.py b/xinference/model/rerank/vllm/tests/test_vllm.py index 37b948ac42..578b62bdd4 100644 --- a/xinference/model/rerank/vllm/tests/test_vllm.py +++ b/xinference/model/rerank/vllm/tests/test_vllm.py @@ -61,6 +61,7 @@ def test_qwen3_vllm(setup): model_name="Qwen3-Reranker-0.6B", model_type="rerank", model_engine="vllm", + max_num_batched_tokens=81920, # Allow larger batch size for Qwen3 ) model = client.get_model(model_uid) From f52824a70484083cd68ef82341d4f4e9b87d8863 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 10 Nov 2025 16:44:45 +0800 Subject: [PATCH 36/37] modify embedding sentence_transformers --- xinference/model/embedding/sentence_transformers/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xinference/model/embedding/sentence_transformers/core.py b/xinference/model/embedding/sentence_transformers/core.py index 4e1c7b8b73..6521358a3f 100644 --- a/xinference/model/embedding/sentence_transformers/core.py +++ b/xinference/model/embedding/sentence_transformers/core.py @@ -449,13 +449,13 @@ def match_json( # Check model dimensions compatibility model_dimensions = model_family.dimensions - if model_dimensions > 1536: # Very large embedding models - return f"Large embedding model detected ({model_dimensions} dimensions), may have performance issues" + if model_dimensions > 8192: # Extremely large embedding models + return f"Extremely large embedding model detected ({model_dimensions} dimensions), may have performance issues" # Check token limits max_tokens = model_family.max_tokens - if max_tokens > 8192: # Very high token limits - return f"High token limit model detected (max_tokens: {max_tokens}), may cause memory issues" + if max_tokens > 131072: # Extremely high token limits (128K) + return f"Extremely high token limit model detected (max_tokens: {max_tokens}), may cause memory issues" # Check for special model requirements model_name = model_family.model_name.lower() From dd2f141d06d5716b274e41c95ac5dee7bcc64575 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 12 Nov 2025 10:44:08 +0800 Subject: [PATCH 37/37] modify embedding sentence_transformers --- xinference/model/embedding/vllm/core.py | 47 +++++++++++++++++++++++-- xinference/model/rerank/vllm/core.py | 40 +++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py index 674eeaa21e..c037ce2b53 100644 --- a/xinference/model/embedding/vllm/core.py +++ b/xinference/model/embedding/vllm/core.py @@ -89,6 +89,34 @@ def load(self): is_matryoshka=True, ) + # Set appropriate VLLM configuration parameters based on model capabilities + model_max_tokens = getattr(self.model_family, "max_tokens", 512) + + # Set max_model_len based on model family capabilities with reasonable limits + max_model_len = min(model_max_tokens, 8192) + if "max_model_len" not in self._kwargs: + self._kwargs["max_model_len"] = max_model_len + + # Ensure max_num_batched_tokens is sufficient for large models + if "max_num_batched_tokens" not in self._kwargs: + # max_num_batched_tokens should be at least max_model_len + # Set to a reasonable minimum that satisfies the constraint + self._kwargs["max_num_batched_tokens"] 
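Patch 36 only widens the two sanity thresholds in the sentence_transformers match_json so that large, long-context embedding models are no longer rejected outright. The sketch below restates that check under illustrative figures (the 4096-dimension, 32K-token model is an assumption, not taken from the patch); check_limits is a hypothetical helper following the same True-or-reason-string convention.

def check_limits(dimensions: int, max_tokens: int):
    # True when acceptable, a reason string otherwise.
    if dimensions > 8192:
        return f"Extremely large embedding model detected ({dimensions} dimensions)"
    if max_tokens > 131072:
        return f"Extremely high token limit model detected (max_tokens: {max_tokens})"
    return True

# A 4096-dimension, 32K-context embedding model would have tripped the old
# 1536/8192 thresholds but passes the relaxed ones.
assert check_limits(4096, 32768) is True
assert check_limits(16384, 32768) is not True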
From dd2f141d06d5716b274e41c95ac5dee7bcc64575 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Wed, 12 Nov 2025 10:44:08 +0800
Subject: [PATCH 37/37] modify embedding sentence_transformers

---
 xinference/model/embedding/vllm/core.py | 47 +++++++++++++++++++++++--
 xinference/model/rerank/vllm/core.py    | 40 +++++++++++++++++++++
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/xinference/model/embedding/vllm/core.py b/xinference/model/embedding/vllm/core.py
index 674eeaa21e..c037ce2b53 100644
--- a/xinference/model/embedding/vllm/core.py
+++ b/xinference/model/embedding/vllm/core.py
@@ -89,6 +89,34 @@ def load(self):
                 is_matryoshka=True,
             )

+        # Set appropriate VLLM configuration parameters based on model capabilities
+        model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+        # Set max_model_len based on model family capabilities with reasonable limits
+        max_model_len = min(model_max_tokens, 8192)
+        if "max_model_len" not in self._kwargs:
+            self._kwargs["max_model_len"] = max_model_len
+
+        # Ensure max_num_batched_tokens is sufficient for large models
+        if "max_num_batched_tokens" not in self._kwargs:
+            # max_num_batched_tokens should be at least max_model_len
+            # Set to a reasonable minimum that satisfies the constraint
+            self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+        # Configure other reasonable defaults for embedding models
+        if "gpu_memory_utilization" not in self._kwargs:
+            self._kwargs["gpu_memory_utilization"] = 0.7
+
+        # Use a smaller block size for better compatibility
+        if "block_size" not in self._kwargs:
+            self._kwargs["block_size"] = 16
+
+        logger.debug(
+            f"VLLM configuration for {self.model_family.model_name}: "
+            f"max_model_len={self._kwargs.get('max_model_len')}, "
+            f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+        )
+
         self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()

@@ -246,6 +274,21 @@ def _set_context_length(self):
                 self._model.llm_engine.vllm_config.model_config.max_model_len
             )
         else:
-            # v1
-            logger.warning("vLLM v1 is not supported, ignore context length setting")
+            # v1 - Get max_model_len from the v1 engine configuration
+            try:
+                # For v1, access the config differently
+                if hasattr(self._model.llm_engine, "vllm_config"):
+                    self._context_length = (
+                        self._model.llm_engine.vllm_config.model_config.max_model_len
+                    )
+                elif hasattr(self._model.llm_engine, "model_config"):
+                    self._context_length = (
+                        self._model.llm_engine.model_config.max_model_len
+                    )
+                else:
+                    # Fallback to the configured value
+                    self._context_length = self._kwargs.get("max_model_len", 512)
+            except Exception as e:
+                logger.warning(f"Failed to get context length from vLLM v1 engine: {e}")
+                self._context_length = self._kwargs.get("max_model_len", 512)
         logger.debug("Model context length: %s", self._context_length)

diff --git a/xinference/model/rerank/vllm/core.py b/xinference/model/rerank/vllm/core.py
index 2c6d9dbeed..9729a2ccc7 100644
--- a/xinference/model/rerank/vllm/core.py
+++ b/xinference/model/rerank/vllm/core.py
@@ -1,4 +1,6 @@
 import importlib.util
+import json
+import logging
 import uuid
 from typing import List, Optional, Union

@@ -6,6 +8,8 @@
 from ...utils import cache_clean
 from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1

+logger = logging.getLogger(__name__)
+
 SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"]


@@ -67,6 +71,42 @@ def load(self):
                 classifier_from_token=["no", "yes"],
                 is_original_qwen3_reranker=True,
             )
+        elif isinstance(self._kwargs["hf_overrides"], str):
+            self._kwargs["hf_overrides"] = json.loads(self._kwargs["hf_overrides"])
+            self._kwargs["hf_overrides"].update(
+                architectures=["Qwen3ForSequenceClassification"],
+                classifier_from_token=["no", "yes"],
+                is_original_qwen3_reranker=True,
+            )
+
+        # Set appropriate VLLM configuration parameters based on model capabilities
+        model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+        # Set max_model_len based on model family capabilities with reasonable limits
+        max_model_len = min(model_max_tokens, 8192)
+        if "max_model_len" not in self._kwargs:
+            self._kwargs["max_model_len"] = max_model_len
+
+        # Ensure max_num_batched_tokens is sufficient for large models
+        if "max_num_batched_tokens" not in self._kwargs:
+            # max_num_batched_tokens should be at least max_model_len
+            # Set to a reasonable minimum that satisfies the constraint
+            self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+        # Configure other reasonable defaults for reranking models
+        if "gpu_memory_utilization" not in self._kwargs:
+            self._kwargs["gpu_memory_utilization"] = 0.7
+
+        # Use a smaller block size for better compatibility
+        if "block_size" not in self._kwargs:
+            self._kwargs["block_size"] = 16
+
+        logger.debug(
+            f"VLLM configuration for rerank model {self.model_family.model_name}: "
+            f"max_model_len={self._kwargs.get('max_model_len')}, "
+            f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+        )
+
         self._model = LLM(model=self._model_path, task="score", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
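
The load-time defaults added in patch 37 are the same for the embedding and rerank wrappers: clamp max_model_len to the model's context size, keep max_num_batched_tokens at or above it (the patch comments note it should be at least max_model_len), and only fill values the caller did not pass. A minimal sketch of that pattern, with apply_vllm_defaults as a hypothetical helper and family_max_tokens standing in for model_family.max_tokens:

def apply_vllm_defaults(kwargs: dict, family_max_tokens: int = 512) -> dict:
    # Clamp max_model_len to the model's context size, capped at 8192.
    max_model_len = min(family_max_tokens, 8192)
    kwargs.setdefault("max_model_len", max_model_len)
    # Keep max_num_batched_tokens >= max_model_len, with a floor of 4096.
    kwargs.setdefault("max_num_batched_tokens", max(4096, max_model_len))
    kwargs.setdefault("gpu_memory_utilization", 0.7)
    kwargs.setdefault("block_size", 16)
    return kwargs

# Caller-supplied values win, matching the "not in self._kwargs" guards above:
print(apply_vllm_defaults({"max_num_batched_tokens": 81920}, family_max_tokens=32768))
# {'max_num_batched_tokens': 81920, 'max_model_len': 8192,
#  'gpu_memory_utilization': 0.7, 'block_size': 16}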