From 8d1b0c2b2224614c983fb02ec0348db9e685bfe7 Mon Sep 17 00:00:00 2001
From: zhouxss
Date: Tue, 25 Mar 2025 03:47:27 +0000
Subject: [PATCH 1/4] merge

---
 docs/en/supported_models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/supported_models.md b/docs/en/supported_models.md
index e8749d1d..a1d94a8a 100644
--- a/docs/en/supported_models.md
+++ b/docs/en/supported_models.md
@@ -52,4 +52,4 @@
 | jina-embeddings-v3 | jina | embedding | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | bge-reranker-v2-m3 | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | bge-reranker-large | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
-| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
+| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
\ No newline at end of file

From ef01e39d77fd3e9fdba4bd89943244f01729d989 Mon Sep 17 00:00:00 2001
From: zhouxss
Date: Tue, 25 Mar 2025 03:52:23 +0000
Subject: [PATCH 2/4] merge

---
 docs/en/supported_models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/supported_models.md b/docs/en/supported_models.md
index a1d94a8a..e8749d1d 100644
--- a/docs/en/supported_models.md
+++ b/docs/en/supported_models.md
@@ -52,4 +52,4 @@
 | jina-embeddings-v3 | jina | embedding | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | bge-reranker-v2-m3 | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
 | bge-reranker-large | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
-| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
\ No newline at end of file
+| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |

From 2845bc357fdca586143bf1738c432501b754a4e2 Mon Sep 17 00:00:00 2001
From: zhouxss
Date: Thu, 27 Mar 2025 08:14:39 +0000
Subject: [PATCH 3/4] add Mistral-Small-3.1-24B-Instruct-2503

---
 docs/en/best_deployment_practices.md |  5 +++
 docs/en/supported_models.md          |  1 +
 src/emd/models/engines.py            | 11 ++++++
 src/emd/models/model_series.py       |  6 ++++
 src/emd/models/utils/constants.py    |  1 +
 src/emd/models/vlms/__init__.py      |  1 +
 src/emd/models/vlms/mistral.py       | 54 ++++++++++++++++++++++++++++
 7 files changed, 79 insertions(+)
 create mode 100644 src/emd/models/vlms/mistral.py

diff --git a/docs/en/best_deployment_practices.md b/docs/en/best_deployment_practices.md
index 3a66bfb8..26fae4d3 100644
--- a/docs/en/best_deployment_practices.md
+++ b/docs/en/best_deployment_practices.md
@@ -4,6 +4,11 @@ This document provides examples of best practices for deploying models using EMD
 
 ## Famous Models
 
+### Mistral Small Series
+```
+emd deploy --model-id Mistral-Small-3.1-24B-Instruct-2503 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
+```
+
 ### Gemma 3 Series
 
 ```
diff --git a/docs/en/supported_models.md b/docs/en/supported_models.md
index e8749d1d..0ecfeca7 100644
--- a/docs/en/supported_models.md
+++ b/docs/en/supported_models.md
@@ -44,6 +44,7 @@
 | gemma-3-4b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
 | gemma-3-12b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
 | gemma-3-27b-it | gemma3 | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
+| Mistral-Small-3.1-24B-Instruct-2503 | mistral | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
 | txt2video-LTX | comfyui | video | comfyui | g5.4xlarge,g5.8xlarge,g6e.2xlarge | sagemaker_async | ❎ |
 | whisper | whisper | whisper | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_async | ❎ |
 | bce-embedding-base_v1 | bce | embedding | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index f251071a..e748ab83 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -77,6 +77,17 @@ class KtransformersEngine(OpenAICompitableEngine):
     }
 )
 
+
+vllm_mistral_small_engine082 = VllmEngine(
+    **{
+        **vllm_engine064.model_dump(),
+        "engine_dockerfile_config": {"VERSION":"v0.8.2"},
+        "dockerfile_name":"Dockerfile",
+        "default_cli_args": " --tokenizer-mode mistral --config-format mistral --load-format mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384",
+        "environment_variables": ""
+    }
+)
+
 vllm_deepseek_r1_distill_qwen_engine071 = VllmEngine(**{
     **vllm_engine064.model_dump(),
     "engine_dockerfile_config": {"VERSION":"v0.7.1"},
diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py
index 09062998..ceae13ad 100644
--- a/src/emd/models/model_series.py
+++ b/src/emd/models/model_series.py
@@ -97,6 +97,12 @@
     reference_link="https://blog.google/technology/developers/gemma-3/"
 )
 
+MISTRAL_SERIES = ModelSeries(
+    model_series_name=ModelSeriesType.MISTRAL,
+    description="LLMs and VLMs provided by Mistral AI.",
+    reference_link="https://huggingface.co/mistralai"
+)
+
 DEEPSEEK_REASONING_MODEL = ModelSeries(
     model_series_name=ModelSeriesType.DEEPSEEK_REASONING_MODEL,
     description="DeepSeek-R1-Zero and DeepSeek-R1 are innovative reasoning models, with the former showcasing strong performance through reinforcement learning alone, while the latter enhances reasoning capabilities by incorporating cold-start data, achieving results comparable to OpenAI-o1 and setting new benchmarks with its distilled versions.",
diff --git a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py
index b9c21b49..d78414e6 100644
--- a/src/emd/models/utils/constants.py
+++ b/src/emd/models/utils/constants.py
@@ -214,6 +214,7 @@ def get_service_quota_code(cls, instance_type: str):
 
 class ModelSeriesType(ConstantBase):
     GEMMA3 = "gemma3"
+    MISTRAL = "mistral"
     QWEN2D5 = "qwen2.5"
     GLM4 = "glm4"
     INTERLM2d5 = "internlm2.5"
diff --git a/src/emd/models/vlms/__init__.py b/src/emd/models/vlms/__init__.py
index bf74f45c..4440a29e 100644
--- a/src/emd/models/vlms/__init__.py
+++ b/src/emd/models/vlms/__init__.py
@@ -1,3 +1,4 @@
 from . import qwen
 from . import internvl
 from . import gemma3
+from . import mistral
diff --git a/src/emd/models/vlms/mistral.py b/src/emd/models/vlms/mistral.py
new file mode 100644
index 00000000..fc597105
--- /dev/null
+++ b/src/emd/models/vlms/mistral.py
@@ -0,0 +1,54 @@
+from ..engines import vllm_mistral_small_engine082
+from .. import Model
+from ..frameworks import fastapi_framework
+from ..services import (
+    sagemaker_service,
+    sagemaker_async_service,
+    ecs_service,
+    local_service
+)
+from emd.models.utils.constants import ModelType
+from ..model_series import MISTRAL_SERIES
+from ..instances import (
+    g5d2xlarge_instance,
+    g5d4xlarge_instance,
+    g5d8xlarge_instance,
+    g5d12xlarge_instance,
+    g5d16xlarge_instance,
+    g5d24xlarge_instance,
+    g5d48xlarge_instance,
+    g6e2xlarge_instance,
+    local_instance
+)
+from ..utils.constants import ModelFilesDownloadSource
+
+
+Model.register(
+    dict(
+        model_id = "Mistral-Small-3.1-24B-Instruct-2503",
+        supported_engines=[vllm_mistral_small_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        huggingface_model_id="unsloth/Mistral-Small-3.1-24B-Instruct-2503",
+        # require_huggingface_token=False,
+        modelscope_model_id="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        # model_files_download_source=ModelFilesDownloadSource.MODELSCOPE,
+        application_scenario="vision LLMs for image understanding",
+        description="The latest generation of the Mistral Small series",
+        model_type=ModelType.VLM,
+        model_series=MISTRAL_SERIES,
+    )
+)

From d72b12b22282e6e67dc069f99fdd3ee59e25bcbd Mon Sep 17 00:00:00 2001
From: zhouxss
Date: Tue, 1 Apr 2025 08:37:49 +0000
Subject: [PATCH 4/4] modify qwq-32b deploy

---
 src/emd/models/engines.py       | 7 +++++++
 src/emd/models/llms/qwen.py     | 5 +++--
 src/emd/models/services.py      | 1 +
 src/pipeline/backend/backend.py | 7 ++++---
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index e748ab83..0dc243f7 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -135,6 +135,13 @@ class KtransformersEngine(OpenAICompitableEngine):
     "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes"
 })
 
+vllm_qwq_engine082 = VllmEngine(**{
+    **vllm_qwen25vl72b_engine073.model_dump(),
+    "engine_dockerfile_config": {"VERSION":"v0.8.2"},
+    "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes --enable-reasoning --reasoning-parser deepseek_r1"
+})
+
 vllm_internvl2d5_76b_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
diff --git a/src/emd/models/llms/qwen.py b/src/emd/models/llms/qwen.py
index fa48ce79..35a2cc1f 100644
--- a/src/emd/models/llms/qwen.py
+++ b/src/emd/models/llms/qwen.py
@@ -7,7 +7,8 @@
     tgi_qwen2d5_on_inf2,
     tgi_qwen2d5_72b_on_inf2,
     vllm_qwen2d5_72b_engine064,
-    vllm_qwq_engine073
+    vllm_qwq_engine073,
+    vllm_qwq_engine082
 )
 from ..services import (
     sagemaker_service,
@@ -471,7 +472,7 @@
 Model.register(
     dict(
         model_id = "QwQ-32B",
-        supported_engines=[vllm_qwq_engine073],
+        supported_engines=[vllm_qwq_engine082],
         supported_instances=[
             g5d12xlarge_instance,
             g5d24xlarge_instance,
diff --git a/src/emd/models/services.py b/src/emd/models/services.py
index 05737773..859a12c9 100644
--- a/src/emd/models/services.py
+++ b/src/emd/models/services.py
@@ -91,6 +91,7 @@
         "ServiceType":"service_type",
         "EngineType":"engine_type",
         "Region": "region",
+        "DesiredCapacity": "desired_capacity",
         "ContainerCpu": "container_cpu",
         "ContainerMemory": "container_memory",
         "ContainerGpu":"instance_gpu_num"
diff --git a/src/pipeline/backend/backend.py b/src/pipeline/backend/backend.py
index fc46d39d..4996c97a 100644
--- a/src/pipeline/backend/backend.py
+++ b/src/pipeline/backend/backend.py
@@ -134,9 +134,10 @@ def start_server(self, server_start_command):
         logger.info(f"Starting {self.engine_type} server with command: {server_start_command}")
         t = threading.Thread(target=os.system,args=(server_start_command,),daemon=True)
         t.start()
-        t2 = threading.Thread(target=self.check_model_serve_ready,args=(t, "127.0.0.1", self.server_port),daemon=True)
-        t2.start()
-        t2.join()
+        self.check_model_serve_ready(t, "127.0.0.1", self.server_port)
+        logger.info("Server started successfully.")
+        # t2.start()
+        # t2.join()
         return