diff --git a/docs/en/best_deployment_practices.md b/docs/en/best_deployment_practices.md
index 3a66bfb8..26fae4d3 100644
--- a/docs/en/best_deployment_practices.md
+++ b/docs/en/best_deployment_practices.md
@@ -4,6 +4,11 @@ This document provides examples of best practices for deploying models using EMD
 
 ## Famous Models
 
+### Mistral Small Series
+```
+emd deploy --model-id Mistral-Small-3.1-24B-Instruct-2503 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
+```
+
 ### Gemma 3 Series
 
 ```
diff --git a/docs/en/supported_models.md b/docs/en/supported_models.md
index e8749d1d..0ecfeca7 100644
--- a/docs/en/supported_models.md
+++ b/docs/en/supported_models.md
@@ -44,6 +44,7 @@
 | gemma-3-4b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
 | gemma-3-12b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
 | gemma-3-27b-it | gemma3 | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
+| Mistral-Small-3.1-24B-Instruct-2503 | mistral | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
 | txt2video-LTX | comfyui | video | comfyui | g5.4xlarge,g5.8xlarge,g6e.2xlarge | sagemaker_async | ❎ |
 | whisper | whisper | whisper | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_async | ❎ |
 | bce-embedding-base_v1 | bce | embedding | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index f251071a..0dc243f7 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -77,6 +77,17 @@ class KtransformersEngine(OpenAICompitableEngine):
     }
 )
 
+
+vllm_mistral_small_engine082 = VllmEngine(
+    **{
+        **vllm_engine064.model_dump(),
+        "engine_dockerfile_config": {"VERSION":"v0.8.2"},
+        "dockerfile_name":"Dockerfile",
+        "default_cli_args": " --tokenizer-mode mistral --config-format mistral --load-format mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384",
+        "environment_variables": ""
+    }
+)
+
 vllm_deepseek_r1_distill_qwen_engine071 = VllmEngine(**{
     **vllm_engine064.model_dump(),
     "engine_dockerfile_config": {"VERSION":"v0.7.1"},
@@ -124,6 +135,13 @@
     "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes"
 })
 
+vllm_qwq_engine082 = VllmEngine(**{
+    **vllm_qwen25vl72b_engine073.model_dump(),
+    "engine_dockerfile_config": {"VERSION":"v0.8.2"},
+    "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes --enable-reasoning --reasoning-parser deepseek_r1"
+})
+
 vllm_internvl2d5_76b_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
diff --git a/src/emd/models/llms/qwen.py b/src/emd/models/llms/qwen.py
index fa48ce79..35a2cc1f 100644
--- a/src/emd/models/llms/qwen.py
+++ b/src/emd/models/llms/qwen.py
@@ -7,7 +7,8 @@
     tgi_qwen2d5_on_inf2,
     tgi_qwen2d5_72b_on_inf2,
     vllm_qwen2d5_72b_engine064,
-    vllm_qwq_engine073
+    vllm_qwq_engine073,
+    vllm_qwq_engine082
 )
 from ..services import (
     sagemaker_service,
@@ -471,7 +472,7 @@
     dict(
         model_id = "QwQ-32B",
-        supported_engines=[vllm_qwq_engine073],
+        supported_engines=[vllm_qwq_engine082],
         supported_instances=[
             g5d12xlarge_instance,
             g5d24xlarge_instance,
diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py
index 09062998..ceae13ad 100644
--- a/src/emd/models/model_series.py
+++ b/src/emd/models/model_series.py
@@ -97,6 +97,12 @@
     reference_link="https://blog.google/technology/developers/gemma-3/"
 )
 
+MISTRAL_SERIES = ModelSeries(
+    model_series_name=ModelSeriesType.MISTRAL,
+    description="LLMs and VLMs provided by Mistral AI.",
+    reference_link="https://huggingface.co/mistralai"
+)
+
 DEEPSEEK_REASONING_MODEL = ModelSeries(
     model_series_name=ModelSeriesType.DEEPSEEK_REASONING_MODEL,
     description="DeepSeek-R1-Zero and DeepSeek-R1 are innovative reasoning models, with the former showcasing strong performance through reinforcement learning alone, while the latter enhances reasoning capabilities by incorporating cold-start data, achieving results comparable to OpenAI-o1 and setting new benchmarks with its distilled versions.",
diff --git a/src/emd/models/services.py b/src/emd/models/services.py
index 05737773..859a12c9 100644
--- a/src/emd/models/services.py
+++ b/src/emd/models/services.py
@@ -91,6 +91,7 @@
         "ServiceType":"service_type",
         "EngineType":"engine_type",
         "Region": "region",
+        "DesiredCapacity": "desired_capacity",
         "ContainerCpu": "container_cpu",
         "ContainerMemory": "container_memory",
         "ContainerGpu":"instance_gpu_num"
diff --git a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py
index b9c21b49..d78414e6 100644
--- a/src/emd/models/utils/constants.py
+++ b/src/emd/models/utils/constants.py
@@ -214,6 +214,7 @@ def get_service_quota_code(cls, instance_type: str):
 
 class ModelSeriesType(ConstantBase):
     GEMMA3 = "gemma3"
+    MISTRAL = "mistral"
     QWEN2D5 = "qwen2.5"
     GLM4 = "glm4"
     INTERLM2d5 = "internlm2.5"
diff --git a/src/emd/models/vlms/__init__.py b/src/emd/models/vlms/__init__.py
index bf74f45c..4440a29e 100644
--- a/src/emd/models/vlms/__init__.py
+++ b/src/emd/models/vlms/__init__.py
@@ -1,3 +1,4 @@
 from . import qwen
 from . import internvl
 from . import gemma3
+from . import mistral
diff --git a/src/emd/models/vlms/mistral.py b/src/emd/models/vlms/mistral.py
new file mode 100644
index 00000000..fc597105
--- /dev/null
+++ b/src/emd/models/vlms/mistral.py
@@ -0,0 +1,54 @@
+from ..engines import vllm_mistral_small_engine082
+from .. import Model
+from ..frameworks import fastapi_framework
+from ..services import (
+    sagemaker_service,
+    sagemaker_async_service,
+    ecs_service,
+    local_service
+)
+from emd.models.utils.constants import ModelType
+from ..model_series import MISTRAL_SERIES
+from ..instances import (
+    g5d2xlarge_instance,
+    g5d4xlarge_instance,
+    g5d8xlarge_instance,
+    g5d12xlarge_instance,
+    g5d16xlarge_instance,
+    g5d24xlarge_instance,
+    g5d48xlarge_instance,
+    g6e2xlarge_instance,
+    local_instance
+)
+from ..utils.constants import ModelFilesDownloadSource
+
+
+Model.register(
+    dict(
+        model_id = "Mistral-Small-3.1-24B-Instruct-2503",
+        supported_engines=[vllm_mistral_small_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        huggingface_model_id="unsloth/Mistral-Small-3.1-24B-Instruct-2503",
+        # require_huggingface_token=False,
+        modelscope_model_id="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        # model_files_download_source=ModelFilesDownloadSource.MODELSCOPE,
+        application_scenario="vision llms for image understanding",
+        description="Mistral Small 3.1 (24B), the latest multimodal instruct model in the Mistral Small series",
+        model_type=ModelType.VLM,
+        model_series=MISTRAL_SERIES,
+    )
+)
diff --git a/src/pipeline/backend/backend.py b/src/pipeline/backend/backend.py
index fc46d39d..4996c97a 100644
--- a/src/pipeline/backend/backend.py
+++ b/src/pipeline/backend/backend.py
@@ -134,9 +134,10 @@ def start_server(self, server_start_command):
         logger.info(f"Starting {self.engine_type} server with command: {server_start_command}")
         t = threading.Thread(target=os.system,args=(server_start_command,),daemon=True)
         t.start()
-        t2 = threading.Thread(target=self.check_model_serve_ready,args=(t, "127.0.0.1", self.server_port),daemon=True)
-        t2.start()
-        t2.join()
+        self.check_model_serve_ready(t, "127.0.0.1", self.server_port)
+        logger.info("Server started successfully.")
+        # t2.start()
+        # t2.join()
         return
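
For quick verification after deployment, vLLM serves an OpenAI-compatible API, so a deployed Mistral-Small-3.1-24B-Instruct-2503 endpoint can typically be smoke-tested with a multimodal chat request. The sketch below is illustrative only: the base URL, API key, and image URL are placeholders rather than values defined by this change, and the actual invocation path depends on the service type chosen at deployment (SageMaker, ECS, or local).

```
# Illustrative smoke test against an OpenAI-compatible endpoint served by the
# vllm_mistral_small_engine082 engine. base_url, api_key, and the image URL are
# placeholders, not values defined by this change.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8080/v1",  # placeholder endpoint
    api_key="EMPTY",                      # vLLM's OpenAI-compatible server accepts any key unless --api-key is set
)

response = client.chat.completions.create(
    model="Mistral-Small-3.1-24B-Instruct-2503",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}},
            ],
        }
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```

Note that the engine's default CLI args cap the context window at 16384 tokens and allow at most four images per prompt (`--limit-mm-per-prompt 'image=4'`).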