5 changes: 5 additions & 0 deletions docs/en/best_deployment_practices.md
@@ -4,6 +4,11 @@ This document provides examples of best practices for deploying models using EMD

## Famous Models

### Mistral Small Series
```
emd deploy --model-id Mistral-Small-3.1-24B-Instruct-2503 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
```
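
Once the endpoint is in service, it can be invoked like any other SageMaker real-time endpoint. A minimal sketch using `boto3`, assuming the deployed container exposes vLLM's OpenAI-compatible chat schema; the region and endpoint name below are placeholders (EMD derives the real endpoint name at deploy time):

```python
import json

import boto3

# Placeholders: region and endpoint name depend on your deployment.
client = boto3.client("sagemaker-runtime", region_name="us-east-1")

payload = {
    "messages": [
        {"role": "user", "content": "Summarize Mistral Small 3.1 in one sentence."}
    ],
    "max_tokens": 128,
}

response = client.invoke_endpoint(
    EndpointName="EMD-Mistral-Small-3-1-24B-Instruct-2503",  # hypothetical name
    ContentType="application/json",
    Body=json.dumps(payload),
)
body = json.loads(response["Body"].read())
print(body["choices"][0]["message"]["content"])
```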

### Gemma 3 Series

1 change: 1 addition & 0 deletions docs/en/supported_models.md
@@ -44,6 +44,7 @@
| gemma-3-4b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
| gemma-3-12b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
| gemma-3-27b-it | gemma3 | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
| Mistral-Small-3.1-24B-Instruct-2503 | mistral | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ |
| txt2video-LTX | comfyui | video | comfyui | g5.4xlarge,g5.8xlarge,g6e.2xlarge | sagemaker_async | ❎ |
| whisper | whisper | whisper | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_async | ❎ |
| bce-embedding-base_v1 | bce | embedding | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ |
18 changes: 18 additions & 0 deletions src/emd/models/engines.py
@@ -77,6 +77,17 @@ class KtransformersEngine(OpenAICompitableEngine):
}
)


vllm_mistral_small_engine082 = VllmEngine(
**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.2"},
"dockerfile_name":"Dockerfile",
"default_cli_args": " --tokenizer-mode mistral --config-format mistral --load-format mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384",
"environment_variables": ""
}
)
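
This follows the dump-and-override idiom used throughout `engines.py`: `model_dump()` serializes the base engine to a dict, and the keys listed after it replace just those fields. A self-contained sketch of the idiom with pydantic (illustrative field names, not the real `VllmEngine` schema):

```python
from pydantic import BaseModel


class Engine(BaseModel):
    version: str
    default_cli_args: str = ""
    environment_variables: str = ""


base = Engine(version="v0.6.4", default_cli_args=" --max-model-len 8192")

# Later keys in the merged dict win, so only the listed fields change;
# everything else is inherited from the base engine unchanged.
derived = Engine(**{
    **base.model_dump(),
    "version": "v0.8.2",
    "default_cli_args": " --tokenizer-mode mistral --max-model-len 16384",
})

assert derived.version == "v0.8.2"
assert derived.environment_variables == base.environment_variables
```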

vllm_deepseek_r1_distill_qwen_engine071 = VllmEngine(**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.7.1"},
@@ -124,6 +135,13 @@ class KtransformersEngine(OpenAICompitableEngine):
"default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes"
})

vllm_qwq_engine082 = VllmEngine(**{
**vllm_qwen25vl72b_engine073.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.2"},
"environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes --enable-reasoning --reasoning-parser deepseek_r1"
})
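
The new `--enable-reasoning --reasoning-parser deepseek_r1` flags make vLLM's OpenAI-compatible server split the model's chain of thought from its final answer. A sketch of reading both parts with the `openai` client; the base URL and key are placeholders for the deployed endpoint:

```python
from openai import OpenAI

# Placeholders: point base_url at the deployed OpenAI-compatible endpoint.
client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="QwQ-32B",
    messages=[{"role": "user", "content": "What is 17 * 23?"}],
)

message = resp.choices[0].message
# With a reasoning parser enabled, vLLM returns the chain of thought in
# `reasoning_content` and the final answer in `content`.
print(getattr(message, "reasoning_content", None))
print(message.content)
```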


vllm_internvl2d5_76b_engine064 = VllmEngine(**{
**vllm_engine064.model_dump(),
5 changes: 3 additions & 2 deletions src/emd/models/llms/qwen.py
@@ -7,7 +7,8 @@
tgi_qwen2d5_on_inf2,
tgi_qwen2d5_72b_on_inf2,
vllm_qwen2d5_72b_engine064,
vllm_qwq_engine073
vllm_qwq_engine073,
vllm_qwq_engine082
)
from ..services import (
sagemaker_service,
@@ -471,7 +472,7 @@
Model.register(
dict(
model_id = "QwQ-32B",
supported_engines=[vllm_qwq_engine073],
supported_engines=[vllm_qwq_engine082],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
6 changes: 6 additions & 0 deletions src/emd/models/model_series.py
@@ -97,6 +97,12 @@
reference_link="https://blog.google/technology/developers/gemma-3/"
)

MISTRAL_SERIES = ModelSeries(
model_series_name=ModelSeriesType.MISTRAL,
description="LLMs and VLMs provided by MISTRAL AI.",
reference_link="https://huggingface.co/mistralai"
)

DEEPSEEK_REASONING_MODEL = ModelSeries(
model_series_name=ModelSeriesType.DEEPSEEK_REASONING_MODEL,
description="DeepSeek-R1-Zero and DeepSeek-R1 are innovative reasoning models, with the former showcasing strong performance through reinforcement learning alone, while the latter enhances reasoning capabilities by incorporating cold-start data, achieving results comparable to OpenAI-o1 and setting new benchmarks with its distilled versions.",
1 change: 1 addition & 0 deletions src/emd/models/services.py
@@ -91,6 +91,7 @@
"ServiceType":"service_type",
"EngineType":"engine_type",
"Region": "region",
"DesiredCapacity": "desired_capacity",
"ContainerCpu": "container_cpu",
"ContainerMemory": "container_memory",
"ContainerGpu":"instance_gpu_num"
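
The new `DesiredCapacity` entry extends the mapping that translates CloudFormation-style parameter names into the snake_case attributes used internally. A minimal sketch of that translation (assumed shape, not the repo's actual helper):

```python
# Assumed subset of the mapping from src/emd/models/services.py.
PARAM_NAME_MAP = {
    "ServiceType": "service_type",
    "EngineType": "engine_type",
    "Region": "region",
    "DesiredCapacity": "desired_capacity",
}


def remap_params(cfn_params: dict) -> dict:
    """Translate CloudFormation parameter keys to internal field names."""
    return {PARAM_NAME_MAP.get(key, key): value for key, value in cfn_params.items()}


print(remap_params({"DesiredCapacity": 2, "Region": "us-east-1"}))
# {'desired_capacity': 2, 'region': 'us-east-1'}
```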
1 change: 1 addition & 0 deletions src/emd/models/utils/constants.py
@@ -214,6 +214,7 @@ def get_service_quota_code(cls, instance_type: str):

class ModelSeriesType(ConstantBase):
GEMMA3 = "gemma3"
MISTRAL = "mistral"
QWEN2D5 = "qwen2.5"
GLM4 = "glm4"
INTERLM2d5 = "internlm2.5"
1 change: 1 addition & 0 deletions src/emd/models/vlms/__init__.py
@@ -1,3 +1,4 @@
from . import qwen
from . import internvl
from . import gemma3
from . import mistral
54 changes: 54 additions & 0 deletions src/emd/models/vlms/mistral.py
@@ -0,0 +1,54 @@
from ..engines import vllm_mistral_small_engine082
from .. import Model
from ..frameworks import fastapi_framework
from ..services import (
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
)
from emd.models.utils.constants import ModelType
from ..model_series import MISTRAL_SERIES
from ..instances import (
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d12xlarge_instance,
g5d16xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
g6e2xlarge_instance,
local_instance
)
from ..utils.constants import ModelFilesDownloadSource


Model.register(
dict(
model_id = "Mistral-Small-3.1-24B-Instruct-2503",
supported_engines=[vllm_mistral_small_engine082],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
huggingface_model_id="unsloth/Mistral-Small-3.1-24B-Instruct-2503",
# require_huggingface_token=False,
modelscope_model_id="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
# model_files_download_source=ModelFilesDownloadSource.MODELSCOPE,
application_scenario="vision llms for image understanding",
description="The latest series of mistral small",
model_type=ModelType.VLM,
model_series=MISTRAL_SERIES,
)
)
7 changes: 4 additions & 3 deletions src/pipeline/backend/backend.py
@@ -134,9 +134,10 @@ def start_server(self, server_start_command):
logger.info(f"Starting {self.engine_type} server with command: {server_start_command}")
t = threading.Thread(target=os.system,args=(server_start_command,),daemon=True)
t.start()
t2 = threading.Thread(target=self.check_model_serve_ready,args=(t, "127.0.0.1", self.server_port),daemon=True)
t2.start()
t2.join()
self.check_model_serve_ready(t, "127.0.0.1", self.server_port)
logger.info(f"Server started successfully.")
# t2.start()
# t2.join()
return
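
The readiness check now blocks the calling thread directly instead of running in a second thread that is immediately joined; the behavior is the same, with one less moving part. For context, a readiness probe of this shape typically polls the server socket until it accepts connections (an assumed sketch; the repo's `check_model_serve_ready` may differ):

```python
import socket
import time


def check_model_serve_ready(server_thread, host: str, port: int, interval: float = 5.0) -> None:
    """Block until host:port accepts a TCP connection (assumed implementation)."""
    while server_thread.is_alive():
        try:
            with socket.create_connection((host, port), timeout=2.0):
                return  # server is up and accepting connections
        except OSError:
            time.sleep(interval)
    raise RuntimeError("server thread exited before the endpoint became ready")
```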

