This repository was archived by the owner on Sep 20, 2025. It is now read-only.

Commit 6fe7c21

fix: modify qwq reasoning output (#90)
* merge
* merge
* add Mistral-Small-3.1-24B-Instruct-2503
* modify qwq-32b deploy

1 parent 69c0fd5 · commit 6fe7c21

File tree: 10 files changed, +94 −5 lines


docs/en/best_deployment_practices.md

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,11 @@ This document provides examples of best practices for deploying models using EMD
 
 ## Famous Models
 
+### Mistral Small Series
+```
+emd deploy --model-id Mistral-Small-3.1-24B-Instruct-2503 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
+```
+
 ### Gemma 3 Series
 
 ```

docs/en/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@
 | gemma-3-4b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs ||
 | gemma-3-12b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs ||
 | gemma-3-27b-it | gemma3 | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs ||
+| Mistral-Small-3.1-24B-Instruct-2503 | mistral | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs ||
 | txt2video-LTX | comfyui | video | comfyui | g5.4xlarge,g5.8xlarge,g6e.2xlarge | sagemaker_async ||
 | whisper | whisper | whisper | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_async ||
 | bce-embedding-base_v1 | bce | embedding | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs ||

src/emd/models/engines.py

Lines changed: 18 additions & 0 deletions
@@ -77,6 +77,17 @@ class KtransformersEngine(OpenAICompitableEngine):
     }
 )
 
+
+vllm_mistral_small_engine082 = VllmEngine(
+    **{
+        **vllm_engine064.model_dump(),
+        "engine_dockerfile_config": {"VERSION":"v0.8.2"},
+        "dockerfile_name":"Dockerfile",
+        "default_cli_args": " --tokenizer-mode mistral --config-format mistral --load-format mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384",
+        "environment_variables": ""
+    }
+)
+
 vllm_deepseek_r1_distill_qwen_engine071 = VllmEngine(**{
     **vllm_engine064.model_dump(),
     "engine_dockerfile_config": {"VERSION":"v0.7.1"},
@@ -124,6 +135,13 @@ class KtransformersEngine(OpenAICompitableEngine):
     "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes"
 })
 
+vllm_qwq_engine082 = VllmEngine(**{
+    **vllm_qwen25vl72b_engine073.model_dump(),
+    "engine_dockerfile_config": {"VERSION":"v0.8.2"},
+    "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes --enable-reasoning --reasoning-parser deepseek_r1"
+})
+
 
 vllm_internvl2d5_76b_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
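The new `--enable-reasoning --reasoning-parser deepseek_r1` flags make the QwQ-32B server return its chain-of-thought separately from the final answer. Below is a minimal sketch of reading that split output through an OpenAI-compatible client; the base URL, API key, and the `reasoning_content` attribute access are assumptions about a vLLM-style endpoint, not part of this commit.

```python
# Minimal sketch: consuming the separated reasoning output of a QwQ-32B server
# launched with --enable-reasoning --reasoning-parser deepseek_r1.
# Assumptions: an OpenAI-compatible endpoint at the placeholder URL below and a
# vLLM-style extra "reasoning_content" field on the returned message.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")  # placeholder endpoint

response = client.chat.completions.create(
    model="QwQ-32B",
    messages=[{"role": "user", "content": "What is 17 * 23?"}],
)

message = response.choices[0].message
print("reasoning:", getattr(message, "reasoning_content", None))  # thinking segment, if the server returns one
print("answer:", message.content)                                 # final answer without the reasoning
```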

src/emd/models/llms/qwen.py

Lines changed: 3 additions & 2 deletions
@@ -7,7 +7,8 @@
     tgi_qwen2d5_on_inf2,
     tgi_qwen2d5_72b_on_inf2,
     vllm_qwen2d5_72b_engine064,
-    vllm_qwq_engine073
+    vllm_qwq_engine073,
+    vllm_qwq_engine082
 )
 from ..services import (
     sagemaker_service,
@@ -471,7 +472,7 @@
 Model.register(
     dict(
         model_id = "QwQ-32B",
-        supported_engines=[vllm_qwq_engine073],
+        supported_engines=[vllm_qwq_engine082],
         supported_instances=[
             g5d12xlarge_instance,
             g5d24xlarge_instance,

src/emd/models/model_series.py

Lines changed: 6 additions & 0 deletions
@@ -97,6 +97,12 @@
     reference_link="https://blog.google/technology/developers/gemma-3/"
 )
 
+MISTRAL_SERIES = ModelSeries(
+    model_series_name=ModelSeriesType.MISTRAL,
+    description="LLMs and VLMs provided by MISTRAL AI.",
+    reference_link="https://huggingface.co/mistralai"
+)
+
 DEEPSEEK_REASONING_MODEL = ModelSeries(
     model_series_name=ModelSeriesType.DEEPSEEK_REASONING_MODEL,
     description="DeepSeek-R1-Zero and DeepSeek-R1 are innovative reasoning models, with the former showcasing strong performance through reinforcement learning alone, while the latter enhances reasoning capabilities by incorporating cold-start data, achieving results comparable to OpenAI-o1 and setting new benchmarks with its distilled versions.",

src/emd/models/services.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@
     "ServiceType":"service_type",
     "EngineType":"engine_type",
     "Region": "region",
+    "DesiredCapacity": "desired_capacity",
     "ContainerCpu": "container_cpu",
     "ContainerMemory": "container_memory",
     "ContainerGpu":"instance_gpu_num"

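This hunk extends the mapping that translates CloudFormation-style parameter names into the snake_case keys used internally, so `DesiredCapacity` is now passed through as well. A small illustrative remapping is sketched below; the `cfn_parameters` input and helper code are hypothetical, only the key/value pairs come from the diff.

```python
# Illustrative only: remapping CloudFormation-style parameters to internal
# snake_case keys using the table from the hunk above. The surrounding helper
# code is hypothetical; the repository's actual remapping logic may differ.
name_mapping = {
    "ServiceType": "service_type",
    "EngineType": "engine_type",
    "Region": "region",
    "DesiredCapacity": "desired_capacity",
    "ContainerCpu": "container_cpu",
    "ContainerMemory": "container_memory",
    "ContainerGpu": "instance_gpu_num",
}

cfn_parameters = {"Region": "us-east-1", "DesiredCapacity": "2"}  # hypothetical input
internal = {name_mapping[k]: v for k, v in cfn_parameters.items() if k in name_mapping}
print(internal)  # {'region': 'us-east-1', 'desired_capacity': '2'}
```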
src/emd/models/utils/constants.py

Lines changed: 1 addition & 0 deletions
@@ -214,6 +214,7 @@ def get_service_quota_code(cls, instance_type: str):
 
 class ModelSeriesType(ConstantBase):
     GEMMA3 = "gemma3"
+    MISTRAL = "mistral"
     QWEN2D5 = "qwen2.5"
     GLM4 = "glm4"
     INTERLM2d5 = "internlm2.5"

src/emd/models/vlms/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 from . import qwen
 from . import internvl
 from . import gemma3
+from . import mistral

src/emd/models/vlms/mistral.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+from ..engines import vllm_mistral_small_engine082
+from .. import Model
+from ..frameworks import fastapi_framework
+from ..services import (
+    sagemaker_service,
+    sagemaker_async_service,
+    ecs_service,
+    local_service
+)
+from emd.models.utils.constants import ModelType
+from ..model_series import MISTRAL_SERIES
+from ..instances import (
+    g5d2xlarge_instance,
+    g5d4xlarge_instance,
+    g5d8xlarge_instance,
+    g5d12xlarge_instance,
+    g5d16xlarge_instance,
+    g5d24xlarge_instance,
+    g5d48xlarge_instance,
+    g6e2xlarge_instance,
+    local_instance
+)
+from ..utils.constants import ModelFilesDownloadSource
+
+
+Model.register(
+    dict(
+        model_id = "Mistral-Small-3.1-24B-Instruct-2503",
+        supported_engines=[vllm_mistral_small_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        huggingface_model_id="unsloth/Mistral-Small-3.1-24B-Instruct-2503",
+        # require_huggingface_token=False,
+        modelscope_model_id="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        # model_files_download_source=ModelFilesDownloadSource.MODELSCOPE,
+        application_scenario="vision llms for image understanding",
+        description="The latest series of mistral small",
+        model_type=ModelType.VLM,
+        model_series=MISTRAL_SERIES,
+    )
+)
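Since the model is registered as a VLM and its engine config allows up to four images per prompt (`--limit-mm-per-prompt 'image=4'`), a deployed endpoint should accept image inputs in the OpenAI-compatible chat format. The sketch below illustrates such a request; the base URL, API key, and image URL are placeholders and assume a vLLM-style OpenAI-compatible endpoint rather than the exact client shipped with EMD.

```python
# Sketch of an image-understanding request to a deployed
# Mistral-Small-3.1-24B-Instruct-2503 endpoint, assuming a vLLM-style
# OpenAI-compatible API. Base URL, API key, and image URL are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")  # placeholder endpoint

response = client.chat.completions.create(
    model="Mistral-Small-3.1-24B-Instruct-2503",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe what is in this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}},  # placeholder image
        ],
    }],
    max_tokens=256,
)

print(response.choices[0].message.content)
```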

src/pipeline/backend/backend.py

Lines changed: 4 additions & 3 deletions
@@ -134,9 +134,10 @@ def start_server(self, server_start_command):
         logger.info(f"Starting {self.engine_type} server with command: {server_start_command}")
         t = threading.Thread(target=os.system,args=(server_start_command,),daemon=True)
         t.start()
-        t2 = threading.Thread(target=self.check_model_serve_ready,args=(t, "127.0.0.1", self.server_port),daemon=True)
-        t2.start()
-        t2.join()
+        self.check_model_serve_ready(t, "127.0.0.1", self.server_port)
+        logger.info(f"Server started successfully.")
+        # t2.start()
+        # t2.join()
         return
 
 
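The rewritten `start_server` now blocks on `check_model_serve_ready` directly instead of joining a second thread, and only logs success once the server answers. A hypothetical readiness poll with the same blocking semantics is sketched below; the function name, timeout, and socket-based probe are illustrative and not the repository's actual implementation.

```python
# Hypothetical readiness poll with the same blocking semantics as the change
# above: return only once the model server accepts connections, or raise on
# timeout. Names and timings are illustrative.
import socket
import time

def wait_until_ready(host: str, port: int, timeout_s: float = 600.0, interval_s: float = 2.0) -> None:
    """Block until a TCP connection to host:port succeeds or the timeout expires."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=interval_s):
                return  # server is accepting connections
        except OSError:
            time.sleep(interval_s)  # not up yet; retry
    raise TimeoutError(f"model server on {host}:{port} not ready after {timeout_s}s")
```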