This repository was archived by the owner on Sep 20, 2025. It is now read-only.
Merged
README.md (2 additions, 1 deletion)
@@ -15,7 +15,8 @@
</p>

## 🔥 Latest News

- 2025-04-29: Deploy Qwen 3 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md#qwen-3-series).
- 2025-04-21: Deploy GLM Z1/0414 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md#glm-z10414-series).
- 2025-03-17: Deploy Gemma 3 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md#gemma-3-series).
- 2025-03-06: Deploy QwQ-32B with [one command line](docs/en/best_deployment_practices.md#qwq-32b).

docs/en/best_deployment_practices.md (17 additions, 0 deletions)
@@ -3,6 +3,23 @@
This document provides examples of best practices for deploying models using EMD for various use cases.

## Famous Models
### Qwen 3 Series
```
emd deploy --model-id Qwen3-30B-A3B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime

emd deploy --model-id Qwen3-32B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime

emd deploy --model-id Qwen3-8B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
```
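
Once a Qwen 3 endpoint is up, it can be called through the SageMaker runtime. Below is a minimal invocation sketch in Python, assuming the endpoint name matches the model ID and that the container accepts an OpenAI-style chat payload (as vLLM-backed deployments typically do); the exact endpoint name and payload schema produced by EMD may differ.

```
# Invocation sketch -- endpoint name and payload schema are assumptions.
import json
import boto3

runtime = boto3.client("sagemaker-runtime")

payload = {
    "messages": [{"role": "user", "content": "Briefly introduce yourself."}],
    "max_tokens": 256,
}

response = runtime.invoke_endpoint(
    EndpointName="Qwen3-8B",  # hypothetical; check the endpoint EMD actually creates
    ContentType="application/json",
    Body=json.dumps(payload),
)
print(json.loads(response["Body"].read()))
```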


### GLM Z1/0414 Series
```
emd deploy --model-id GLM-Z1-32B-0414 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime

emd deploy --model-id GLM-4-32B-0414 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
```


### Mistral Small Series
```
src/emd/cfn/sagemaker_realtime/template.yaml (5 additions, 1 deletion)
@@ -26,6 +26,10 @@ Parameters:
Region:
Type: String
Description: The region to be used for the SageMaker Endpoint
MinCapacity:
Type: Number
Description: The minimum capacity of the endpoint
Default: 1
MaxCapacity:
Type: Number
Description: The maximum capacity of the endpoint
@@ -117,7 +121,7 @@ Resources:
Type: AWS::ApplicationAutoScaling::ScalableTarget
Properties:
MaxCapacity: !Ref MaxCapacity
MinCapacity: 1
MinCapacity: !Ref MinCapacity
RoleARN: !GetAtt ExecutionRole.Arn
ResourceId: !Sub "endpoint/${SageMakerEndpoint.EndpointName}/variant/AllTraffic"
ScalableDimension: "sagemaker:variant:DesiredInstanceCount"
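
The new MinCapacity parameter makes the autoscaling floor configurable per deployment instead of hard-coded at 1. Below is a minimal sketch of setting it at stack creation with boto3, assuming the template is deployed directly; EMD normally drives this template itself, and the template's remaining required parameters are abbreviated here.

```
# Sketch: create the SageMaker realtime stack with a custom autoscaling floor.
import boto3

cfn = boto3.client("cloudformation")

with open("src/emd/cfn/sagemaker_realtime/template.yaml") as f:
    template_body = f.read()

cfn.create_stack(
    StackName="emd-qwen3-8b",  # hypothetical stack name
    TemplateBody=template_body,
    Parameters=[
        {"ParameterKey": "MinCapacity", "ParameterValue": "2"},  # new parameter
        {"ParameterKey": "MaxCapacity", "ParameterValue": "4"},
        # ...the template's other required parameters (Region, etc.) go here
    ],
    Capabilities=["CAPABILITY_NAMED_IAM"],  # the template creates an IAM execution role
)
```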
src/emd/models/engines.py (27 additions, 0 deletions)
@@ -127,6 +127,25 @@ class KtransformersEngine(OpenAICompitableEngine):
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --chat-template emd/models/chat_templates/qwen2vl_add_prefill_chat_template.jinja --max_model_len 16000 --disable-log-stats --limit-mm-per-prompt image=2,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
})


vllm_ui_tars_1_5_engin084 = VllmEngine(**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --max_model_len 16000 --disable-log-stats --limit-mm-per-prompt image=1,video=0 --max_num_seq 2 --gpu_memory_utilization 0.9 --enable-prefix-caching"
})



vllm_qwen3_engin084 = VllmEngine(**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
})


vllm_qwen2vl72b_engine064 = VllmEngine(**{
**vllm_engine064.model_dump(),
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
@@ -141,6 +160,14 @@ class KtransformersEngine(OpenAICompitableEngine):
"default_cli_args": " --max_model_len 25000 --disable-log-stats --limit-mm-per-prompt image=20,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
})

vllm_qwen25vl72b_engine084 = VllmEngine(**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
"dockerfile_name":"Dockerfile_qwen25_vl",
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
})

vllm_qwq_engine073 = VllmEngine(**{
**vllm_qwen25vl72b_engine073.model_dump(),
"environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
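
Each new engine above clones a base config and overrides a few fields by unpacking `model_dump()`; later keys in the dict literal win. Below is a standalone sketch of that pattern using a simplified, hypothetical `Engine` model (the real `VllmEngine` has more fields):

```
# Clone-and-override sketch; simplified stand-in for VllmEngine.
from pydantic import BaseModel

class Engine(BaseModel):
    engine_dockerfile_config: dict
    default_cli_args: str

base = Engine(
    engine_dockerfile_config={"VERSION": "v0.6.4"},
    default_cli_args=" --disable-log-stats",
)

# model_dump() yields the base fields as a dict; the keys listed
# afterwards override just the fields that differ for the new engine.
qwen3 = Engine(**{
    **base.model_dump(),
    "engine_dockerfile_config": {"VERSION": "v0.8.4"},
    "default_cli_args": " --enable-reasoning --reasoning-parser deepseek_r1",
})
print(qwen3.engine_dockerfile_config)  # {'VERSION': 'v0.8.4'}
```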
src/emd/models/llms/qwen.py (239 additions, 2 deletions)
@@ -8,7 +8,8 @@
tgi_qwen2d5_72b_on_inf2,
vllm_qwen2d5_72b_engine064,
vllm_qwq_engine073,
vllm_qwq_engine082
vllm_qwq_engine082,
vllm_qwen3_engin084
)
from ..services import (
sagemaker_service,
@@ -34,7 +35,7 @@
from emd.models.utils.constants import ModelType
from emd.models import ModelSeries
from ..model_series import QWEN2D5_SERIES,QWEN_REASONING_MODEL
from ..model_series import QWEN2D5_SERIES,QWEN_REASONING_MODEL,QWEN3_SERIES

Model.register(
dict(
@@ -498,3 +499,239 @@
model_series=QWEN_REASONING_MODEL
)
)


Model.register(
dict(
model_id = "Qwen3-8B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-8B",
modelscope_model_id="Qwen/Qwen3-8B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)

Model.register(
dict(
model_id = "Qwen3-0.6B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-0.6B",
modelscope_model_id="Qwen/Qwen3-0.6B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)

Model.register(
dict(
model_id = "Qwen3-1.7B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-1.7B",
modelscope_model_id="Qwen/Qwen3-1.7B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)


Model.register(
dict(
model_id = "Qwen3-4B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-4B",
modelscope_model_id="Qwen/Qwen3-4B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)


Model.register(
dict(
model_id = "Qwen3-14B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-14B",
modelscope_model_id="Qwen/Qwen3-14B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)

Model.register(
dict(
model_id = "Qwen3-32B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-32B",
modelscope_model_id="Qwen/Qwen3-32B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)


Model.register(
dict(
model_id = "Qwen3-30B-A3B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-30B-A3B",
modelscope_model_id="Qwen/Qwen3-30B-A3B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)
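
All of these `Model.register` calls follow the same shape: a config dict keyed by `model_id` that records which engines, instances, services, and frameworks a model supports. Below is a hypothetical sketch of the registry pattern they imply; the real `Model` class lives in `emd.models` and is not shown in this diff.

```
# Hypothetical registry sketch -- illustrates the pattern, not emd's actual code.
class Model:
    _registry: dict[str, dict] = {}

    @classmethod
    def register(cls, config: dict) -> None:
        # Index each config by model_id so a CLI call like
        # `emd deploy --model-id Qwen3-8B` can resolve to a full spec.
        cls._registry[config["model_id"]] = config

    @classmethod
    def get(cls, model_id: str) -> dict:
        return cls._registry[model_id]

Model.register(dict(model_id="Qwen3-8B", supported_engines=["vllm_qwen3_engin084"]))
print(Model.get("Qwen3-8B")["supported_engines"])
```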
src/emd/models/model_series.py (14 additions, 0 deletions)
@@ -7,6 +7,13 @@
reference_link="https://github.com/QwenLM/Qwen2.5"
)

QWEN3_SERIES = ModelSeries(
model_series_name = ModelSeriesType.QWEN3,
description="the latest addition to the Qwen family of large language models. These models represent our most advanced and intelligent systems to date, improving from our experience in building QwQ and Qwen2.5. We are making the weights of Qwen3 available to the public, including both dense and Mixture-of-Expert (MoE) models.",
reference_link="https://github.com/QwenLM/Qwen3"
)


GLM4_SERIES = ModelSeries(
model_series_name = ModelSeriesType.GLM4,
description="The GLM-4 series includes the latest generation of pre-trained models launched by Zhipu AI.",
@@ -62,6 +69,13 @@
reference_link="https://github.com/QwenLM/Qwen2-VL"
)


AGENT_SERIES = ModelSeries(
model_series_name=ModelSeriesType.AGENT,
description="""LLM or VLM models for Agentic tasks, e.g. computer-use,brower-use""",
reference_link=""
)

INTERNVL25_SERIES = ModelSeries(
model_series_name=ModelSeriesType.INTERNVL25,
description="""InternVL2.5 is an advanced multimodal large language model (MLLM) series with parameter coverage ranging from 1B to 78B. InternVL2_5-78B is the first open-source MLLMs to achieve over 70% on the MMMU benchmark, matching the performance of leading closed-source commercial models like GPT-4o.""",