This repository was archived by the owner on Sep 20, 2025. It is now read-only.
Merged
README.md (2 additions, 1 deletion)
@@ -15,7 +15,8 @@
</p>

## 🔥 Latest News

- 2025-04-29: Deploy Qwen 3 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md#qwen-3-series).
- 2025-04-21: Deploy GLM Z1/0414 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md#glm-z10414-series).
- 2025-03-17: Deploy Gemma 3 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md#gemma-3-series).
- 2025-03-06: Deploy QwQ-32B with [one command line](docs/en/best_deployment_practices.md#qwq-32b).

docs/en/best_deployment_practices.md (17 additions, 0 deletions)
@@ -3,6 +3,23 @@
This document provides examples of best practices for deploying models using EMD for various use cases.

## Famous Models
### Qwen 3 Series
```
emd deploy --model-id Qwen3-30B-A3B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime

emd deploy --model-id Qwen3-32B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime

emd deploy --model-id Qwen3-8B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
```
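
Once a Qwen 3 endpoint is up, it can be called through the SageMaker runtime. Below is a minimal invocation sketch in Python, assuming the endpoint name matches the model ID and that the container accepts an OpenAI-style chat payload (as vLLM-backed deployments typically do); the exact endpoint name and payload schema produced by EMD may differ.

```
# Invocation sketch -- endpoint name and payload schema are assumptions.
import json
import boto3

runtime = boto3.client("sagemaker-runtime")

payload = {
    "messages": [{"role": "user", "content": "Briefly introduce yourself."}],
    "max_tokens": 256,
}

response = runtime.invoke_endpoint(
    EndpointName="Qwen3-8B",  # hypothetical; check the endpoint EMD actually creates
    ContentType="application/json",
    Body=json.dumps(payload),
)
print(json.loads(response["Body"].read()))
```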


### GLM Z1/0414 Series
```
emd deploy --model-id GLM-Z1-32B-0414 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime

emd deploy --model-id GLM-4-32B-0414 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime
```


### Mistral Small Series
```
src/emd/cfn/sagemaker_realtime/template.yaml (5 additions, 1 deletion)
@@ -26,6 +26,10 @@ Parameters:
Region:
Type: String
Description: The region to be used for the SageMaker Endpoint
MinCapacity:
Type: Number
Description: The minimum capacity of the endpoint
Default: 1
MaxCapacity:
Type: Number
Description: The maximum capacity of the endpoint
@@ -117,7 +121,7 @@ Resources:
Type: AWS::ApplicationAutoScaling::ScalableTarget
Properties:
MaxCapacity: !Ref MaxCapacity
MinCapacity: 1
MinCapacity: !Ref MinCapacity
RoleARN: !GetAtt ExecutionRole.Arn
ResourceId: !Sub "endpoint/${SageMakerEndpoint.EndpointName}/variant/AllTraffic"
ScalableDimension: "sagemaker:variant:DesiredInstanceCount"
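
The new MinCapacity parameter makes the autoscaling floor configurable per deployment instead of hard-coded at 1. Below is a minimal sketch of setting it at stack creation with boto3, assuming the template is deployed directly; EMD normally drives this template itself, and the template's remaining required parameters are abbreviated here.

```
# Sketch: create the SageMaker realtime stack with a custom autoscaling floor.
import boto3

cfn = boto3.client("cloudformation")

with open("src/emd/cfn/sagemaker_realtime/template.yaml") as f:
    template_body = f.read()

cfn.create_stack(
    StackName="emd-qwen3-8b",  # hypothetical stack name
    TemplateBody=template_body,
    Parameters=[
        {"ParameterKey": "MinCapacity", "ParameterValue": "2"},  # new parameter
        {"ParameterKey": "MaxCapacity", "ParameterValue": "4"},
        # ...the template's other required parameters (Region, etc.) go here
    ],
    Capabilities=["CAPABILITY_NAMED_IAM"],  # the template creates an IAM execution role
)
```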
src/emd/models/engines.py (27 additions, 0 deletions)
@@ -127,6 +127,25 @@ class KtransformersEngine(OpenAICompitableEngine):
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --chat-template emd/models/chat_templates/qwen2vl_add_prefill_chat_template.jinja --max_model_len 16000 --disable-log-stats --limit-mm-per-prompt image=2,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
})


vllm_ui_tars_1_5_engin084 = VllmEngine(**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --max_model_len 16000 --disable-log-stats --limit-mm-per-prompt image=1,video=0 --max_num_seq 2 --gpu_memory_utilization 0.9 --enable-prefix-caching"
})



vllm_qwen3_engin084 = VllmEngine(**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
})


vllm_qwen2vl72b_engine064 = VllmEngine(**{
**vllm_engine064.model_dump(),
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
@@ -141,6 +160,14 @@ class KtransformersEngine(OpenAICompitableEngine):
"default_cli_args": " --max_model_len 25000 --disable-log-stats --limit-mm-per-prompt image=20,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
})

vllm_qwen25vl72b_engine084 = VllmEngine(**{
**vllm_engine064.model_dump(),
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
"dockerfile_name":"Dockerfile_qwen25_vl",
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
"default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
})

vllm_qwq_engine073 = VllmEngine(**{
**vllm_qwen25vl72b_engine073.model_dump(),
"environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
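
Each new engine above clones a base config and overrides a few fields by unpacking `model_dump()`; later keys in the dict literal win. Below is a standalone sketch of that pattern using a simplified, hypothetical `Engine` model (the real `VllmEngine` has more fields):

```
# Clone-and-override sketch; simplified stand-in for VllmEngine.
from pydantic import BaseModel

class Engine(BaseModel):
    engine_dockerfile_config: dict
    default_cli_args: str

base = Engine(
    engine_dockerfile_config={"VERSION": "v0.6.4"},
    default_cli_args=" --disable-log-stats",
)

# model_dump() yields the base fields as a dict; the keys listed
# afterwards override just the fields that differ for the new engine.
qwen3 = Engine(**{
    **base.model_dump(),
    "engine_dockerfile_config": {"VERSION": "v0.8.4"},
    "default_cli_args": " --enable-reasoning --reasoning-parser deepseek_r1",
})
print(qwen3.engine_dockerfile_config)  # {'VERSION': 'v0.8.4'}
```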
src/emd/models/llms/qwen.py (239 additions, 2 deletions)
@@ -8,7 +8,8 @@
tgi_qwen2d5_72b_on_inf2,
vllm_qwen2d5_72b_engine064,
vllm_qwq_engine073,
vllm_qwq_engine082
vllm_qwq_engine082,
vllm_qwen3_engin084
)
from ..services import (
sagemaker_service,
@@ -34,7 +35,7 @@
from emd.models.utils.constants import ModelType
from emd.models import ModelSeries
from ..model_series import QWEN2D5_SERIES,QWEN_REASONING_MODEL
from ..model_series import QWEN2D5_SERIES,QWEN_REASONING_MODEL,QWEN3_SERIES

Model.register(
dict(
@@ -498,3 +499,239 @@
model_series=QWEN_REASONING_MODEL
)
)


Model.register(
dict(
model_id = "Qwen3-8B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-8B",
modelscope_model_id="Qwen/Qwen3-8B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)

Model.register(
dict(
model_id = "Qwen3-0.6B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-0.6B",
modelscope_model_id="Qwen/Qwen3-0.6B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)

Model.register(
dict(
model_id = "Qwen3-1.7B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-1.7B",
modelscope_model_id="Qwen/Qwen3-1.7B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)


Model.register(
dict(
model_id = "Qwen3-4B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d16xlarge_instance,
g4dn2xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-4B",
modelscope_model_id="Qwen/Qwen3-4B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)


Model.register(
dict(
model_id = "Qwen3-14B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-14B",
modelscope_model_id="Qwen/Qwen3-14B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)

Model.register(
dict(
model_id = "Qwen3-32B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-32B",
modelscope_model_id="Qwen/Qwen3-32B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)


Model.register(
dict(
model_id = "Qwen3-30B-A3B",
supported_engines=[vllm_qwen3_engin084],
supported_instances=[
g5d12xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
# g5d24xlarge_instance,
# g5d48xlarge_instance,
local_instance
],
supported_services=[
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
],
supported_frameworks=[
fastapi_framework
],
allow_china_region=True,
huggingface_model_id="Qwen/Qwen3-30B-A3B",
modelscope_model_id="Qwen/Qwen3-30B-A3B",
require_huggingface_token=False,
application_scenario="Agent, tool use, translation, summary",
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
model_type=ModelType.LLM,
model_series=QWEN3_SERIES
)
)
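
All of these `Model.register` calls follow the same shape: a config dict keyed by `model_id` that records which engines, instances, services, and frameworks a model supports. Below is a hypothetical sketch of the registry pattern they imply; the real `Model` class lives in `emd.models` and is not shown in this diff.

```
# Hypothetical registry sketch -- illustrates the pattern, not emd's actual code.
class Model:
    _registry: dict[str, dict] = {}

    @classmethod
    def register(cls, config: dict) -> None:
        # Index each config by model_id so a CLI call like
        # `emd deploy --model-id Qwen3-8B` can resolve to a full spec.
        cls._registry[config["model_id"]] = config

    @classmethod
    def get(cls, model_id: str) -> dict:
        return cls._registry[model_id]

Model.register(dict(model_id="Qwen3-8B", supported_engines=["vllm_qwen3_engin084"]))
print(Model.get("Qwen3-8B")["supported_engines"])
```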
src/emd/models/model_series.py (14 additions, 0 deletions)
@@ -7,6 +7,13 @@
reference_link="https://github.com/QwenLM/Qwen2.5"
)

QWEN3_SERIES = ModelSeries(
model_series_name = ModelSeriesType.QWEN3,
description="the latest addition to the Qwen family of large language models. These models represent our most advanced and intelligent systems to date, improving from our experience in building QwQ and Qwen2.5. We are making the weights of Qwen3 available to the public, including both dense and Mixture-of-Expert (MoE) models.",
reference_link="https://github.com/QwenLM/Qwen3"
)


GLM4_SERIES = ModelSeries(
model_series_name = ModelSeriesType.GLM4,
description="The GLM-4 series includes the latest generation of pre-trained models launched by Zhipu AI.",
@@ -62,6 +69,13 @@
reference_link="https://github.com/QwenLM/Qwen2-VL"
)


AGENT_SERIES = ModelSeries(
model_series_name=ModelSeriesType.AGENT,
description="""LLM or VLM models for Agentic tasks, e.g. computer-use,brower-use""",
reference_link=""
)

INTERNVL25_SERIES = ModelSeries(
model_series_name=ModelSeriesType.INTERNVL25,
description="""InternVL2.5 is an advanced multimodal large language model (MLLM) series with parameter coverage ranging from 1B to 78B. InternVL2_5-78B is the first open-source MLLMs to achieve over 70% on the MMMU benchmark, matching the performance of leading closed-source commercial models like GPT-4o.""",