From 8d1b0c2b2224614c983fb02ec0348db9e685bfe7 Mon Sep 17 00:00:00 2001 From: zhouxss Date: Tue, 25 Mar 2025 03:47:27 +0000 Subject: [PATCH 01/14] merge --- docs/en/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/supported_models.md b/docs/en/supported_models.md index e8749d1d..a1d94a8a 100644 --- a/docs/en/supported_models.md +++ b/docs/en/supported_models.md @@ -52,4 +52,4 @@ | jina-embeddings-v3 | jina | embedding | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | | bge-reranker-v2-m3 | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | | bge-reranker-large | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | -| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | +| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | \ No newline at end of file From ef01e39d77fd3e9fdba4bd89943244f01729d989 Mon Sep 17 00:00:00 2001 From: zhouxss Date: Tue, 25 Mar 2025 03:52:23 +0000 Subject: [PATCH 02/14] merge --- docs/en/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/supported_models.md b/docs/en/supported_models.md index a1d94a8a..e8749d1d 100644 --- a/docs/en/supported_models.md +++ b/docs/en/supported_models.md @@ -52,4 +52,4 @@ | jina-embeddings-v3 | jina | embedding | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | | bge-reranker-v2-m3 | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | | bge-reranker-large | bge | rerank | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | -| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | \ No newline at end of file +| jina-reranker-v2-base-multilingual | jina | rerank | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | From 2845bc357fdca586143bf1738c432501b754a4e2 Mon Sep 17 00:00:00 2001 From: zhouxss Date: Thu, 27 Mar 2025 08:14:39 +0000 Subject: [PATCH 03/14] add Mistral-Small-3.1-24B-Instruct-2503 --- docs/en/best_deployment_practices.md | 5 +++ docs/en/supported_models.md | 1 + src/emd/models/engines.py | 11 ++++++ src/emd/models/model_series.py | 6 ++++ src/emd/models/utils/constants.py | 1 + src/emd/models/vlms/__init__.py | 1 + src/emd/models/vlms/mistral.py | 54 ++++++++++++++++++++++++++++ 7 files changed, 79 insertions(+) create mode 100644 src/emd/models/vlms/mistral.py diff --git a/docs/en/best_deployment_practices.md b/docs/en/best_deployment_practices.md index 3a66bfb8..26fae4d3 100644 --- a/docs/en/best_deployment_practices.md +++ b/docs/en/best_deployment_practices.md @@ -4,6 +4,11 @@ This document provides examples of best practices for deploying models using EMD ## Famous Models +### Mistral Small Series +``` +emd deploy --model-id Mistral-Small-3.1-24B-Instruct-2503 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime +``` + ### Gemma 3 Series ``` diff --git a/docs/en/supported_models.md 
b/docs/en/supported_models.md index e8749d1d..0ecfeca7 100644 --- a/docs/en/supported_models.md +++ b/docs/en/supported_models.md @@ -44,6 +44,7 @@ | gemma-3-4b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ | | gemma-3-12b-it | gemma3 | vlm | vllm | g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ | | gemma-3-27b-it | gemma3 | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ | +| Mistral-Small-3.1-24B-Instruct-2503 | mistral | vlm | vllm | g5.12xlarge,g5.24xlarge,g5.48xlarge | sagemaker_realtime,sagemaker_async,ecs | ❎ | | txt2video-LTX | comfyui | video | comfyui | g5.4xlarge,g5.8xlarge,g6e.2xlarge | sagemaker_async | ❎ | | whisper | whisper | whisper | huggingface | g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_async | ❎ | | bce-embedding-base_v1 | bce | embedding | vllm | g4dn.2xlarge,g5.xlarge,g5.2xlarge,g5.4xlarge,g5.8xlarge,g5.16xlarge | sagemaker_realtime,ecs | ✅ | diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index f251071a..e748ab83 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -77,6 +77,17 @@ class KtransformersEngine(OpenAICompitableEngine): } ) + +vllm_mistral_small_engine082 = VllmEngine( + **{ + **vllm_engine064.model_dump(), + "engine_dockerfile_config": {"VERSION":"v0.8.2"}, + "dockerfile_name":"Dockerfile", + "default_cli_args": " --tokenizer-mode mistral --config-format mistral --load-format mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384", + "environment_variables": "" + } +) + vllm_deepseek_r1_distill_qwen_engine071 = VllmEngine(**{ **vllm_engine064.model_dump(), "engine_dockerfile_config": {"VERSION":"v0.7.1"}, diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py index 09062998..ceae13ad 100644 --- a/src/emd/models/model_series.py +++ b/src/emd/models/model_series.py @@ -97,6 +97,12 @@ reference_link="https://blog.google/technology/developers/gemma-3/" ) +MISTRAL_SERIES = ModelSeries( + model_series_name=ModelSeriesType.MISTRAL, + description="LLMs and VLMs provided by MISTRAL AI.", + reference_link="https://huggingface.co/mistralai" +) + DEEPSEEK_REASONING_MODEL = ModelSeries( model_series_name=ModelSeriesType.DEEPSEEK_REASONING_MODEL, description="DeepSeek-R1-Zero and DeepSeek-R1 are innovative reasoning models, with the former showcasing strong performance through reinforcement learning alone, while the latter enhances reasoning capabilities by incorporating cold-start data, achieving results comparable to OpenAI-o1 and setting new benchmarks with its distilled versions.", diff --git a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py index b9c21b49..d78414e6 100644 --- a/src/emd/models/utils/constants.py +++ b/src/emd/models/utils/constants.py @@ -214,6 +214,7 @@ def get_service_quota_code(cls, instance_type: str): class ModelSeriesType(ConstantBase): GEMMA3 = "gemma3" + MISTRAL = "mistral" QWEN2D5 = "qwen2.5" GLM4 = "glm4" INTERLM2d5 = "internlm2.5" diff --git a/src/emd/models/vlms/__init__.py b/src/emd/models/vlms/__init__.py index bf74f45c..4440a29e 100644 --- a/src/emd/models/vlms/__init__.py +++ b/src/emd/models/vlms/__init__.py @@ -1,3 +1,4 @@ from . import qwen from . import internvl from . import gemma3 +from . 
import mistral diff --git a/src/emd/models/vlms/mistral.py b/src/emd/models/vlms/mistral.py new file mode 100644 index 00000000..fc597105 --- /dev/null +++ b/src/emd/models/vlms/mistral.py @@ -0,0 +1,54 @@ +from ..engines import vllm_mistral_small_engine082 +from .. import Model +from ..frameworks import fastapi_framework +from ..services import ( + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service +) +from emd.models.utils.constants import ModelType +from ..model_series import MISTRAL_SERIES +from ..instances import ( + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d12xlarge_instance, + g5d16xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + g6e2xlarge_instance, + local_instance +) +from ..utils.constants import ModelFilesDownloadSource + + +Model.register( + dict( + model_id = "Mistral-Small-3.1-24B-Instruct-2503", + supported_engines=[vllm_mistral_small_engine082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + huggingface_model_id="unsloth/Mistral-Small-3.1-24B-Instruct-2503", + # require_huggingface_token=False, + modelscope_model_id="mistralai/Mistral-Small-3.1-24B-Instruct-2503", + # model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, + application_scenario="vision llms for image understanding", + description="The latest series of mistral small", + model_type=ModelType.VLM, + model_series=MISTRAL_SERIES, + ) +) From d72b12b22282e6e67dc069f99fdd3ee59e25bcbd Mon Sep 17 00:00:00 2001 From: zhouxss Date: Tue, 1 Apr 2025 08:37:49 +0000 Subject: [PATCH 04/14] modify qwq-32b deploy --- src/emd/models/engines.py | 7 +++++++ src/emd/models/llms/qwen.py | 5 +++-- src/emd/models/services.py | 1 + src/pipeline/backend/backend.py | 7 ++++--- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index e748ab83..0dc243f7 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -135,6 +135,13 @@ class KtransformersEngine(OpenAICompitableEngine): "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes" }) +vllm_qwq_engine082 = VllmEngine(**{ + **vllm_qwen25vl72b_engine073.model_dump(), + "engine_dockerfile_config": {"VERSION":"v0.8.2"}, + "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", + "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes --enable-reasoning --reasoning-parser deepseek_r1" +}) + vllm_internvl2d5_76b_engine064 = VllmEngine(**{ **vllm_engine064.model_dump(), diff --git a/src/emd/models/llms/qwen.py b/src/emd/models/llms/qwen.py index fa48ce79..35a2cc1f 100644 --- a/src/emd/models/llms/qwen.py +++ b/src/emd/models/llms/qwen.py @@ -7,7 +7,8 @@ tgi_qwen2d5_on_inf2, tgi_qwen2d5_72b_on_inf2, vllm_qwen2d5_72b_engine064, - vllm_qwq_engine073 + vllm_qwq_engine073, + vllm_qwq_engine082 ) from ..services import ( sagemaker_service, @@ -471,7 +472,7 @@ Model.register( dict( model_id = "QwQ-32B", - 
supported_engines=[vllm_qwq_engine073], + supported_engines=[vllm_qwq_engine082], supported_instances=[ g5d12xlarge_instance, g5d24xlarge_instance, diff --git a/src/emd/models/services.py b/src/emd/models/services.py index 05737773..859a12c9 100644 --- a/src/emd/models/services.py +++ b/src/emd/models/services.py @@ -91,6 +91,7 @@ "ServiceType":"service_type", "EngineType":"engine_type", "Region": "region", + "DesiredCapacity": "desired_capacity", "ContainerCpu": "container_cpu", "ContainerMemory": "container_memory", "ContainerGpu":"instance_gpu_num" diff --git a/src/pipeline/backend/backend.py b/src/pipeline/backend/backend.py index fc46d39d..4996c97a 100644 --- a/src/pipeline/backend/backend.py +++ b/src/pipeline/backend/backend.py @@ -134,9 +134,10 @@ def start_server(self, server_start_command): logger.info(f"Starting {self.engine_type} server with command: {server_start_command}") t = threading.Thread(target=os.system,args=(server_start_command,),daemon=True) t.start() - t2 = threading.Thread(target=self.check_model_serve_ready,args=(t, "127.0.0.1", self.server_port),daemon=True) - t2.start() - t2.join() + self.check_model_serve_ready(t, "127.0.0.1", self.server_port) + logger.info(f"Server started successfully.") + # t2.start() + # t2.join() return From 48b97c44c487ca29336f94ef69eb0d8741d7a0fd Mon Sep 17 00:00:00 2001 From: zhouxss Date: Mon, 7 Apr 2025 08:43:51 +0000 Subject: [PATCH 05/14] add txgemma model; --- docs/en/best_deployment_practices.md | 42 +++++++++++++ src/emd/constants.py | 6 ++ src/emd/models/engines.py | 12 ++++ src/emd/models/llms/__init__.py | 3 +- src/emd/models/llms/deepseek.py | 54 ++++++++--------- src/emd/models/llms/txgemma.py | 91 ++++++++++++++++++++++++++++ src/emd/models/model_series.py | 7 +++ src/emd/models/services.py | 2 +- src/emd/models/utils/constants.py | 1 + src/emd/models/vlms/gemma3.py | 4 ++ src/emd/sdk/deploy.py | 13 +++- src/emd/utils/file_utils.py | 6 ++ 12 files changed, 209 insertions(+), 32 deletions(-) create mode 100644 src/emd/models/llms/txgemma.py create mode 100644 src/emd/utils/file_utils.py diff --git a/docs/en/best_deployment_practices.md b/docs/en/best_deployment_practices.md index 26fae4d3..d4747e57 100644 --- a/docs/en/best_deployment_practices.md +++ b/docs/en/best_deployment_practices.md @@ -64,6 +64,48 @@ emd deploy --model-id Qwen2.5-14B-Instruct-AWQ --instance-type g4dn.2xlarge --en }' ``` +### Example: Customize model download methods +- You can load models from different locations by addingappropriate values in the extra-params parameter +1. Load model from S3 +```json +{ + "model_params":{ + "model_files_s3_path":"" + } +} +``` +2. Load model from local path (only applicable for local deployment) +```json +{ + "model_params": { "model_files_local_path":"" + } +} +``` +3. Skip downloading and uploading model files in codebuild, which will significantly reducedeployment time +```json +{ + "model_params": { + "need_prepare_model":false + } +} +``` +4. Specify the download source for model files +```json +{ + "model_params":{ + "model_files_download_source":"huggingface|modelscope|auto(default)" + } +} +``` +5. 
Specify the model ID on huggingface or modelscope +```json +{ + "model_params": { + "huggingface_model_id":"model id on huggingface","modelscope_model_id":"model id on modelscope" + } +} +``` + ## Environmental variables - `LOCAL_DEPLOY_PORT: ` Local deployment port, default: `8080` diff --git a/src/emd/constants.py b/src/emd/constants.py index a3f949b1..53dd60c8 100644 --- a/src/emd/constants.py +++ b/src/emd/constants.py @@ -1,4 +1,5 @@ from .revision import VERSION, convert_version_name_to_stack_name +import os ENV_STACK_NAME = f'EMD-Env' MODEL_STACK_NAME_PREFIX = f"EMD-Model" ENV_BUCKET_NAME_PREFIX = "emd-env-artifactbucket" @@ -25,3 +26,8 @@ LOCAL_REGION = "local" # EMD_USE_NO_PROFILE_CHOICE = "Don't set" + +LOCAL_DEPLOY_PIPELINE_ZIP_DIR = os.path.join( + os.path.expanduser("~"), + f"emd_{VERSION}" +) diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index 0dc243f7..0bc13595 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -78,6 +78,18 @@ class KtransformersEngine(OpenAICompitableEngine): ) +vllm_texgemma082 = VllmEngine(**{ + "engine_type":EngineType.VLLM, + "engine_dockerfile_config": {"VERSION":"v0.8.2"}, + "engine_cls":"vllm.vllm_backend.VLLMBackend", + "base_image_host":"public.ecr.aws", + "use_public_ecr":True, + "docker_login_region":"us-east-1", + "default_cli_args": " --max_num_seq 10 --disable-log-stats" +} +) + + vllm_mistral_small_engine082 = VllmEngine( **{ **vllm_engine064.model_dump(), diff --git a/src/emd/models/llms/__init__.py b/src/emd/models/llms/__init__.py index e823ab18..2750df0c 100644 --- a/src/emd/models/llms/__init__.py +++ b/src/emd/models/llms/__init__.py @@ -5,5 +5,6 @@ llama, deepseek, baichuan, - jina + jina, + txgemma ) diff --git a/src/emd/models/llms/deepseek.py b/src/emd/models/llms/deepseek.py index df61526b..1359ac33 100644 --- a/src/emd/models/llms/deepseek.py +++ b/src/emd/models/llms/deepseek.py @@ -334,33 +334,33 @@ ) ) -Model.register( - dict( - model_id = "deepseek-r1-671b-1.58bit_ollama", - supported_engines=[ollama_deepseek_r1_qwen2d5_1d5b_engine057], - supported_instances=[ - g5d48xlarge_instance, - local_instance - ], - supported_services=[ - sagemaker_service, - sagemaker_async_service, - ecs_service, - local_service - ], - supported_frameworks=[ - fastapi_framework - ], - allow_china_region=False, - ollama_model_id="SIGJNF/deepseek-r1-671b-1.58bit", - # modelscope_model_id="Qwen/Qwen2.5-14B-Instruct", - require_huggingface_token=False, - application_scenario="Agent, tool use, translation, summary", - description="The latest series of DeepSeek LLMs for reasoning", - model_type=ModelType.LLM, - model_series=DEEPSEEK_REASONING_MODEL - ) -) +# Model.register( +# dict( +# model_id = "deepseek-r1-671b-1.58bit_ollama", +# supported_engines=[ollama_deepseek_r1_qwen2d5_1d5b_engine057], +# supported_instances=[ +# g5d48xlarge_instance, +# local_instance +# ], +# supported_services=[ +# sagemaker_service, +# sagemaker_async_service, +# ecs_service, +# local_service +# ], +# supported_frameworks=[ +# fastapi_framework +# ], +# allow_china_region=False, +# ollama_model_id="SIGJNF/deepseek-r1-671b-1.58bit", +# # modelscope_model_id="Qwen/Qwen2.5-14B-Instruct", +# require_huggingface_token=False, +# application_scenario="Agent, tool use, translation, summary", +# description="The latest series of DeepSeek LLMs for reasoning", +# model_type=ModelType.LLM, +# model_series=DEEPSEEK_REASONING_MODEL +# ) +# ) Model.register( diff --git a/src/emd/models/llms/txgemma.py b/src/emd/models/llms/txgemma.py new file mode 
100644 index 00000000..0c4e29ff --- /dev/null +++ b/src/emd/models/llms/txgemma.py @@ -0,0 +1,91 @@ +from ..engines import vllm_texgemma082 +from .. import Model +from ..frameworks import fastapi_framework +from ..services import ( + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service +) +from emd.models.utils.constants import ModelType +from ..model_series import TXGEMMA_SERIES +from ..instances import ( + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d12xlarge_instance, + g5d16xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + g6e2xlarge_instance, + local_instance +) +from ..utils.constants import ModelFilesDownloadSource + + +Model.register( + dict( + model_id = "txgemma-9b-chat", + supported_engines=[vllm_texgemma082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d16xlarge_instance, + local_instance + ], + disable_hf_transfer=True, + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + huggingface_model_id="google/txgemma-9b-chat", + modelscope_model_id="AI-ModelScope/txgemma-9b-chat", + model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, + # require_huggingface_token=True, + application_scenario="llms for the development of therapeutics.", + description="The latest series of txgemma", + model_type=ModelType.LLM, + model_series=TXGEMMA_SERIES, + ) +) + + +Model.register( + dict( + model_id = "txgemma-27b-chat", + supported_engines=[vllm_texgemma082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + disable_hf_transfer=True, + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + huggingface_model_id="google/txgemma-27b-chat", + modelscope_model_id="AI-ModelScope/txgemma-27b-chat", + model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, + # require_huggingface_token=True, + application_scenario="llms for the development of therapeutics.", + description="The latest series of txgemma", + model_type=ModelType.LLM, + model_series=TXGEMMA_SERIES, + ) +) diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py index ceae13ad..896fa512 100644 --- a/src/emd/models/model_series.py +++ b/src/emd/models/model_series.py @@ -97,6 +97,13 @@ reference_link="https://blog.google/technology/developers/gemma-3/" ) +TXGEMMA_SERIES = ModelSeries( + model_series_name=ModelSeriesType.TXGEMMA, + description="TXGemma is a series of open models to accelerate the development of therapeutics.", + reference_link="https://huggingface.co/collections/google/txgemma-release-67dd92e931c857d15e4d1e87" +) + + MISTRAL_SERIES = ModelSeries( model_series_name=ModelSeriesType.MISTRAL, description="LLMs and VLMs provided by MISTRAL AI.", diff --git a/src/emd/models/services.py b/src/emd/models/services.py index 859a12c9..824792f0 100644 --- a/src/emd/models/services.py +++ b/src/emd/models/services.py @@ -91,7 +91,7 @@ "ServiceType":"service_type", "EngineType":"engine_type", "Region": "region", - "DesiredCapacity": "desired_capacity", + "DesiredCapacity": ValueWithDefault(name="desired_capacity",default=1), "ContainerCpu": "container_cpu", "ContainerMemory": "container_memory", "ContainerGpu":"instance_gpu_num" diff --git 
a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py index d78414e6..27311173 100644 --- a/src/emd/models/utils/constants.py +++ b/src/emd/models/utils/constants.py @@ -214,6 +214,7 @@ def get_service_quota_code(cls, instance_type: str): class ModelSeriesType(ConstantBase): GEMMA3 = "gemma3" + TXGEMMA = "txgemma" MISTRAL = "mistral" QWEN2D5 = "qwen2.5" GLM4 = "glm4" diff --git a/src/emd/models/vlms/gemma3.py b/src/emd/models/vlms/gemma3.py index 4f049787..822cddd6 100644 --- a/src/emd/models/vlms/gemma3.py +++ b/src/emd/models/vlms/gemma3.py @@ -10,6 +10,7 @@ from emd.models.utils.constants import ModelType from ..model_series import Gemma3_SERIES from ..instances import ( + g4dn12xlarge_instance, g5d2xlarge_instance, g5d4xlarge_instance, g5d8xlarge_instance, @@ -43,6 +44,7 @@ supported_frameworks=[ fastapi_framework ], + allow_china_region = True, modelscope_model_id="LLM-Research/gemma-3-4b-it", model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, # require_huggingface_token=False, @@ -74,6 +76,7 @@ supported_frameworks=[ fastapi_framework ], + allow_china_region = True, # huggingface_model_id="google/gemma-3-12b-it", # require_huggingface_token=False, modelscope_model_id="LLM-Research/gemma-3-12b-it", @@ -106,6 +109,7 @@ supported_frameworks=[ fastapi_framework ], + allow_china_region = True, # huggingface_model_id="unsloth/gemma-3-27b-it", modelscope_model_id="LLM-Research/gemma-3-27b-it", model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, diff --git a/src/emd/sdk/deploy.py b/src/emd/sdk/deploy.py index 6426215d..03443ebf 100644 --- a/src/emd/sdk/deploy.py +++ b/src/emd/sdk/deploy.py @@ -14,8 +14,10 @@ MODEL_DEFAULT_TAG, MODEL_STACK_NAME_PREFIX, VERSION, - LOCAL_REGION + LOCAL_REGION, + LOCAL_DEPLOY_PIPELINE_ZIP_DIR ) +from emd.utils.file_utils import mkdir_with_mode from emd.models import Model from emd.models.utils.constants import FrameworkType, ServiceType,InstanceType from emd.models.utils.serialize_utils import dump_extra_params @@ -318,7 +320,10 @@ def deploy_local( # region: Optional[str] = None, # model_stack_name=None, extra_params=None, - pipeline_zip_local_path=f"/tmp/emd_{VERSION}/pipeline.zip", + pipeline_zip_local_path=os.path.join( + LOCAL_DEPLOY_PIPELINE_ZIP_DIR, + "pipeline.zip" + ), # env_stack_on_failure = "ROLLBACK", # force_env_stack_update = False, # waiting_until_deploy_complete = True @@ -328,7 +333,9 @@ def deploy_local( logger.info(f"parsed extra_params: {extra_params}") extra_params = dump_extra_params(extra_params or {}) dir = os.path.dirname(pipeline_zip_local_path) - os.makedirs(dir, exist_ok=True) + + mkdir_with_mode(dir, exist_ok=True,mode=0o777) + # os.makedirs(dir, exist_ok=True,mode=0o777) with open(pipeline_zip_local_path, "wb") as f: buffer = ziped_pipeline() f.write(buffer.read()) diff --git a/src/emd/utils/file_utils.py b/src/emd/utils/file_utils.py new file mode 100644 index 00000000..11599dc1 --- /dev/null +++ b/src/emd/utils/file_utils.py @@ -0,0 +1,6 @@ +import os + +def mkdir_with_mode(directory,exist_ok=True,mode=0o777): + oldmask = os.umask(0) + os.makedirs(directory, mode=mode,exist_ok=exist_ok) + os.umask(oldmask) From d7fe697bd5f05e6421746111fb335d0627aa718d Mon Sep 17 00:00:00 2001 From: zhouxss Date: Mon, 7 Apr 2025 09:36:47 +0000 Subject: [PATCH 06/14] modify model list command --- src/emd/cli.py | 12 +++++++++--- src/emd/models/llms/txgemma.py | 2 ++ src/emd/models/model.py | 6 ++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/emd/cli.py b/src/emd/cli.py index 
7b6fcbf8..9212833d 100644 --- a/src/emd/cli.py +++ b/src/emd/cli.py @@ -83,11 +83,17 @@ @app.command(help="List supported models") @catch_aws_credential_errors -def list_supported_models(model_id: Annotated[ +def list_supported_models( + model_id: Annotated[ str, typer.Argument(help="Model ID") - ] = None): + ] = None, + detail: Annotated[ + Optional[bool], + typer.Option("-a", "--detail", help="output model infomation in details.") + ] = False +): # console.print("[bold blue]Retrieving models...[/bold blue]") - support_models = Model.get_supported_models() + support_models = Model.get_supported_models(detail=detail) if model_id: support_models = [model for _model_id,model in support_models.items() if _model_id == model_id] r = json.dumps(support_models,indent=2,ensure_ascii=False) diff --git a/src/emd/models/llms/txgemma.py b/src/emd/models/llms/txgemma.py index 0c4e29ff..06aa7e6f 100644 --- a/src/emd/models/llms/txgemma.py +++ b/src/emd/models/llms/txgemma.py @@ -47,6 +47,7 @@ supported_frameworks=[ fastapi_framework ], + allow_china_region=True, huggingface_model_id="google/txgemma-9b-chat", modelscope_model_id="AI-ModelScope/txgemma-9b-chat", model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, @@ -79,6 +80,7 @@ supported_frameworks=[ fastapi_framework ], + allow_china_region=True, huggingface_model_id="google/txgemma-27b-chat", modelscope_model_id="AI-ModelScope/txgemma-27b-chat", model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, diff --git a/src/emd/models/model.py b/src/emd/models/model.py index 1e052ef1..6289e890 100644 --- a/src/emd/models/model.py +++ b/src/emd/models/model.py @@ -210,8 +210,10 @@ def get_model(cls ,model_id:str,update:dict = None) -> T: return model @classmethod - def get_supported_models(cls) -> dict: - return {model_id: model.model_type for model_id,model in cls.model_map.items()} + def get_supported_models(cls,detail=False) -> dict: + if not detail: + return {model_id: model.model_type for model_id,model in cls.model_map.items()} + return {model_id: model.model_dump() for model_id,model in cls.model_map.items()} def find_current_engine(self,engine_type:str) -> dict: supported_engines:List[Engine] = self.supported_engines From d575c580476c32ed3eeb4f5c627f95c72af7e918 Mon Sep 17 00:00:00 2001 From: zhouxss Date: Mon, 7 Apr 2025 09:42:18 +0000 Subject: [PATCH 07/14] fix typo --- src/emd/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/emd/cli.py b/src/emd/cli.py index 9212833d..ed7421e2 100644 --- a/src/emd/cli.py +++ b/src/emd/cli.py @@ -89,7 +89,7 @@ def list_supported_models( ] = None, detail: Annotated[ Optional[bool], - typer.Option("-a", "--detail", help="output model infomation in details.") + typer.Option("-a", "--detail", help="output model information in details.") ] = False ): # console.print("[bold blue]Retrieving models...[/bold blue]") From 4370be096f50186c1b95ab4d5552fe4ba8d14ec6 Mon Sep 17 00:00:00 2001 From: zhouxss Date: Fri, 18 Apr 2025 06:43:29 +0000 Subject: [PATCH 08/14] add some ecs parameters --- src/emd/cfn/ecs/post_build.py | 4 ++-- src/emd/models/services.py | 3 +++ src/pipeline/deploy/deploy.py | 2 -- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/emd/cfn/ecs/post_build.py b/src/emd/cfn/ecs/post_build.py index 4a5acc4f..6d1b8c32 100644 --- a/src/emd/cfn/ecs/post_build.py +++ b/src/emd/cfn/ecs/post_build.py @@ -104,9 +104,9 @@ def deploy_vpc_template(region): vpc_id = None subnets = None for output in outputs: - if output["OutputKey"] == "VPCID": + if 
output["OutputKey"] == "VPCID" and output["OutputValue"]: vpc_id = output["OutputValue"] - elif output["OutputKey"] == "Subnets": + elif output["OutputKey"] == "Subnets" and output["OutputValue"]: subnets = output["OutputValue"] update_parameters_file("parameters.json", {"VPCID": vpc_id, "Subnets": subnets}) return vpc_id, subnets diff --git a/src/emd/models/services.py b/src/emd/models/services.py index 824792f0..838ddf2c 100644 --- a/src/emd/models/services.py +++ b/src/emd/models/services.py @@ -92,6 +92,9 @@ "EngineType":"engine_type", "Region": "region", "DesiredCapacity": ValueWithDefault(name="desired_capacity",default=1), + "MaxSize": ValueWithDefault(name="max_size",default=1), + "VPCID": ValueWithDefault(name="vpc_id",default=""), + "Subnets": ValueWithDefault(name="subnet_ids",default=""), "ContainerCpu": "container_cpu", "ContainerMemory": "container_memory", "ContainerGpu":"instance_gpu_num" diff --git a/src/pipeline/deploy/deploy.py b/src/pipeline/deploy/deploy.py index 42040755..ceeee6a5 100644 --- a/src/pipeline/deploy/deploy.py +++ b/src/pipeline/deploy/deploy.py @@ -108,9 +108,7 @@ def run( + "-" + time.strftime("%Y-%m-%d-%H-%M-%S") ) - role_arn = get_or_create_role(role_name, region) - create_sagemaker_endpoint( region=region, instance_type=instance_type, From 5cb72e3185116af3996eeadd7527b5cc2197df9f Mon Sep 17 00:00:00 2001 From: zhouxss Date: Tue, 22 Apr 2025 06:15:01 +0000 Subject: [PATCH 09/14] add glm4-z1 models --- src/emd/commands/deploy.py | 10 +- src/emd/models/engines.py | 17 ++ src/emd/models/llms/glm.py | 152 +++++++++++++++++- .../zhipu_z1_vllm_image_dockerfile | 8 + 4 files changed, 184 insertions(+), 3 deletions(-) create mode 100644 src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_dockerfile diff --git a/src/emd/commands/deploy.py b/src/emd/commands/deploy.py index fc92453a..3d0b001a 100644 --- a/src/emd/commands/deploy.py +++ b/src/emd/commands/deploy.py @@ -239,6 +239,9 @@ def deploy( dockerfile_local_path: Annotated[ str, typer.Option("--dockerfile-local-path", help="Your custom Dockerfile path for building the model image, all files must be in the same directory") ] = None, + local_gpus:Annotated[ + str, typer.Option("--local-gpus", help="Local gpu ids to deploy the model (e.g. 
`0,1,2`), only working with local deployment mode.") + ] = None, ): if only_allow_local_deploy: allow_local_deploy = True @@ -389,8 +392,10 @@ def deploy( ) if service_type == ServiceType.LOCAL: if check_cuda_exists(): - if os.environ.get('CUDA_VISIBLE_DEVICES'): - console.print(f"[bold blue]local gpus: {os.environ.get('CUDA_VISIBLE_DEVICES')}[/bold blue]") + if local_gpus is not None: + os.environ['CUDA_VISIBLE_DEVICES']=local_gpus + elif os.environ.get('CUDA_VISIBLE_DEVICES'): + pass else: gpu_num = get_gpu_num() support_gpu_num = model.supported_instances[0].gpu_num @@ -400,6 +405,7 @@ def deploy( default=f"{default_gpus_str}" ).ask() os.environ['CUDA_VISIBLE_DEVICES']=gpus_to_deploy + console.print(f"[bold blue]local gpus: {os.environ.get('CUDA_VISIBLE_DEVICES')}[/bold blue]") instance_type = InstanceType.LOCAL else: if instance_type is None: diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index 0bc13595..09bdc089 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -165,6 +165,23 @@ class KtransformersEngine(OpenAICompitableEngine): vllm_glm4_engine064 = vllm_engine064 + +vllm_glm4_0414_engine082 = VllmEngine(**{ + **vllm_qwen25vl72b_engine073.model_dump(), + "engine_dockerfile_config": {"VERSION":"glm_z1_and_0414"}, + "environment_variables": "export VLLM_USE_V1=0 && export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", + "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic" +}) + + +vllm_glm4_z1_engine082 = VllmEngine(**{ + **vllm_qwen25vl72b_engine073.model_dump(), + "engine_dockerfile_config": {"VERSION":"glm_z1_and_0414"}, + "environment_variables": "export VLLM_USE_V1=0 && export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", + "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic --enable-reasoning --reasoning-parser granite" +}) + + vllm_glm4_wo_flashinfer_engine064 = VllmEngine(**{ **vllm_engine064.model_dump(), # "engine_dockerfile_config": {"VERSION":"v0.6.0"}, diff --git a/src/emd/models/llms/glm.py b/src/emd/models/llms/glm.py index faaf69ef..ea3d02bd 100644 --- a/src/emd/models/llms/glm.py +++ b/src/emd/models/llms/glm.py @@ -1,5 +1,10 @@ from .. 
import Model -from ..engines import vllm_glm4_engine064,vllm_glm4_wo_flashinfer_engine064 +from ..engines import ( + vllm_glm4_engine064, + vllm_glm4_wo_flashinfer_engine064, + vllm_glm4_0414_engine082, + vllm_glm4_z1_engine082 +) from ..services import ( sagemaker_service, sagemaker_async_service, @@ -79,3 +84,148 @@ model_series=GLM4_SERIES ) ) + + +Model.register( + dict( + model_id = "GLM-4-9B-0414", + supported_engines=[vllm_glm4_0414_engine082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + allow_china_region=True, + supported_frameworks=[fastapi_framework], + huggingface_model_id="THUDM/GLM-4-9B-0414", + modelscope_model_id="ZhipuAI/GLM-4-9B-0414", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="GLM-4-32B-0414 series", + model_type=ModelType.LLM, + model_series=GLM4_SERIES + ) +) + +Model.register( + dict( + model_id = "GLM-4-32B-0414", + supported_engines=[vllm_glm4_0414_engine082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + allow_china_region=True, + supported_frameworks=[fastapi_framework], + huggingface_model_id="THUDM/GLM-4-32B-0414", + modelscope_model_id="ZhipuAI/GLM-4-32B-0414", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="GLM-4-32B-0414 series", + model_type=ModelType.LLM, + model_series=GLM4_SERIES + ) +) + + + +Model.register( + dict( + model_id = "GLM-Z1-9B-0414", + supported_engines=[vllm_glm4_z1_engine082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + allow_china_region=True, + supported_frameworks=[fastapi_framework], + huggingface_model_id="THUDM/GLM-Z1-9B-0414", + modelscope_model_id="ZhipuAI/GLM-Z1-9B-0414", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="GLM-4-32B-0414 series", + model_type=ModelType.LLM, + model_series=GLM4_SERIES + ) +) + + +Model.register( + dict( + model_id = "GLM-Z1-32B-0414", + supported_engines=[vllm_glm4_z1_engine082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + allow_china_region=True, + supported_frameworks=[fastapi_framework], + huggingface_model_id="THUDM/GLM-Z1-32B-0414", + modelscope_model_id="ZhipuAI/GLM-Z1-32B-0414", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="GLM-4-32B-0414 series", + model_type=ModelType.LLM, + model_series=GLM4_SERIES + ) +) + + +Model.register( + dict( + model_id = "GLM-Z1-Rumination-32B-0414", + supported_engines=[vllm_glm4_z1_engine082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + allow_china_region=True, + supported_frameworks=[fastapi_framework], 
+ huggingface_model_id="THUDM/GLM-Z1-Rumination-32B-0414", + modelscope_model_id="ZhipuAI/GLM-Z1-Rumination-32B-0414", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="GLM-4-32B-0414 series", + model_type=ModelType.LLM, + model_series=GLM4_SERIES + ) +) diff --git a/src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_dockerfile b/src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_dockerfile new file mode 100644 index 00000000..229501f9 --- /dev/null +++ b/src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_dockerfile @@ -0,0 +1,8 @@ +From vllm/vllm-openai:v0.8.4 + +RUN git clone https://github.com/vllm-project/vllm.git && cd vllm && git fetch origin pull/16618/head:pr-16618 && VLLM_USE_PRECOMPILED=1 pip install --editable . + +EXPOSE 8080 + +# Set the serve script as the entrypoint +ENTRYPOINT ["/usr/bin/serve"] From a17b54d613c52e657f70507401a5fc62c37a5d0c Mon Sep 17 00:00:00 2001 From: zhouxss Date: Tue, 22 Apr 2025 08:49:32 +0000 Subject: [PATCH 10/14] modify vllm backend --- src/emd/models/engines.py | 6 ++++-- src/pipeline/backend/vllm/vllm_backend.py | 21 ++++++++++++++++----- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index 09bdc089..255c1a75 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -170,7 +170,8 @@ class KtransformersEngine(OpenAICompitableEngine): **vllm_qwen25vl72b_engine073.model_dump(), "engine_dockerfile_config": {"VERSION":"glm_z1_and_0414"}, "environment_variables": "export VLLM_USE_V1=0 && export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", - "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic" + # "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic" + "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats" }) @@ -178,7 +179,8 @@ class KtransformersEngine(OpenAICompitableEngine): **vllm_qwen25vl72b_engine073.model_dump(), "engine_dockerfile_config": {"VERSION":"glm_z1_and_0414"}, "environment_variables": "export VLLM_USE_V1=0 && export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", - "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic --enable-reasoning --reasoning-parser granite" + # "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic --enable-reasoning --reasoning-parser granite" + "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-reasoning --reasoning-parser granite" }) diff --git a/src/pipeline/backend/vllm/vllm_backend.py b/src/pipeline/backend/vllm/vllm_backend.py index 67701ae7..d68cc669 100644 --- a/src/pipeline/backend/vllm/vllm_backend.py +++ b/src/pipeline/backend/vllm/vllm_backend.py @@ -2,7 +2,7 @@ import sys import os from emd.models.utils.constants import ModelType - +import inspect from backend.backend import OpenAICompitableProxyBackendBase from emd.utils.logger_utils import get_logger @@ -22,6 +22,13 @@ def create_proxy_server_start_command(self,model_path): serve_command += f" --api-key {self.api_key}" return 
serve_command + def openai_create_helper(self,fn:callable,request:dict): + sig = inspect.signature(fn) + extra_body = request.get("extra_body",{}) + extra_params = {k:request.pop(k) for k in list(request.keys()) if k not in sig.parameters} + extra_body.update(extra_params) + request['extra_body'] = extra_body + return fn(**request) def invoke(self, request): # Transform input to vllm format @@ -30,7 +37,7 @@ def invoke(self, request): logger.info(f"Chat request:{request}") if self.model_type == ModelType.EMBEDDING: # print('cal embedding....') - response = self.client.embeddings.create(**request) + response =self.openai_create_helper(self.client.embeddings.create,request) # print('end cal embedding....') elif self.model_type == ModelType.RERANK: headers = { @@ -43,7 +50,8 @@ def invoke(self, request): headers=headers ).json() else: - response = self.client.chat.completions.create(**request) + # response = self.client.chat.completions.create(**request) + response = self.openai_create_helper(self.client.chat.completions.create,request) logger.info(f"response:{response},{request}") if request.get("stream", False): @@ -58,7 +66,7 @@ async def ainvoke(self, request): logger.info(f"Chat request:{request}") if self.model_type == ModelType.EMBEDDING: # print('cal embedding....') - response = await self.async_client.embeddings.create(**request) + response = await self.openai_create_helper(self.async_client.embeddings.create,request) # print('end cal embedding....') elif self.model_type == ModelType.RERANK: headers = { @@ -71,7 +79,10 @@ async def ainvoke(self, request): headers=headers ).json() else: - response = await self.async_client.chat.completions.create(**request) + response = await self.openai_create_helper( + self.async_client.chat.completions.create, + request + ) logger.info(f"response:{response},{request}") if request.get("stream", False): From a17a1f460f0c8ec0bb93ab8aab8bee7f98c0d2c2 Mon Sep 17 00:00:00 2001 From: zhouxss Date: Tue, 29 Apr 2025 09:10:12 +0000 Subject: [PATCH 11/14] add qwen3 --- README.md | 3 +- docs/en/best_deployment_practices.md | 17 ++ src/emd/cfn/sagemaker_realtime/template.yaml | 6 +- src/emd/models/engines.py | 27 ++ src/emd/models/llms/qwen.py | 241 +++++++++++++++++- src/emd/models/model_series.py | 14 + src/emd/models/services.py | 9 +- src/emd/models/utils/constants.py | 2 + src/emd/models/vlms/qwen.py | 42 ++- .../zhipu_z1_vllm_image_build.md | 12 + 10 files changed, 364 insertions(+), 9 deletions(-) create mode 100644 src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_build.md diff --git a/README.md b/README.md index 56b1a3df..c6fcdf0c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,8 @@

## 🔥 Latest News - +- 2025-04-29: Deploy Qwen 3 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md##famous-models###Qwen-3-Series). +- 2025-04-21: Deploy GLM Z1/0414 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md##famous-models###GLM-Z1/0414-Series). - 2025-03-17: Deploy Gemma 3 series models with [one command line](https://github.com/aws-samples/easy-model-deployer/blob/main/docs/en/best_deployment_practices.md##famous-models###gemma-3-series). - 2025-03-06: Deploy QwQ-32B with [one command line](docs/en/best_deployment_practices.md##famous-models###qwen-series###qwq-32b). diff --git a/docs/en/best_deployment_practices.md b/docs/en/best_deployment_practices.md index d4747e57..e9779634 100644 --- a/docs/en/best_deployment_practices.md +++ b/docs/en/best_deployment_practices.md @@ -3,6 +3,23 @@ This document provides examples of best practices for deploying models using EMD for various use cases. ## Famous Models +### Qwen 3 Series +``` +emd deploy --model-id Qwen3-30B-A3B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime + +emd deploy --model-id Qwen3-32B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime + +emd deploy --model-id Qwen3-8B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime +``` + + +### GLM Z1/0414 Series +``` +emd deploy --model-id GLM-Z1-32B-0414 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime + +emd deploy --model-id GLM-4-32B-0414 --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker_realtime +``` + ### Mistral Small Series ``` diff --git a/src/emd/cfn/sagemaker_realtime/template.yaml b/src/emd/cfn/sagemaker_realtime/template.yaml index d5ada13b..90cf3406 100644 --- a/src/emd/cfn/sagemaker_realtime/template.yaml +++ b/src/emd/cfn/sagemaker_realtime/template.yaml @@ -26,6 +26,10 @@ Parameters: Region: Type: String Description: The region to be used for the SageMaker Endpoint + MinCapacity: + Type: Number + Description: The minimum capacity of the endpoint + Default: 1 MaxCapacity: Type: Number Description: The maximum capacity of the endpoint @@ -117,7 +121,7 @@ Resources: Type: AWS::ApplicationAutoScaling::ScalableTarget Properties: MaxCapacity: !Ref MaxCapacity - MinCapacity: 1 + MinCapacity: !Ref MinCapacity RoleARN: !GetAtt ExecutionRole.Arn ResourceId: !Sub "endpoint/${SageMakerEndpoint.EndpointName}/variant/AllTraffic" ScalableDimension: "sagemaker:variant:DesiredInstanceCount" diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index 255c1a75..fa1ecccb 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -127,6 +127,25 @@ class KtransformersEngine(OpenAICompitableEngine): "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", "default_cli_args": " --chat-template emd/models/chat_templates/qwen2vl_add_prefill_chat_template.jinja --max_model_len 16000 --disable-log-stats --limit-mm-per-prompt image=2,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9" }) + + +vllm_ui_tars_1_5_engin084 = VllmEngine(**{ + **vllm_engine064.model_dump(), + "engine_dockerfile_config": {"VERSION":"v0.8.4"}, + "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", + "default_cli_args": " --max_model_len 16000 --disable-log-stats --limit-mm-per-prompt image=1,video=0 --max_num_seq 2 
--gpu_memory_utilization 0.9 --enable-prefix-caching" +}) + + + +vllm_qwen3_engin084 = VllmEngine(**{ + **vllm_engine064.model_dump(), + "engine_dockerfile_config": {"VERSION":"v0.8.4"}, + "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", + "default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching" +}) + + vllm_qwen2vl72b_engine064 = VllmEngine(**{ **vllm_engine064.model_dump(), "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", @@ -141,6 +160,14 @@ class KtransformersEngine(OpenAICompitableEngine): "default_cli_args": " --max_model_len 25000 --disable-log-stats --limit-mm-per-prompt image=20,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9" }) +vllm_qwen25vl72b_engine084 = VllmEngine(**{ + **vllm_engine064.model_dump(), + "engine_dockerfile_config": {"VERSION":"v0.8.4"}, + "dockerfile_name":"Dockerfile_qwen25_vl", + "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", + "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9" +}) + vllm_qwq_engine073 = VllmEngine(**{ **vllm_qwen25vl72b_engine073.model_dump(), "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", diff --git a/src/emd/models/llms/qwen.py b/src/emd/models/llms/qwen.py index 35a2cc1f..7ea4d3d6 100644 --- a/src/emd/models/llms/qwen.py +++ b/src/emd/models/llms/qwen.py @@ -8,7 +8,8 @@ tgi_qwen2d5_72b_on_inf2, vllm_qwen2d5_72b_engine064, vllm_qwq_engine073, - vllm_qwq_engine082 + vllm_qwq_engine082, + vllm_qwen3_engin084 ) from ..services import ( sagemaker_service, @@ -34,7 +35,7 @@ from emd.models.utils.constants import ModelType from emd.models.utils.constants import ModelType from emd.models import ModelSeries -from ..model_series import QWEN2D5_SERIES,QWEN_REASONING_MODEL +from ..model_series import QWEN2D5_SERIES,QWEN_REASONING_MODEL,QWEN3_SERIES Model.register( dict( @@ -498,3 +499,239 @@ model_series=QWEN_REASONING_MODEL ) ) + + +Model.register( + dict( + model_id = "Qwen3-8B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d16xlarge_instance, + g4dn2xlarge_instance, + # g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-8B", + modelscope_model_id="Qwen/Qwen3-8B", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) + +Model.register( + dict( + model_id = "Qwen3-0.6B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d16xlarge_instance, + g4dn2xlarge_instance, + # 
g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-0.6B", + modelscope_model_id="Qwen/Qwen3-0.6B", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) + +Model.register( + dict( + model_id = "Qwen3-1.7B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d16xlarge_instance, + g4dn2xlarge_instance, + # g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-1.7B", + modelscope_model_id="Qwen/Qwen3-1.7B", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) + + +Model.register( + dict( + model_id = "Qwen3-4B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d16xlarge_instance, + g4dn2xlarge_instance, + # g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-4B", + modelscope_model_id="Qwen/Qwen3-4B", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) + + +Model.register( + dict( + model_id = "Qwen3-14B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + # g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-14B", + modelscope_model_id="Qwen/Qwen3-14B", + require_huggingface_token=False, + 
application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) + +Model.register( + dict( + model_id = "Qwen3-32B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + # g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-32B", + modelscope_model_id="Qwen/Qwen3-32B", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) + + +Model.register( + dict( + model_id = "Qwen3-30B-A3B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + # g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-30B-A3B", + modelscope_model_id="Qwen/Qwen3-30B-A3B", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py index 896fa512..e5b16ccb 100644 --- a/src/emd/models/model_series.py +++ b/src/emd/models/model_series.py @@ -7,6 +7,13 @@ reference_link="https://github.com/QwenLM/Qwen2.5" ) +QWEN3_SERIES = ModelSeries( + model_series_name = ModelSeriesType.QWEN3, + description="the latest addition to the Qwen family of large language models. These models represent our most advanced and intelligent systems to date, improving from our experience in building QwQ and Qwen2.5. 
We are making the weights of Qwen3 available to the public, including both dense and Mixture-of-Expert (MoE) models.", + reference_link="https://github.com/QwenLM/Qwen3" +) + + GLM4_SERIES = ModelSeries( model_series_name = ModelSeriesType.GLM4, description="The GLM-4 series includes the latest generation of pre-trained models launched by Zhipu AI.", @@ -62,6 +69,13 @@ reference_link="https://github.com/QwenLM/Qwen2-VL" ) + +AGENT_SERIES = ModelSeries( + model_series_name=ModelSeriesType.AGENT, + description="""LLM or VLM models for Agentic tasks, e.g. computer-use,brower-use""", + reference_link="" +) + INTERNVL25_SERIES = ModelSeries( model_series_name=ModelSeriesType.INTERNVL25, description="""InternVL2.5 is an advanced multimodal large language model (MLLM) series with parameter coverage ranging from 1B to 78B. InternVL2_5-78B is the first open-source MLLMs to achieve over 70% on the MMMU benchmark, matching the performance of leading closed-source commercial models like GPT-4o.""", diff --git a/src/emd/models/services.py b/src/emd/models/services.py index 7c12612a..d66b4f2d 100644 --- a/src/emd/models/services.py +++ b/src/emd/models/services.py @@ -16,6 +16,7 @@ "EngineType":"engine_type", "Region":"region", "MaxCapacity": ValueWithDefault(name="max_capacity",default=1), + "MinCapacity": ValueWithDefault(name="min_capacity",default=1), "AutoScalingTargetValue": ValueWithDefault(name="auto_scaling_target_value",default=10), "SageMakerEndpointName": ValueWithDefault(name="sagemaker_endpoint_name",default="Auto-generate") }, @@ -36,7 +37,8 @@ "EngineType":"engine_type", "Region":"region", "MaxCapacity": ValueWithDefault(name="max_capacity",default=1), - "AutoScalingTargetValue": ValueWithDefault(name="auto_scaling_target_value",default=10) + "MinCapacity": ValueWithDefault(name="min_capacity",default=1), + "AutoScalingTargetValue": ValueWithDefault(name="auto_scaling_target_value",default=10), }, name = "Amazon SageMaker AI Real-time inference", service_type=ServiceType.SAGEMAKER_OLDER, @@ -55,7 +57,10 @@ "FrameWorkType":"framework_type", "ServiceType":"service_type", "EngineType":"engine_type", - "Region":"region" + "Region":"region", + "MaxCapacity": ValueWithDefault(name="max_capacity",default=1), + "MinCapacity": ValueWithDefault(name="min_capacity",default=1), + "AutoScalingTargetValue": ValueWithDefault(name="auto_scaling_target_value",default=10), }, name = "Amazon SageMaker AI Asynchronous inference with OpenAI Compatible API", service_type=ServiceType.SAGEMAKER_ASYNC, diff --git a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py index 27311173..4fde3cad 100644 --- a/src/emd/models/utils/constants.py +++ b/src/emd/models/utils/constants.py @@ -217,6 +217,7 @@ class ModelSeriesType(ConstantBase): TXGEMMA = "txgemma" MISTRAL = "mistral" QWEN2D5 = "qwen2.5" + QWEN3 = "qwen3" GLM4 = "glm4" INTERLM2d5 = "internlm2.5" WHISPER = "whisper" @@ -225,6 +226,7 @@ class ModelSeriesType(ConstantBase): BCE = "bce" COMFYUI = "comfyui" QWEN2VL = "qwen2vl" + AGENT = "agent" INTERNVL25 = "internvl2.5" LLAMA = "llama" QWEN_REASONING_MODEL = "qwen reasoning model" diff --git a/src/emd/models/vlms/qwen.py b/src/emd/models/vlms/qwen.py index c968515a..feb6e9cb 100644 --- a/src/emd/models/vlms/qwen.py +++ b/src/emd/models/vlms/qwen.py @@ -2,7 +2,9 @@ from ..engines import ( vllm_qwen2vl7b_engine064, vllm_qwen2vl72b_engine064, - vllm_qwen25vl72b_engine073 + vllm_qwen25vl72b_engine073, + vllm_ui_tars_1_5_engin084, + vllm_qwen25vl72b_engine084 ) from ..services import ( 
     sagemaker_service,
@@ -23,7 +25,7 @@
     local_instance
 )
 from emd.models.utils.constants import ModelType
-from ..model_series import QWEN2VL_SERIES,QWEN_REASONING_MODEL
+from ..model_series import QWEN2VL_SERIES,QWEN_REASONING_MODEL,AGENT_SERIES


 Model.register(
@@ -57,7 +59,7 @@
 Model.register(
     dict(
         model_id = "Qwen2.5-VL-72B-Instruct-AWQ",
-        supported_engines=[vllm_qwen25vl72b_engine073],
+        supported_engines=[vllm_qwen25vl72b_engine084],
         supported_instances=[
             g5d12xlarge_instance,
             g5d24xlarge_instance,
@@ -169,3 +171,37 @@
         model_series=QWEN2VL_SERIES
     )
 )
+
+
+
+Model.register(
+    dict(
+        model_id = "UI-TARS-1.5-7B",
+        supported_engines=[vllm_ui_tars_1_5_engin084],
+        supported_instances=[
+            g5d2xlarge_instance,
+            g5d4xlarge_instance,
+            g5d8xlarge_instance,
+            g5d12xlarge_instance,
+            g5d16xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            g6e2xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service, sagemaker_async_service, local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="ByteDance-Seed/UI-TARS-1.5-7B",
+        modelscope_model_id="ByteDance-Seed/UI-TARS-1.5-7B",
+        require_huggingface_token=False,
+        application_scenario="computer-use or browser-use",
+        description="The latest series of UI-TARS-1.5 from ByteDance-Seed team",
+        model_type=ModelType.VLM,
+        model_series=AGENT_SERIES
+    )
+)
diff --git a/src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_build.md b/src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_build.md
new file mode 100644
index 00000000..39bacd10
--- /dev/null
+++ b/src/pipeline/backend/convert_engine_image_to_dmaa_dockerfiles/zhipu_z1_vllm_image_build.md
@@ -0,0 +1,12 @@
+
+To build the current image, please first download the following repo:
+```shell
+git clone https://github.com/vllm-project/vllm.git vllm_glm_z1
+cd vllm_glm_z1 && git reset --hard fe742aef5aaf406c62cafa248068818bfe517d6e
+```
+Then run the following command to build the image:
+```shell
+# optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=8 --build-arg nvcc_threads=2 -f docker/Dockerfile . --target vllm-openai --tag vllm/vllm-openai:glm_z1_and_0414
+docker tag vllm/vllm-openai:glm_z1_and_0414 public.ecr.aws/aws-gcr-solutions/dmaa-vllm/vllm-openai:glm_z1_and_0414
+```
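The build notes above stop at tagging the image. As a possible follow-up (not part of the patch), the tagged image could be pushed to the public ECR registry referenced in the `docker tag` step; the authentication region and push permissions below are assumptions:

```shell
# Hypothetical follow-up, assuming push access to the aws-gcr-solutions public ECR alias.
# ECR Public authentication tokens are issued from us-east-1.
aws ecr-public get-login-password --region us-east-1 | \
  docker login --username AWS --password-stdin public.ecr.aws
docker push public.ecr.aws/aws-gcr-solutions/dmaa-vllm/vllm-openai:glm_z1_and_0414
```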
From 8d37586df3053939fd1de28c4c7b7f94ac53f21e Mon Sep 17 00:00:00 2001
From: zhouxss
Date: Tue, 29 Apr 2025 13:53:36 +0000
Subject: [PATCH 12/14] fix cli bugs

---
 src/emd/models/engines.py   |  4 ++--
 src/emd/models/llms/qwen.py | 40 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index fa1ecccb..d6bd3e13 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -140,9 +140,9 @@ class KtransformersEngine(OpenAICompitableEngine):

 vllm_qwen3_engin084 = VllmEngine(**{
     **vllm_engine064.model_dump(),
-    "engine_dockerfile_config": {"VERSION":"v0.8.4"},
+    "engine_dockerfile_config": {"VERSION":"v0.8.5"},
     "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
-    "default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
+    "default_cli_args": " --max_model_len 16000 --max_num_seq 30 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
 })

diff --git a/src/emd/models/llms/qwen.py b/src/emd/models/llms/qwen.py
index 7ea4d3d6..57a7e4f5 100644
--- a/src/emd/models/llms/qwen.py
+++ b/src/emd/models/llms/qwen.py
@@ -578,7 +578,7 @@
         g5d4xlarge_instance,
         g5d8xlarge_instance,
         g5d16xlarge_instance,
-        g4dn2xlarge_instance,
+        # g4dn2xlarge_instance,
         # g5d24xlarge_instance,
         # g5d48xlarge_instance,
         local_instance
@@ -671,6 +671,44 @@
     )
 )

+
+# ValueError("type fp8e4nv not supported in this architecture. The supported fp8 dtypes are ('fp8e4b15', 'fp8e5')")
+# The g5 instance may not support fp8e4nv
+# Model.register(
+#     dict(
+#         model_id = "Qwen3-14B-FP8",
+#         supported_engines=[vllm_qwen3_engin084],
+#         supported_instances=[
+#             g5d2xlarge_instance,
+#             g5d4xlarge_instance,
+#             g5d8xlarge_instance,
+#             g5d16xlarge_instance,
+#             # g4dn2xlarge_instance,
+#             # g5d24xlarge_instance,
+#             # g5d48xlarge_instance,
+#             local_instance
+#         ],
+#         supported_services=[
+#             sagemaker_service,
+#             sagemaker_async_service,
+#             ecs_service,
+#             local_service
+#         ],
+#         supported_frameworks=[
+#             fastapi_framework
+#         ],
+#         allow_china_region=True,
+#         huggingface_model_id="Qwen/Qwen3-14B-FP8",
+#         modelscope_model_id="Qwen/Qwen3-14B-FP8",
+#         require_huggingface_token=False,
+#         application_scenario="Agent, tool use, translation, summary",
+#         description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
+#         model_type=ModelType.LLM,
+#         model_series=QWEN3_SERIES
+#     )
+# )
+
+
 Model.register(
     dict(
         model_id = "Qwen3-32B",

From 1f1ab3381420cf73cfd59ca786423aefbfed11c5 Mon Sep 17 00:00:00 2001
From: zhouxss
Date: Tue, 29 Apr 2025 14:38:40 +0000
Subject: [PATCH 13/14] fix

---
 src/emd/models/engines.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index dacf3303..d6bd3e13 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -140,15 +140,9 @@ class KtransformersEngine(OpenAICompitableEngine):

 vllm_qwen3_engin084 = VllmEngine(**{
     **vllm_engine064.model_dump(),
-<<<<<<< HEAD
     "engine_dockerfile_config": {"VERSION":"v0.8.5"},
     "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
     "default_cli_args": " --max_model_len 16000 --max_num_seq 30 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
-=======
-    "engine_dockerfile_config": {"VERSION":"v0.8.4"},
-    "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
-    "default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
->>>>>>> 36a49970280a935d9e4f7cf97180faa8a9477bf7
 })

From 29fa1425852d81ca7d73162fb01a0a1aee39140d Mon Sep 17 00:00:00 2001
From: zhouxss
Date: Wed, 7 May 2025 11:49:04 +0000
Subject: [PATCH 14/14] add deepseek r1/Qwen3-235B-A22B

---
 src/emd/commands/deploy.py      |  1 +
 src/emd/models/engines.py       |  7 ++++-
 src/emd/models/llms/deepseek.py | 26 +++++++++++++++++
 src/emd/models/llms/qwen.py     | 49 +++++++++++++++++++++++++++++++++
 4 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/emd/commands/deploy.py b/src/emd/commands/deploy.py
index 3d0b001a..9314377a 100644
--- a/src/emd/commands/deploy.py
+++ b/src/emd/commands/deploy.py
@@ -399,6 +399,7 @@ def deploy(
     else:
         gpu_num = get_gpu_num()
         support_gpu_num = model.supported_instances[0].gpu_num
+        support_gpu_num = support_gpu_num or gpu_num
         default_gpus_str = ",".join([str(i) for i in range(min(gpu_num,support_gpu_num))])
         gpus_to_deploy = questionary.text(
             "input the local gpu ids to deploy the model (e.g. 0,1,2):",
diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index d6bd3e13..d16ffc38 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -108,6 +108,11 @@ class KtransformersEngine(OpenAICompitableEngine):

 vllm_deepseek_r1_distill_llama_engine071 = vllm_deepseek_r1_distill_qwen_engine071

+vllm_deepseek_r1_engine084 = VllmEngine(**{
+    **vllm_engine064.model_dump(),
+    "engine_dockerfile_config": {"VERSION":"v0.8.4"},
+    "default_cli_args": "--max_num_seq 10 --max_model_len 16000 --chat-template emd/models/chat_templates/deepseek_r1.jinja"
+})

 vllm_qwen2d5_72b_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
@@ -165,7 +170,7 @@ class KtransformersEngine(OpenAICompitableEngine):
     "engine_dockerfile_config": {"VERSION":"v0.8.4"},
     "dockerfile_name":"Dockerfile_qwen25_vl",
     "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
-    "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
+    "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7"
 })

 vllm_qwq_engine073 = VllmEngine(**{
diff --git a/src/emd/models/llms/deepseek.py b/src/emd/models/llms/deepseek.py
index 1359ac33..a022329d 100644
--- a/src/emd/models/llms/deepseek.py
+++ b/src/emd/models/llms/deepseek.py
@@ -7,6 +7,7 @@
     llama_cpp_deepseek_r1_distill_engineb9ab0a4,
     tgi_deepseek_r1_llama_70b_engine301,
     ktransformers_engine,
+    vllm_deepseek_r1_engine084
 )
 from ..services import (
     sagemaker_service,
@@ -450,6 +451,31 @@
     )
 )

+Model.register(
+    dict(
+        model_id = "DeepSeek-R1",
+        supported_engines=[vllm_deepseek_r1_engine084],
+        supported_instances=[
+            local_instance
+        ],
+        supported_services=[
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        need_prepare_model=False,
+        huggingface_model_id="unsloth/DeepSeek-R1",
+        modelscope_model_id="unsloth/DeepSeek-R1",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="The latest series of DeepSeek LLMs for reasoning",
+        model_type=ModelType.LLM,
+        model_series=DEEPSEEK_REASONING_MODEL
+    )
+)
+
 Model.register(
     dict(
         model_id = "deepseek-r1-671b-4bit_gguf",
diff --git a/src/emd/models/llms/qwen.py b/src/emd/models/llms/qwen.py
index 57a7e4f5..d0d423e3 100644
--- a/src/emd/models/llms/qwen.py
+++ b/src/emd/models/llms/qwen.py
@@ -773,3 +773,52 @@
         model_series=QWEN3_SERIES
     )
 )
+
+
+Model.register(
+    dict(
+        model_id = "Qwen3-235B-A22B",
+        supported_engines=[vllm_qwen3_engin084],
+        supported_instances=[
+            local_instance
+        ],
+        supported_services=[
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="Qwen/Qwen3-235B-A22B",
+        modelscope_model_id="Qwen/Qwen3-235B-A22B",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
+        model_type=ModelType.LLM,
+        model_series=QWEN3_SERIES
+    )
+)
+
+Model.register(
+    dict(
+        model_id = "Qwen3-235B-A22B-FP8",
+        supported_engines=[vllm_qwen3_engin084],
+        supported_instances=[
+            local_instance
+        ],
+        supported_services=[
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="Qwen/Qwen3-235B-A22B-FP8",
+        modelscope_model_id="Qwen/Qwen3-235B-A22B-FP8",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
+        model_type=ModelType.LLM,
+        model_series=QWEN3_SERIES
+    )
+)
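With these registrations in place, the new Qwen3 models should be deployable with the same `emd deploy` invocation used for other supported models; the instance and service values below are a sketch read off the `supported_instances`/`supported_services` lists in the Qwen3-32B registration, not values validated here. Note that `DeepSeek-R1`, `Qwen3-235B-A22B`, and `Qwen3-235B-A22B-FP8` are registered with only `local_instance`/`local_service`, so they target local deployment rather than SageMaker or ECS.

```shell
# Hypothetical deployment of one of the newly registered Qwen3 models.
# g5.12xlarge and sagemaker_realtime mirror the supported_instances and
# supported_services declared in the Qwen3-32B registration above.
emd deploy --model-id Qwen3-32B \
  --instance-type g5.12xlarge \
  --engine-type vllm \
  --service-type sagemaker_realtime
```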