Skip to content
This repository was archived by the owner on Sep 20, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
8d1b0c2
merge
11zhouxuan Mar 25, 2025
ef01e39
merge
11zhouxuan Mar 25, 2025
2845bc3
add Mistral-Small-3.1-24B-Instruct-2503
11zhouxuan Mar 27, 2025
d72b12b
modify qwq-32b deploy
11zhouxuan Apr 1, 2025
48b97c4
add txgemma model;
11zhouxuan Apr 7, 2025
84b6a33
merge
11zhouxuan Apr 7, 2025
d7fe697
modify model list command
11zhouxuan Apr 7, 2025
389986c
merge
11zhouxuan Apr 7, 2025
d575c58
fix typo
11zhouxuan Apr 7, 2025
d0b4894
Merge branch 'main' of https://github.com/aws-samples/easy-model-depl…
11zhouxuan Apr 18, 2025
4370be0
add some ecs parameters
11zhouxuan Apr 18, 2025
5cb72e3
add glm4-z1 models
11zhouxuan Apr 22, 2025
a17b54d
modify vllm backend
11zhouxuan Apr 22, 2025
4981a89
merge
11zhouxuan Apr 22, 2025
3649748
Merge branch 'main' of https://github.com/aws-samples/easy-model-depl…
11zhouxuan Apr 24, 2025
2749a29
Merge branch 'main' of https://github.com/aws-samples/easy-model-depl…
11zhouxuan Apr 24, 2025
a17a1f4
add qwen3
11zhouxuan Apr 29, 2025
8d37586
fix cli bugs
11zhouxuan Apr 29, 2025
b4ad1d3
merge
11zhouxuan Apr 29, 2025
1f1ab33
fix
11zhouxuan Apr 29, 2025
29fa142
add deepseek r1/Qwen3-235B-A22B
11zhouxuan May 7, 2025
a546df1
fix local deploy account bug
11zhouxuan May 8, 2025
ffef6b0
add qwen 3 awq models
11zhouxuan May 8, 2025
6441ef8
merge
11zhouxuan May 8, 2025
7047cae
fix serialize_utils bugs
11zhouxuan May 8, 2025
fb9aab6
modify qwen3 deployment
11zhouxuan May 14, 2025
eaa4b91
Merge branch 'main' of https://github.com/aws-samples/easy-model-depl…
11zhouxuan May 14, 2025
dd8edb5
modify docs
11zhouxuan May 15, 2025
8403262
fix:jina embedding v4
11zhouxuan Jul 9, 2025
72f2062
modify qwen3 engine;add strands client test
11zhouxuan Jul 16, 2025
0bfa146
modify engine
11zhouxuan Jul 17, 2025
223ce70
merge
11zhouxuan Jul 17, 2025
d801bb2
merge
11zhouxuan Jul 17, 2025
ddce3d4
Merge branch 'main' of https://github.com/aws-samples/easy-model-depl…
11zhouxuan Aug 5, 2025
1f9ba16
add gptoss models
11zhouxuan Aug 12, 2025
e18b237
Merge branch 'main' of https://github.com/aws-samples/easy-model-depl…
11zhouxuan Aug 13, 2025
72930b3
add openai_oss.py
11zhouxuan Aug 18, 2025
0a5e2f3
merge
11zhouxuan Aug 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/emd/models/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,8 @@ class KtransformersEngine(OpenAICompitableEngine):
"default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7"
})

# "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7"

vllm_qwq_engine073 = VllmEngine(**{
**vllm_qwen25vl72b_engine073.model_dump(),
"environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
Expand Down Expand Up @@ -280,6 +282,17 @@ class KtransformersEngine(OpenAICompitableEngine):
}
)

# vLLM engine profile used by the gpt-oss models.
# Every field is copied from ``vllm_engine064`` (via ``model_dump()``) and the
# three keyword arguments below replace the inherited values of the same name.
# NOTE(review): vLLM documents the sequence-limit flag as ``--max-num-seqs``;
# confirm that the ``--max_num_seq`` spelling used here is accepted by the
# vLLM build selected through the "gptoss" dockerfile version.
vllm_gptoss_engine = VllmEngine(
    **dict(
        vllm_engine064.model_dump(),
        engine_dockerfile_config={"VERSION": "gptoss"},
        environment_variables="export VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1",
        default_cli_args=" --async-scheduling --max_num_seq 5 --max_model_len 32000",
    )
)


tgi_llama3d3_engine301 = TgiEngine(
**{
"engine_type":EngineType.TGI,
Expand Down
4 changes: 3 additions & 1 deletion src/emd/models/llms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from . import (
glm,
internlm,
openai_oss,
qwen,
llama,
deepseek,
baichuan,
jina,
txgemma,
medgemma
medgemma,
openai_oss
)
86 changes: 86 additions & 0 deletions src/emd/models/llms/openai_oss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from ..engines import vllm_gptoss_engine
from .. import Model
from ..frameworks import fastapi_framework
from ..services import (
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
)
from emd.models.utils.constants import ModelType
from ..instances import (
g5d2xlarge_instance,
g5d4xlarge_instance,
g5d8xlarge_instance,
g5d12xlarge_instance,
g5d16xlarge_instance,
g5d24xlarge_instance,
g5d48xlarge_instance,
g6e2xlarge_instance,
local_instance
)
from ..utils.constants import ModelFilesDownloadSource
from ..model_series import GPTOSS_SERIES
# Register "gpt-oss-20b": served by ``vllm_gptoss_engine`` behind the FastAPI
# framework, deployable to SageMaker (sync/async), ECS, or locally.
Model.register(
    {
        "model_id": "gpt-oss-20b",
        "supported_engines": [vllm_gptoss_engine],
        "supported_instances": [
            g5d2xlarge_instance,
            g5d4xlarge_instance,
            g5d8xlarge_instance,
            g5d16xlarge_instance,
            # NOTE(review): g5d24xlarge_instance and g5d48xlarge_instance were
            # commented out of this list; the reason is not visible here.
            local_instance,
        ],
        "supported_services": [
            sagemaker_service,
            sagemaker_async_service,
            ecs_service,
            local_service,
        ],
        "supported_frameworks": [fastapi_framework],
        "allow_china_region": True,
        # The same repository id is used on Hugging Face and ModelScope.
        "huggingface_model_id": "openai/gpt-oss-20b",
        "modelscope_model_id": "openai/gpt-oss-20b",
        "require_huggingface_token": False,
        "application_scenario": "Agent, tool use, translation, summary",
        "description": "GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide open-source AI models, making advanced language models accessible to developers, researchers, and organizations. These models are designed for building, experimenting, and scaling generative AI applications while fostering innovation and collaboration in the open-source AI community.",
        "model_type": ModelType.LLM,
        "model_series": GPTOSS_SERIES,
    }
)


# Register "gpt-oss-120b": same engine, services and framework as the 20B
# entry, but restricted to the g5 12xlarge/24xlarge/48xlarge instances (plus
# local deployment).
Model.register(
    {
        "model_id": "gpt-oss-120b",
        "supported_engines": [vllm_gptoss_engine],
        "supported_instances": [
            g5d12xlarge_instance,
            g5d24xlarge_instance,
            g5d48xlarge_instance,
            local_instance,
        ],
        "supported_services": [
            sagemaker_service,
            sagemaker_async_service,
            ecs_service,
            local_service,
        ],
        "supported_frameworks": [fastapi_framework],
        "allow_china_region": True,
        # The same repository id is used on Hugging Face and ModelScope.
        "huggingface_model_id": "openai/gpt-oss-120b",
        "modelscope_model_id": "openai/gpt-oss-120b",
        "require_huggingface_token": False,
        "application_scenario": "Agent, tool use, translation, summary",
        "description": "GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide open-source AI models, making advanced language models accessible to developers, researchers, and organizations. These models are designed for building, experimenting, and scaling generative AI applications while fostering innovation and collaboration in the open-source AI community.",
        "model_type": ModelType.LLM,
        "model_series": GPTOSS_SERIES,
    }
)
6 changes: 6 additions & 0 deletions src/emd/models/model_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,12 @@
reference_link="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct"
)

# Series metadata shared by the gpt-oss model registrations.
GPTOSS_SERIES = ModelSeries(
    model_series_name=ModelSeriesType.GPTOSS,
    # Adjacent literals are concatenated at compile time into one string.
    description=(
        "GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide "
        "open-source AI models, making advanced language models accessible to "
        "developers, researchers, and organizations for building, experimenting, "
        "and scaling generative AI applications. These models are designed to "
        "foster innovation and collaboration in the open-source AI community."
    ),
    reference_link="https://openai.com/index/introducing-gpt-oss/",
)

DOTS_OCR_SERIES = ModelSeries(
model_series_name=ModelSeriesType.DOTS_OCR,
description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art(SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.",
Expand Down
1 change: 1 addition & 0 deletions src/emd/models/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,4 +235,5 @@ class ModelSeriesType(ConstantBase):
DEEPSEEK_REASONING_MODEL = "deepseek reasoning model"
DEEPSEEK_v3 = "deepseek v3"
BAICHUAN = "baichuan"
GPTOSS = "gptoss"
DOTS_OCR = "dots_ocr"
33 changes: 33 additions & 0 deletions src/emd/models/vlms/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
g5d24xlarge_instance,
g5d48xlarge_instance,
g6e2xlarge_instance,
g6e12xlarge_instance,
g6e24xlarge_instance,
g6e48xlarge_instance,
local_instance
)
from emd.models.utils.constants import ModelType
Expand Down Expand Up @@ -85,6 +88,36 @@
)
)

# Register "Qwen2.5-VL-72B-Instruct" as a VLM served by
# ``vllm_qwen25vl72b_engine084`` behind the FastAPI framework. ECS is not in
# this entry's service list: only SageMaker (sync/async) and local are offered.
Model.register(
    {
        "model_id": "Qwen2.5-VL-72B-Instruct",
        "supported_engines": [vllm_qwen25vl72b_engine084],
        "supported_instances": [
            g5d48xlarge_instance,
            g6e12xlarge_instance,
            g6e24xlarge_instance,
            g6e48xlarge_instance,
            local_instance,
        ],
        "supported_services": [
            sagemaker_service,
            sagemaker_async_service,
            local_service,
        ],
        "supported_frameworks": [fastapi_framework],
        "allow_china_region": True,
        # The same repository id is used on Hugging Face and ModelScope.
        "huggingface_model_id": "Qwen/Qwen2.5-VL-72B-Instruct",
        "modelscope_model_id": "Qwen/Qwen2.5-VL-72B-Instruct",
        "require_huggingface_token": False,
        "application_scenario": "vision llms for image understanding",
        "description": "The latest series of Qwen2.5 VL",
        "model_type": ModelType.VLM,
        "model_series": QWEN2VL_SERIES,
    }
)

Model.register(
dict(
model_id = "Qwen2.5-VL-32B-Instruct",
Expand Down