Skip to content
This repository was archived by the owner on Sep 20, 2025. It is now read-only.

Commit ed13fa1

Browse files
authored
Feat/add gptoss vllm (#184)
* merge * merge * add Mistral-Small-3.1-24B-Instruct-2503 * modify qwq-32b deploy * add txgemma model * modify model list command * fix typo * add some ecs parameters * add glm4-z1 models * modify vllm backend * add qwen3 * fix cli bugs * fix * add deepseek r1/Qwen3-235B-A22B * fix local deploy account bug * add qwen 3 awq models * fix serialize_utils bugs * modify qwen3 deployment * modify docs * modify qwen3 engine; add strands client test * modify engine * merge * add gptoss models * add openai_oss.py
1 parent 92e8fef commit ed13fa1

File tree

6 files changed

+142
-1
lines changed

6 files changed

+142
-1
lines changed

src/emd/models/engines.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ class KtransformersEngine(OpenAICompitableEngine):
222222
"default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7"
223223
})
224224

225+
# "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7"
226+
225227
vllm_qwq_engine073 = VllmEngine(**{
226228
**vllm_qwen25vl72b_engine073.model_dump(),
227229
"environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
@@ -280,6 +282,17 @@ class KtransformersEngine(OpenAICompitableEngine):
280282
}
281283
)
282284

285+
# Engine definition used by the gpt-oss models.
# Starts from a dump of every field of `vllm_engine064` and overrides three of
# them: the Dockerfile VERSION, the exported environment variables, and the
# default CLI arguments.
vllm_gptoss_engine = VllmEngine(
    **dict(
        vllm_engine064.model_dump(),
        engine_dockerfile_config={"VERSION": "gptoss"},
        environment_variables="export VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1",
        default_cli_args=" --async-scheduling --max_num_seq 5 --max_model_len 32000",
    )
)
294+
295+
283296
tgi_llama3d3_engine301 = TgiEngine(
284297
**{
285298
"engine_type":EngineType.TGI,

src/emd/models/llms/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
# Import the LLM definition submodules of this package so that their
# module-level code runs on package import (for example, `openai_oss` calls
# `Model.register(...)` at import time).
# Each submodule is listed exactly once; the order is kept as it was.
from . import (
    glm,
    internlm,
    openai_oss,
    qwen,
    llama,
    deepseek,
    baichuan,
    jina,
    txgemma,
    medgemma,
)
)

src/emd/models/llms/openai_oss.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from ..engines import vllm_gptoss_engine
2+
from .. import Model
3+
from ..frameworks import fastapi_framework
4+
from ..services import (
5+
sagemaker_service,
6+
sagemaker_async_service,
7+
ecs_service,
8+
local_service
9+
)
10+
from emd.models.utils.constants import ModelType
11+
from ..instances import (
12+
g5d2xlarge_instance,
13+
g5d4xlarge_instance,
14+
g5d8xlarge_instance,
15+
g5d12xlarge_instance,
16+
g5d16xlarge_instance,
17+
g5d24xlarge_instance,
18+
g5d48xlarge_instance,
19+
g6e2xlarge_instance,
20+
local_instance
21+
)
22+
from ..utils.constants import ModelFilesDownloadSource
23+
from ..model_series import GPTOSS_SERIES
24+
# Register the gpt-oss-20b model with its supported engine, instance types,
# services, and framework.
Model.register(
    {
        "model_id": "gpt-oss-20b",
        "supported_engines": [vllm_gptoss_engine],
        "supported_instances": [
            g5d2xlarge_instance,
            g5d4xlarge_instance,
            g5d8xlarge_instance,
            g5d16xlarge_instance,
            # g5d24xlarge_instance and g5d48xlarge_instance are currently
            # not enabled for this model.
            local_instance,
        ],
        "supported_services": [
            sagemaker_service,
            sagemaker_async_service,
            ecs_service,
            local_service,
        ],
        "supported_frameworks": [fastapi_framework],
        "allow_china_region": True,
        # Same repository id on both model hubs.
        "huggingface_model_id": "openai/gpt-oss-20b",
        "modelscope_model_id": "openai/gpt-oss-20b",
        "require_huggingface_token": False,
        "application_scenario": "Agent, tool use, translation, summary",
        "description": "GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide open-source AI models, making advanced language models accessible to developers, researchers, and organizations. These models are designed for building, experimenting, and scaling generative AI applications while fostering innovation and collaboration in the open-source AI community.",
        "model_type": ModelType.LLM,
        "model_series": GPTOSS_SERIES,
    }
)
56+
57+
58+
# Register the gpt-oss-120b model with its supported engine, instance types,
# services, and framework.
Model.register(
    {
        "model_id": "gpt-oss-120b",
        "supported_engines": [vllm_gptoss_engine],
        "supported_instances": [
            g5d12xlarge_instance,
            g5d24xlarge_instance,
            g5d48xlarge_instance,
            local_instance,
        ],
        "supported_services": [
            sagemaker_service,
            sagemaker_async_service,
            ecs_service,
            local_service,
        ],
        "supported_frameworks": [fastapi_framework],
        "allow_china_region": True,
        # Same repository id on both model hubs.
        "huggingface_model_id": "openai/gpt-oss-120b",
        "modelscope_model_id": "openai/gpt-oss-120b",
        "require_huggingface_token": False,
        "application_scenario": "Agent, tool use, translation, summary",
        "description": "GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide open-source AI models, making advanced language models accessible to developers, researchers, and organizations. These models are designed for building, experimenting, and scaling generative AI applications while fostering innovation and collaboration in the open-source AI community.",
        "model_type": ModelType.LLM,
        "model_series": GPTOSS_SERIES,
    }
)

src/emd/models/model_series.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,12 @@
158158
reference_link="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct"
159159
)
160160

161+
# Series metadata shared by the gpt-oss model registrations: series name,
# a human-readable description, and a reference link.
GPTOSS_SERIES = ModelSeries(
    model_series_name=ModelSeriesType.GPTOSS,
    # Adjacent literals are concatenated into one description string.
    description=(
        "GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide "
        "open-source AI models, making advanced language models accessible to "
        "developers, researchers, and organizations for building, experimenting, "
        "and scaling generative AI applications. These models are designed to "
        "foster innovation and collaboration in the open-source AI community."
    ),
    reference_link="https://openai.com/index/introducing-gpt-oss/",
)
166+
161167
DOTS_OCR_SERIES = ModelSeries(
162168
model_series_name=ModelSeriesType.DOTS_OCR,
163169
description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art(SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.",

src/emd/models/utils/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,4 +235,5 @@ class ModelSeriesType(ConstantBase):
235235
DEEPSEEK_REASONING_MODEL = "deepseek reasoning model"
236236
DEEPSEEK_v3 = "deepseek v3"
237237
BAICHUAN = "baichuan"
238+
GPTOSS = "gptoss"
238239
DOTS_OCR = "dots_ocr"

src/emd/models/vlms/qwen.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222
g5d24xlarge_instance,
2323
g5d48xlarge_instance,
2424
g6e2xlarge_instance,
25+
g6e12xlarge_instance,
26+
g6e24xlarge_instance,
27+
g6e48xlarge_instance,
2528
local_instance
2629
)
2730
from emd.models.utils.constants import ModelType
@@ -85,6 +88,36 @@
8588
)
8689
)
8790

91+
# Register the Qwen2.5-VL-72B-Instruct vision-language model with its
# supported engine, instance types, services, and framework.
Model.register(
    {
        "model_id": "Qwen2.5-VL-72B-Instruct",
        "supported_engines": [vllm_qwen25vl72b_engine084],
        "supported_instances": [
            g5d48xlarge_instance,
            g6e12xlarge_instance,
            g6e24xlarge_instance,
            g6e48xlarge_instance,
            local_instance,
        ],
        # Note: no ECS service in this list.
        "supported_services": [
            sagemaker_service,
            sagemaker_async_service,
            local_service,
        ],
        "supported_frameworks": [fastapi_framework],
        "allow_china_region": True,
        # Same repository id on both model hubs.
        "huggingface_model_id": "Qwen/Qwen2.5-VL-72B-Instruct",
        "modelscope_model_id": "Qwen/Qwen2.5-VL-72B-Instruct",
        "require_huggingface_token": False,
        "application_scenario": "vision llms for image understanding",
        "description": "The latest series of Qwen2.5 VL",
        "model_type": ModelType.VLM,
        "model_series": QWEN2VL_SERIES,
    }
)
120+
88121
Model.register(
89122
dict(
90123
model_id = "Qwen2.5-VL-32B-Instruct",

0 commit comments

Comments
 (0)