1 change: 1 addition & 0 deletions src/emd/commands/deploy.py
@@ -399,6 +399,7 @@ def deploy(
     else:
         gpu_num = get_gpu_num()
         support_gpu_num = model.supported_instances[0].gpu_num
+        support_gpu_num = support_gpu_num or gpu_num
         default_gpus_str = ",".join([str(i) for i in range(min(gpu_num,support_gpu_num))])
         gpus_to_deploy = questionary.text(
             "input the local gpu ids to deploy the model (e.g. 0,1,2):",
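The one-line fix above matters because an instance definition can leave `gpu_num` unset (the new DeepSeek-R1 entry below supports only `local_instance`), in which case `min(gpu_num, support_gpu_num)` raises a `TypeError`. A minimal sketch of the failure mode and the fallback; `default_gpu_ids` is a hypothetical helper, not the emd source:

```python
# Hypothetical helper illustrating the fallback; not the emd source.
def default_gpu_ids(gpu_num: int, support_gpu_num) -> str:
    # If the instance metadata carries no GPU count (None or 0),
    # fall back to the number of GPUs detected on the local machine.
    support_gpu_num = support_gpu_num or gpu_num
    return ",".join(str(i) for i in range(min(gpu_num, support_gpu_num)))

assert default_gpu_ids(4, None) == "0,1,2,3"  # previously: TypeError on min()
assert default_gpu_ids(4, 2) == "0,1"         # still capped by instance support
```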
11 changes: 8 additions & 3 deletions src/emd/models/engines.py
@@ -108,6 +108,11 @@ class KtransformersEngine(OpenAICompitableEngine):

 vllm_deepseek_r1_distill_llama_engine071 = vllm_deepseek_r1_distill_qwen_engine071

+vllm_deepseek_r1_engine084 = VllmEngine(**{
+    **vllm_engine064.model_dump(),
+    "engine_dockerfile_config": {"VERSION":"v0.8.4"},
+    "default_cli_args": "--max_num_seq 10 --max_model_len 16000 --chat-template emd/models/chat_templates/deepseek_r1.jinja"
+})

 vllm_qwen2d5_72b_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
@@ -140,9 +145,9 @@ class KtransformersEngine(OpenAICompitableEngine):

 vllm_qwen3_engin084 = VllmEngine(**{
     **vllm_engine064.model_dump(),
-    "engine_dockerfile_config": {"VERSION":"v0.8.4"},
+    "engine_dockerfile_config": {"VERSION":"v0.8.5"},
     "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
-    "default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
+    "default_cli_args": " --max_model_len 16000 --max_num_seq 30 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
 })


@@ -165,7 +170,7 @@ class KtransformersEngine(OpenAICompitableEngine):
     "engine_dockerfile_config": {"VERSION":"v0.8.4"},
     "dockerfile_name":"Dockerfile_qwen25_vl",
     "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
-    "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
+    "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7"
 })

 vllm_qwq_engine073 = VllmEngine(**{
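Each engine variant above is built by dumping an existing pydantic model to a plain dict and spreading it into a new constructor call; keys listed after the spread override the inherited values. A reduced sketch of the pattern, with a stand-in `Engine` model rather than the real `VllmEngine`:

```python
from pydantic import BaseModel

# Stand-in for emd's VllmEngine; assumes it is a pydantic BaseModel.
class Engine(BaseModel):
    engine_dockerfile_config: dict
    default_cli_args: str = ""

base = Engine(engine_dockerfile_config={"VERSION": "v0.6.4"})

# model_dump() yields the base engine's fields as a dict; in a dict
# literal, later keys win, so only the overridden fields change.
derived = Engine(**{
    **base.model_dump(),
    "engine_dockerfile_config": {"VERSION": "v0.8.4"},
})

print(derived.engine_dockerfile_config)  # {'VERSION': 'v0.8.4'}
print(derived.default_cli_args)          # "" (inherited from base)
```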
26 changes: 26 additions & 0 deletions src/emd/models/llms/deepseek.py
@@ -7,6 +7,7 @@
     llama_cpp_deepseek_r1_distill_engineb9ab0a4,
     tgi_deepseek_r1_llama_70b_engine301,
     ktransformers_engine,
+    vllm_deepseek_r1_engine084
 )
 from ..services import (
     sagemaker_service,
@@ -450,6 +451,31 @@
     )
 )

+Model.register(
+    dict(
+        model_id = "DeepSeek-R1",
+        supported_engines=[vllm_deepseek_r1_engine084],
+        supported_instances=[
+            local_instance
+        ],
+        supported_services=[
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        need_prepare_model=False,
+        huggingface_model_id="unsloth/DeepSeek-R1",
+        modelscope_model_id="unsloth/DeepSeek-R1",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="The latest series of DeepSeek LLMs for reasoning",
+        model_type=ModelType.LLM,
+        model_series=DEEPSEEK_REASONING_MODEL
+    )
+)
+
 Model.register(
     dict(
         model_id = "deepseek-r1-671b-4bit_gguf",
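`Model.register` calls like the one above populate a model catalog keyed by `model_id`. The registry itself is not part of this diff; what follows is a hypothetical minimal version, purely to illustrate how such registrations are typically consumed, and it certainly differs from the real emd `Model` class in detail:

```python
# Hypothetical registry sketch; not the emd implementation.
class Model:
    _registry: dict = {}

    @classmethod
    def register(cls, spec: dict) -> None:
        # Index each spec by model_id so deploy-time code can look it up.
        cls._registry[spec["model_id"]] = spec

    @classmethod
    def get(cls, model_id: str) -> dict:  # hypothetical accessor
        return cls._registry[model_id]

Model.register(dict(model_id="DeepSeek-R1",
                    supported_engines=["vllm_deepseek_r1_engine084"]))
print(Model.get("DeepSeek-R1")["supported_engines"])
```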
89 changes: 88 additions & 1 deletion src/emd/models/llms/qwen.py
@@ -578,7 +578,7 @@
         g5d4xlarge_instance,
         g5d8xlarge_instance,
         g5d16xlarge_instance,
-        g4dn2xlarge_instance,
+        # g4dn2xlarge_instance,
         # g5d24xlarge_instance,
         # g5d48xlarge_instance,
         local_instance
@@ -671,6 +671,44 @@
     )
 )

+
+# ValueError("type fp8e4nv not supported in this architecture. The supported fp8 dtypes are ('fp8e4b15', 'fp8e5')")
+# The g5 instance may not support fp8e4nv
+# Model.register(
+#     dict(
+#         model_id = "Qwen3-14B-FP8",
+#         supported_engines=[vllm_qwen3_engin084],
+#         supported_instances=[
+#             g5d2xlarge_instance,
+#             g5d4xlarge_instance,
+#             g5d8xlarge_instance,
+#             g5d16xlarge_instance,
+#             # g4dn2xlarge_instance,
+#             # g5d24xlarge_instance,
+#             # g5d48xlarge_instance,
+#             local_instance
+#         ],
+#         supported_services=[
+#             sagemaker_service,
+#             sagemaker_async_service,
+#             ecs_service,
+#             local_service
+#         ],
+#         supported_frameworks=[
+#             fastapi_framework
+#         ],
+#         allow_china_region=True,
+#         huggingface_model_id="Qwen/Qwen3-14B-FP8",
+#         modelscope_model_id="Qwen/Qwen3-14B-FP8",
+#         require_huggingface_token=False,
+#         application_scenario="Agent, tool use, translation, summary",
+#         description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
+#         model_type=ModelType.LLM,
+#         model_series=QWEN3_SERIES
+#     )
+# )
+
+
 Model.register(
     dict(
         model_id = "Qwen3-32B",
@@ -735,3 +773,52 @@
         model_series=QWEN3_SERIES
     )
 )
+
+
+Model.register(
+    dict(
+        model_id = "Qwen3-235B-A22B",
+        supported_engines=[vllm_qwen3_engin084],
+        supported_instances=[
+            local_instance
+        ],
+        supported_services=[
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="Qwen/Qwen3-235B-A22B",
+        modelscope_model_id="Qwen/Qwen3-235B-A22B",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
+        model_type=ModelType.LLM,
+        model_series=QWEN3_SERIES
+    )
+)
+
+Model.register(
+    dict(
+        model_id = "Qwen3-235B-A22B-FP8",
+        supported_engines=[vllm_qwen3_engin084],
+        supported_instances=[
+            local_instance
+        ],
+        supported_services=[
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="Qwen/Qwen3-235B-A22B-FP8",
+        modelscope_model_id="Qwen/Qwen3-235B-A22B-FP8",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
+        model_type=ModelType.LLM,
+        model_series=QWEN3_SERIES
+    )
+)