diff --git a/src/emd/commands/deploy.py b/src/emd/commands/deploy.py index 3d0b001a..9314377a 100644 --- a/src/emd/commands/deploy.py +++ b/src/emd/commands/deploy.py @@ -399,6 +399,7 @@ def deploy( else: gpu_num = get_gpu_num() support_gpu_num = model.supported_instances[0].gpu_num + support_gpu_num = support_gpu_num or gpu_num default_gpus_str = ",".join([str(i) for i in range(min(gpu_num,support_gpu_num))]) gpus_to_deploy = questionary.text( "input the local gpu ids to deploy the model (e.g. 0,1,2):", diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index fa1ecccb..d16ffc38 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -108,6 +108,11 @@ class KtransformersEngine(OpenAICompitableEngine): vllm_deepseek_r1_distill_llama_engine071 = vllm_deepseek_r1_distill_qwen_engine071 +vllm_deepseek_r1_engine084 = VllmEngine(**{ + **vllm_engine064.model_dump(), + "engine_dockerfile_config": {"VERSION":"v0.8.4"}, + "default_cli_args": "--max_num_seq 10 --max_model_len 16000 --chat-template emd/models/chat_templates/deepseek_r1.jinja" +}) vllm_qwen2d5_72b_engine064 = VllmEngine(**{ **vllm_engine064.model_dump(), @@ -140,9 +145,9 @@ class KtransformersEngine(OpenAICompitableEngine): vllm_qwen3_engin084 = VllmEngine(**{ **vllm_engine064.model_dump(), - "engine_dockerfile_config": {"VERSION":"v0.8.4"}, + "engine_dockerfile_config": {"VERSION":"v0.8.5"}, "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", - "default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching" + "default_cli_args": " --max_model_len 16000 --max_num_seq 30 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching" }) @@ -165,7 +170,7 @@ class KtransformersEngine(OpenAICompitableEngine): "engine_dockerfile_config": 
{"VERSION":"v0.8.4"}, "dockerfile_name":"Dockerfile_qwen25_vl", "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", - "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9" + "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7" }) vllm_qwq_engine073 = VllmEngine(**{ diff --git a/src/emd/models/llms/deepseek.py b/src/emd/models/llms/deepseek.py index 1359ac33..a022329d 100644 --- a/src/emd/models/llms/deepseek.py +++ b/src/emd/models/llms/deepseek.py @@ -7,6 +7,7 @@ llama_cpp_deepseek_r1_distill_engineb9ab0a4, tgi_deepseek_r1_llama_70b_engine301, ktransformers_engine, + vllm_deepseek_r1_engine084 ) from ..services import ( sagemaker_service, @@ -450,6 +451,31 @@ ) ) +Model.register( + dict( + model_id = "DeepSeek-R1", + supported_engines=[vllm_deepseek_r1_engine084], + supported_instances=[ + local_instance + ], + supported_services=[ + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + need_prepare_model=False, + huggingface_model_id="unsloth/DeepSeek-R1", + modelscope_model_id="unsloth/DeepSeek-R1", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of DeepSeek LLMs for reasoning", + model_type=ModelType.LLM, + model_series=DEEPSEEK_REASONING_MODEL + ) +) + Model.register( dict( model_id = "deepseek-r1-671b-4bit_gguf", diff --git a/src/emd/models/llms/qwen.py b/src/emd/models/llms/qwen.py index 7ea4d3d6..d0d423e3 100644 --- a/src/emd/models/llms/qwen.py +++ b/src/emd/models/llms/qwen.py @@ -578,7 +578,7 @@ g5d4xlarge_instance, g5d8xlarge_instance, g5d16xlarge_instance, - g4dn2xlarge_instance, + # g4dn2xlarge_instance, # g5d24xlarge_instance, # g5d48xlarge_instance, local_instance @@ -671,6 +671,44 @@ ) ) + +# 
ValueError("type fp8e4nv not supported in this architecture. The supported fp8 dtypes are ('fp8e4b15', 'fp8e5')") +# The g5 instance may not support fp8e4nv +# Model.register( +# dict( +# model_id = "Qwen3-14B-FP8", +# supported_engines=[vllm_qwen3_engin084], +# supported_instances=[ +# g5d2xlarge_instance, +# g5d4xlarge_instance, +# g5d8xlarge_instance, +# g5d16xlarge_instance, +# # g4dn2xlarge_instance, +# # g5d24xlarge_instance, +# # g5d48xlarge_instance, +# local_instance +# ], +# supported_services=[ +# sagemaker_service, +# sagemaker_async_service, +# ecs_service, +# local_service +# ], +# supported_frameworks=[ +# fastapi_framework +# ], +# allow_china_region=True, +# huggingface_model_id="Qwen/Qwen3-14B-FP8", +# modelscope_model_id="Qwen/Qwen3-14B-FP8", +# require_huggingface_token=False, +# application_scenario="Agent, tool use, translation, summary", +# description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", +# model_type=ModelType.LLM, +# model_series=QWEN3_SERIES +# ) +# ) + + Model.register( dict( model_id = "Qwen3-32B", @@ -735,3 +773,52 @@ model_series=QWEN3_SERIES ) ) + + +Model.register( + dict( + model_id = "Qwen3-235B-A22B", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + local_instance + ], + supported_services=[ + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-235B-A22B", + modelscope_model_id="Qwen/Qwen3-235B-A22B", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math 
skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +) + +Model.register( + dict( + model_id = "Qwen3-235B-A22B-FP8", + supported_engines=[vllm_qwen3_engin084], + supported_instances=[ + local_instance + ], + supported_services=[ + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen3-235B-A22B-FP8", + modelscope_model_id="Qwen/Qwen3-235B-A22B-FP8", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="The latest series of Qwen LLMs, offers base and tuned models from 0.6B to 235B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.", + model_type=ModelType.LLM, + model_series=QWEN3_SERIES + ) +)