diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index 6e972d1..8ee52d1 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -222,6 +222,8 @@ class KtransformersEngine(OpenAICompitableEngine): "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7" }) +# "default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7" + vllm_qwq_engine073 = VllmEngine(**{ **vllm_qwen25vl72b_engine073.model_dump(), "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", @@ -280,6 +282,17 @@ class KtransformersEngine(OpenAICompitableEngine): } ) +vllm_gptoss_engine = VllmEngine( + **{ + **vllm_engine064.model_dump(), + "engine_dockerfile_config": {"VERSION":"gptoss"}, + "environment_variables": "export VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1", + "default_cli_args": " --async-scheduling --max_num_seq 5 --max_model_len 32000" + + } +) + + tgi_llama3d3_engine301 = TgiEngine( **{ "engine_type":EngineType.TGI, diff --git a/src/emd/models/llms/__init__.py b/src/emd/models/llms/__init__.py index 9ea306d..56b12fd 100644 --- a/src/emd/models/llms/__init__.py +++ b/src/emd/models/llms/__init__.py @@ -1,11 +1,12 @@ from . import ( glm, internlm, + openai_oss, qwen, llama, deepseek, baichuan, jina, txgemma, medgemma ) diff --git a/src/emd/models/llms/openai_oss.py b/src/emd/models/llms/openai_oss.py new file mode 100644 index 0000000..89a437e --- /dev/null +++ b/src/emd/models/llms/openai_oss.py @@ -0,0 +1,86 @@ +from ..engines import vllm_gptoss_engine +from .. 
import Model +from ..frameworks import fastapi_framework +from ..services import ( + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service +) +from emd.models.utils.constants import ModelType +from ..instances import ( + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d12xlarge_instance, + g5d16xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + g6e2xlarge_instance, + local_instance +) +from ..utils.constants import ModelFilesDownloadSource +from ..model_series import GPTOSS_SERIES +Model.register( + dict( + model_id = "gpt-oss-20b", + supported_engines=[vllm_gptoss_engine], + supported_instances=[ + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d16xlarge_instance, + # g5d24xlarge_instance, + # g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="openai/gpt-oss-20b", + modelscope_model_id="openai/gpt-oss-20b", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide open-source AI models, making advanced language models accessible to developers, researchers, and organizations. 
These models are designed for building, experimenting, and scaling generative AI applications while fostering innovation and collaboration in the open-source AI community.", + model_type=ModelType.LLM, + model_series=GPTOSS_SERIES + ) +) + + +Model.register( + dict( + model_id = "gpt-oss-120b", + supported_engines=[vllm_gptoss_engine], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="openai/gpt-oss-120b", + modelscope_model_id="openai/gpt-oss-120b", + require_huggingface_token=False, + application_scenario="Agent, tool use, translation, summary", + description="GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide open-source AI models, making advanced language models accessible to developers, researchers, and organizations. These models are designed for building, experimenting, and scaling generative AI applications while fostering innovation and collaboration in the open-source AI community.", + model_type=ModelType.LLM, + model_series=GPTOSS_SERIES + ) +) diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py index 32ab1e5..3de2efe 100644 --- a/src/emd/models/model_series.py +++ b/src/emd/models/model_series.py @@ -158,6 +158,12 @@ reference_link="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct" ) +GPTOSS_SERIES = ModelSeries( + model_series_name=ModelSeriesType.GPTOSS, + description="GPT-OSS (GPT Open Source Software) is OpenAI's initiative to provide open-source AI models, making advanced language models accessible to developers, researchers, and organizations for building, experimenting, and scaling generative AI applications. 
These models are designed to foster innovation and collaboration in the open-source AI community.", + reference_link="https://openai.com/index/introducing-gpt-oss/" +) + DOTS_OCR_SERIES = ModelSeries( model_series_name=ModelSeriesType.DOTS_OCR, description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art(SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.", diff --git a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py index 06039d2..f7ef3b3 100644 --- a/src/emd/models/utils/constants.py +++ b/src/emd/models/utils/constants.py @@ -235,4 +235,5 @@ class ModelSeriesType(ConstantBase): DEEPSEEK_REASONING_MODEL = "deepseek reasoning model" DEEPSEEK_v3 = "deepseek v3" BAICHUAN = "baichuan" + GPTOSS = "gptoss" DOTS_OCR = "dots_ocr" diff --git a/src/emd/models/vlms/qwen.py b/src/emd/models/vlms/qwen.py index e0dae20..cb0ed5c 100644 --- a/src/emd/models/vlms/qwen.py +++ b/src/emd/models/vlms/qwen.py @@ -22,6 +22,9 @@ g5d24xlarge_instance, g5d48xlarge_instance, g6e2xlarge_instance, + g6e12xlarge_instance, + g6e24xlarge_instance, + g6e48xlarge_instance, local_instance ) from emd.models.utils.constants import ModelType @@ -85,6 +88,36 @@ ) ) +Model.register( + dict( + model_id = "Qwen2.5-VL-72B-Instruct", + supported_engines=[vllm_qwen25vl72b_engine084], + supported_instances=[ + g5d48xlarge_instance, + g6e12xlarge_instance, + g6e24xlarge_instance, + g6e48xlarge_instance, + local_instance + ], + supported_services=[ + sagemaker_service, + sagemaker_async_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="Qwen/Qwen2.5-VL-72B-Instruct", + modelscope_model_id="Qwen/Qwen2.5-VL-72B-Instruct", + 
require_huggingface_token=False, + application_scenario="vision llms for image understanding", + description="The latest series of Qwen2.5 VL", + model_type=ModelType.VLM, + model_series=QWEN2VL_SERIES + ) +) + Model.register( dict( model_id = "Qwen2.5-VL-32B-Instruct",