diff --git a/docs/en/best_deployment_practices.md b/docs/en/best_deployment_practices.md
index 26fae4d3..d4747e57 100644
--- a/docs/en/best_deployment_practices.md
+++ b/docs/en/best_deployment_practices.md
@@ -64,6 +64,48 @@ emd deploy --model-id Qwen2.5-14B-Instruct-AWQ --instance-type g4dn.2xlarge --en
 }'
 ```
 
+### Example: Customize model download methods
+- You can load models from different locations by adding appropriate values to the `extra-params` parameter
+1. Load model from S3
+```json
+{
+    "model_params":{
+        "model_files_s3_path":""
+    }
+}
+```
+2. Load model from local path (only applicable for local deployment)
+```json
+{
+    "model_params": { "model_files_local_path":""
+    }
+}
+```
+3. Skip downloading and uploading model files in CodeBuild, which will significantly reduce deployment time
+```json
+{
+    "model_params": {
+        "need_prepare_model":false
+    }
+}
+```
+4. Specify the download source for model files
+```json
+{
+    "model_params":{
+        "model_files_download_source":"huggingface|modelscope|auto(default)"
+    }
+}
+```
+5. Specify the model ID on Hugging Face or ModelScope
+```json
+{
+    "model_params": {
+        "huggingface_model_id":"model id on huggingface","modelscope_model_id":"model id on modelscope"
+    }
+}
+```
+
 ## Environmental variables
 
 - `LOCAL_DEPLOY_PORT: ` Local deployment port, default: `8080`
diff --git a/src/emd/constants.py b/src/emd/constants.py
index a3f949b1..53dd60c8 100644
--- a/src/emd/constants.py
+++ b/src/emd/constants.py
@@ -1,4 +1,5 @@
 from .revision import VERSION, convert_version_name_to_stack_name
+import os
 ENV_STACK_NAME = f'EMD-Env'
 MODEL_STACK_NAME_PREFIX = f"EMD-Model"
 ENV_BUCKET_NAME_PREFIX = "emd-env-artifactbucket"
@@ -25,3 +26,8 @@ LOCAL_REGION = "local"
 
 
 # EMD_USE_NO_PROFILE_CHOICE = "Don't set"
+
+LOCAL_DEPLOY_PIPELINE_ZIP_DIR = os.path.join(
+    os.path.expanduser("~"),
+    f"emd_{VERSION}"
+)
diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
index 0dc243f7..0bc13595 100644
--- a/src/emd/models/engines.py
+++ b/src/emd/models/engines.py
@@ -78,6 +78,18 @@ class KtransformersEngine(OpenAICompitableEngine):
 )
 
 
+vllm_texgemma082 = VllmEngine(**{
+    "engine_type":EngineType.VLLM,
+    "engine_dockerfile_config": {"VERSION":"v0.8.2"},
+    "engine_cls":"vllm.vllm_backend.VLLMBackend",
+    "base_image_host":"public.ecr.aws",
+    "use_public_ecr":True,
+    "docker_login_region":"us-east-1",
+    "default_cli_args": " --max_num_seq 10 --disable-log-stats"
+}
+)
+
+
 vllm_mistral_small_engine082 = VllmEngine(
     **{
         **vllm_engine064.model_dump(),
diff --git a/src/emd/models/llms/__init__.py b/src/emd/models/llms/__init__.py
index e823ab18..2750df0c 100644
--- a/src/emd/models/llms/__init__.py
+++ b/src/emd/models/llms/__init__.py
@@ -5,5 +5,6 @@
     llama,
     deepseek,
     baichuan,
-    jina
+    jina,
+    txgemma
 )
diff --git a/src/emd/models/llms/deepseek.py b/src/emd/models/llms/deepseek.py
index df61526b..1359ac33 100644
--- a/src/emd/models/llms/deepseek.py
+++ b/src/emd/models/llms/deepseek.py
@@ -334,33 +334,33 @@
     )
 )
 
-Model.register(
-    dict(
-        model_id = "deepseek-r1-671b-1.58bit_ollama",
-        supported_engines=[ollama_deepseek_r1_qwen2d5_1d5b_engine057],
-        supported_instances=[
-            g5d48xlarge_instance,
-            local_instance
-        ],
-        supported_services=[
-            sagemaker_service,
-            sagemaker_async_service,
-            ecs_service,
-            local_service
-        ],
-        supported_frameworks=[
-            fastapi_framework
-        ],
-        allow_china_region=False,
-        ollama_model_id="SIGJNF/deepseek-r1-671b-1.58bit",
-        # modelscope_model_id="Qwen/Qwen2.5-14B-Instruct",
-        require_huggingface_token=False,
tool use, translation, summary", - description="The latest series of DeepSeek LLMs for reasoning", - model_type=ModelType.LLM, - model_series=DEEPSEEK_REASONING_MODEL - ) -) +# Model.register( +# dict( +# model_id = "deepseek-r1-671b-1.58bit_ollama", +# supported_engines=[ollama_deepseek_r1_qwen2d5_1d5b_engine057], +# supported_instances=[ +# g5d48xlarge_instance, +# local_instance +# ], +# supported_services=[ +# sagemaker_service, +# sagemaker_async_service, +# ecs_service, +# local_service +# ], +# supported_frameworks=[ +# fastapi_framework +# ], +# allow_china_region=False, +# ollama_model_id="SIGJNF/deepseek-r1-671b-1.58bit", +# # modelscope_model_id="Qwen/Qwen2.5-14B-Instruct", +# require_huggingface_token=False, +# application_scenario="Agent, tool use, translation, summary", +# description="The latest series of DeepSeek LLMs for reasoning", +# model_type=ModelType.LLM, +# model_series=DEEPSEEK_REASONING_MODEL +# ) +# ) Model.register( diff --git a/src/emd/models/llms/txgemma.py b/src/emd/models/llms/txgemma.py new file mode 100644 index 00000000..0c4e29ff --- /dev/null +++ b/src/emd/models/llms/txgemma.py @@ -0,0 +1,91 @@ +from ..engines import vllm_texgemma082 +from .. import Model +from ..frameworks import fastapi_framework +from ..services import ( + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service +) +from emd.models.utils.constants import ModelType +from ..model_series import TXGEMMA_SERIES +from ..instances import ( + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d12xlarge_instance, + g5d16xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + g6e2xlarge_instance, + local_instance +) +from ..utils.constants import ModelFilesDownloadSource + + +Model.register( + dict( + model_id = "txgemma-9b-chat", + supported_engines=[vllm_texgemma082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + g5d2xlarge_instance, + g5d4xlarge_instance, + g5d8xlarge_instance, + g5d16xlarge_instance, + local_instance + ], + disable_hf_transfer=True, + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + huggingface_model_id="google/txgemma-9b-chat", + modelscope_model_id="AI-ModelScope/txgemma-9b-chat", + model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, + # require_huggingface_token=True, + application_scenario="llms for the development of therapeutics.", + description="The latest series of txgemma", + model_type=ModelType.LLM, + model_series=TXGEMMA_SERIES, + ) +) + + +Model.register( + dict( + model_id = "txgemma-27b-chat", + supported_engines=[vllm_texgemma082], + supported_instances=[ + g5d12xlarge_instance, + g5d24xlarge_instance, + g5d48xlarge_instance, + local_instance + ], + disable_hf_transfer=True, + supported_services=[ + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service + ], + supported_frameworks=[ + fastapi_framework + ], + huggingface_model_id="google/txgemma-27b-chat", + modelscope_model_id="AI-ModelScope/txgemma-27b-chat", + model_files_download_source=ModelFilesDownloadSource.MODELSCOPE, + # require_huggingface_token=True, + application_scenario="llms for the development of therapeutics.", + description="The latest series of txgemma", + model_type=ModelType.LLM, + model_series=TXGEMMA_SERIES, + ) +) diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py index ceae13ad..896fa512 100644 --- 
--- a/src/emd/models/model_series.py
+++ b/src/emd/models/model_series.py
@@ -97,6 +97,13 @@
     reference_link="https://blog.google/technology/developers/gemma-3/"
 )
 
+TXGEMMA_SERIES = ModelSeries(
+    model_series_name=ModelSeriesType.TXGEMMA,
+    description="TXGemma is a series of open models to accelerate the development of therapeutics.",
+    reference_link="https://huggingface.co/collections/google/txgemma-release-67dd92e931c857d15e4d1e87"
+)
+
+
 MISTRAL_SERIES = ModelSeries(
     model_series_name=ModelSeriesType.MISTRAL,
     description="LLMs and VLMs provided by MISTRAL AI.",
diff --git a/src/emd/models/services.py b/src/emd/models/services.py
index 859a12c9..824792f0 100644
--- a/src/emd/models/services.py
+++ b/src/emd/models/services.py
@@ -91,7 +91,7 @@
         "ServiceType":"service_type",
         "EngineType":"engine_type",
         "Region": "region",
-        "DesiredCapacity": "desired_capacity",
+        "DesiredCapacity": ValueWithDefault(name="desired_capacity",default=1),
         "ContainerCpu": "container_cpu",
         "ContainerMemory": "container_memory",
         "ContainerGpu":"instance_gpu_num"
diff --git a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py
index d78414e6..27311173 100644
--- a/src/emd/models/utils/constants.py
+++ b/src/emd/models/utils/constants.py
@@ -214,6 +214,7 @@ def get_service_quota_code(cls, instance_type: str):
 
 class ModelSeriesType(ConstantBase):
     GEMMA3 = "gemma3"
+    TXGEMMA = "txgemma"
     MISTRAL = "mistral"
     QWEN2D5 = "qwen2.5"
     GLM4 = "glm4"
diff --git a/src/emd/models/vlms/gemma3.py b/src/emd/models/vlms/gemma3.py
index 4f049787..822cddd6 100644
--- a/src/emd/models/vlms/gemma3.py
+++ b/src/emd/models/vlms/gemma3.py
@@ -10,6 +10,7 @@
 from emd.models.utils.constants import ModelType
 from ..model_series import Gemma3_SERIES
 from ..instances import (
+    g4dn12xlarge_instance,
     g5d2xlarge_instance,
     g5d4xlarge_instance,
     g5d8xlarge_instance,
@@ -43,6 +44,7 @@
         supported_frameworks=[
             fastapi_framework
         ],
+        allow_china_region = True,
         modelscope_model_id="LLM-Research/gemma-3-4b-it",
         model_files_download_source=ModelFilesDownloadSource.MODELSCOPE,
         # require_huggingface_token=False,
@@ -74,6 +76,7 @@
         supported_frameworks=[
             fastapi_framework
         ],
+        allow_china_region = True,
         # huggingface_model_id="google/gemma-3-12b-it",
         # require_huggingface_token=False,
         modelscope_model_id="LLM-Research/gemma-3-12b-it",
@@ -106,6 +109,7 @@
         supported_frameworks=[
             fastapi_framework
         ],
+        allow_china_region = True,
         # huggingface_model_id="unsloth/gemma-3-27b-it",
         modelscope_model_id="LLM-Research/gemma-3-27b-it",
         model_files_download_source=ModelFilesDownloadSource.MODELSCOPE,
diff --git a/src/emd/sdk/deploy.py b/src/emd/sdk/deploy.py
index 6426215d..03443ebf 100644
--- a/src/emd/sdk/deploy.py
+++ b/src/emd/sdk/deploy.py
@@ -14,8 +14,10 @@
     MODEL_DEFAULT_TAG,
     MODEL_STACK_NAME_PREFIX,
     VERSION,
-    LOCAL_REGION
+    LOCAL_REGION,
+    LOCAL_DEPLOY_PIPELINE_ZIP_DIR
 )
+from emd.utils.file_utils import mkdir_with_mode
 from emd.models import Model
 from emd.models.utils.constants import FrameworkType, ServiceType,InstanceType
 from emd.models.utils.serialize_utils import dump_extra_params
@@ -318,7 +320,10 @@ def deploy_local(
     # region: Optional[str] = None,
     # model_stack_name=None,
     extra_params=None,
-    pipeline_zip_local_path=f"/tmp/emd_{VERSION}/pipeline.zip",
+    pipeline_zip_local_path=os.path.join(
+        LOCAL_DEPLOY_PIPELINE_ZIP_DIR,
+        "pipeline.zip"
+    ),
     # env_stack_on_failure = "ROLLBACK",
     # force_env_stack_update = False,
     # waiting_until_deploy_complete = True
@@ -328,7 +333,9 @@ def deploy_local(
     logger.info(f"parsed extra_params: {extra_params}")
     extra_params = dump_extra_params(extra_params or {})
     dir = os.path.dirname(pipeline_zip_local_path)
-    os.makedirs(dir, exist_ok=True)
+
+    mkdir_with_mode(dir, exist_ok=True,mode=0o777)
+    # os.makedirs(dir, exist_ok=True,mode=0o777)
     with open(pipeline_zip_local_path, "wb") as f:
         buffer = ziped_pipeline()
         f.write(buffer.read())
diff --git a/src/emd/utils/file_utils.py b/src/emd/utils/file_utils.py
new file mode 100644
index 00000000..11599dc1
--- /dev/null
+++ b/src/emd/utils/file_utils.py
@@ -0,0 +1,6 @@
+import os
+
+def mkdir_with_mode(directory,exist_ok=True,mode=0o777):
+    oldmask = os.umask(0)
+    os.makedirs(directory, mode=mode,exist_ok=exist_ok)
+    os.umask(oldmask)
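
For reference, a minimal usage sketch of the new `mkdir_with_mode` helper introduced above; the target directory below is a hypothetical stand-in, since `deploy_local` derives the real path from `LOCAL_DEPLOY_PIPELINE_ZIP_DIR`. The reason the helper exists is that `os.makedirs` filters its `mode` argument through the process umask, so a plain call with `mode=0o777` typically ends up as `0o755`; clearing the umask around the call makes the requested mode stick.

```python
import os
import stat

from emd.utils.file_utils import mkdir_with_mode

# Hypothetical example directory; deploy_local() actually uses
# LOCAL_DEPLOY_PIPELINE_ZIP_DIR (~/emd_{VERSION}) as the parent of pipeline.zip.
target_dir = os.path.join(os.path.expanduser("~"), "emd_example_dir")

# With a common umask of 0o022, os.makedirs(target_dir, mode=0o777) would create
# the directory as 0o755. mkdir_with_mode temporarily clears the umask so the
# requested permissions are applied as-is.
mkdir_with_mode(target_dir, exist_ok=True, mode=0o777)

print(oct(stat.S_IMODE(os.stat(target_dir).st_mode)))  # expected: 0o777
```

Moving the pipeline zip from the hard-coded `/tmp/emd_{VERSION}` path to a world-writable directory under the user's home plausibly avoids permission clashes when different users or containers run local deployments on the same machine, though that reading of the intent is an assumption rather than something stated in the diff.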