From 372b2c513f8f0a96a5b31cd0e5b9719ea2efe378 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 27 Aug 2025 10:35:58 +0000 Subject: [PATCH 01/32] AUTOTEST: add device type for ascend --- autotest/config-ascend.yaml | 58 ++++++ autotest/conftest.py | 23 ++- autotest/tools/pipeline/llm_case.py | 18 +- autotest/tools/pipeline/mllm_case.py | 20 +- .../test_pipeline_chat_pytorch_llm.py | 27 +-- .../test_pipeline_chat_pytorch_mllm.py | 20 +- .../test_pipeline_chat_turbomind_llm.py | 30 +-- .../test_pipeline_chat_turbomind_mllm.py | 28 +-- autotest/utils/config_utils.py | 47 ++++- autotest/utils/get_run_config.py | 173 +++++++++++++++--- autotest/utils/quantization_utils.py | 8 +- autotest/utils/run_client_chat.py | 17 +- autotest/utils/run_restful_chat.py | 14 +- 13 files changed, 384 insertions(+), 99 deletions(-) create mode 100644 autotest/config-ascend.yaml diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml new file mode 100644 index 0000000000..bc70824420 --- /dev/null +++ b/autotest/config-ascend.yaml @@ -0,0 +1,58 @@ +model_path: /mnt/deeplink/group01/deeplink-test/weight +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /test/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: a100 + +tp_config: + Qwen3-0.6B: 2 + + +turbomind_chat_model: + - /Qwen3-0.6B + + +pytorch_chat_model: + - /Qwen3-0.6B + +turbomind_vl_model: + - /Qwen3-0.6B + +pytorch_vl_model: + - /Qwen3-0.6B + + +turbomind_base_model: + - /Qwen3-0.6B + +pytorch_base_model: + - /Qwen3-0.6B + +turbomind_quatization: + no_awq: + - /Qwen3-0.6B + + gptq: + - /Qwen3-0.6B + no_kvint4: + - /Qwen3-0.6B + no_kvint8: + - /Qwen3-0.6B + +pytorch_quatization: + awq: + - /Qwen3-0.6B + w8a8: + - /Qwen3-0.6B + no_kvint4: + - /Qwen3-0.6B + no_kvint8: + - /Qwen3-0.6B + +longtext_model: + - /Qwen3-0.6B + +benchmark_model: + - /Qwen3-0.6B diff --git a/autotest/conftest.py b/autotest/conftest.py index 7d5a34c480..8f29975382 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -10,7 +10,17 @@ @pytest.fixture(scope='session') def config(): - config_path = os.path.join(config_file) + # Use device-specific config file if DEVICE environment variable is set + device = os.environ.get('DEVICE', '') + if device: + device_config_path = f'autotest/config-{device}.yaml' + if os.path.exists(device_config_path): + config_path = device_config_path + else: + config_path = config_file + else: + config_path = config_file + with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) return env_config @@ -34,8 +44,19 @@ def common_case_config(): def pytest_addoption(parser): parser.addoption('--run_id', action='store', default='', help='github run_id') + parser.addoption('--device', action='store', default='', help='device config suffix') + +def pytest_configure(config): + # Set DEVICE environment variable before test execution + device = config.getoption('--device') + if device: + os.environ['DEVICE'] = device @pytest.fixture(scope='session') def run_id(request): return request.config.getoption('--run_id') + +@pytest.fixture(scope='session') +def device(request): + return request.config.getoption('--device') diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 2de77d2bd3..9879300b87 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -9,7 +9,16 @@ 
gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) - +def _is_bf16_supported_by_device(): + """Check if bf16 is supported based on the current device""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + # For Ascend, bf16 support check would be different + # Placeholder implementation + return True + else: + # For CUDA and default, use the existing check + return is_bf16_supported() def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: @@ -17,6 +26,11 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, else: backend_config = TurbomindEngineConfig(tp=tp) + # Add device_type based on DEVICE environment variable + device = os.environ.get('DEVICE', '') + if device: + backend_config.device_type = device + if 'lora' in backend_type: backend_config.adapters = extra.get('adapters') if 'kvint' in backend_type: @@ -31,7 +45,7 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, backend_config.model_format = 'awq' if 'gptq' in model_path.lower(): backend_config.model_format = 'gptq' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): backend_config.dtype = 'float16' print('backend_config config: ' + str(backend_config)) diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 9689581ef9..8932a60fcd 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -1,5 +1,5 @@ import json - +import os import fire import numpy as np from PIL import Image @@ -21,7 +21,16 @@ DESC = 'What are the similarities and differences between these two images.' DESC_ZH = '两张图有什么相同和不同的地方.' - +def _is_bf16_supported_by_device(): + """Check if bf16 is supported based on the current device""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + # For Ascend, bf16 support check would be different + # Placeholder implementation + return True + else: + # For CUDA and default, use the existing check + return is_bf16_supported() def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp, session_len=32576, cache_max_entry_count=0.6) @@ -33,12 +42,17 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + # Add device_type based on DEVICE environment variable + device = os.environ.get('DEVICE', '') + if device: + backend_config.device_type = device + if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): backend_config.dtype = 'float16' print('backend_config config: ' + str(backend_config)) diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index dca119649e..c7abafcff5 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import 
get_cuda_id_by_workerid, get_torch_model_list +from utils.config_utils import set_device_env_variable, get_torch_model_list from utils.pipeline_chat import run_pipeline_chat_test @@ -14,7 +14,7 @@ @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id) @@ -23,10 +23,11 @@ def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id) @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, exclude_dup=True)) def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id) @@ -39,7 +40,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, exclude_dup=True)) def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id) @@ -63,7 +64,7 @@ def test_pipeline_chat_pytorch_tp8(config, common_case_config, model, worker_id) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 4}) @@ -75,7 +76,7 @@ def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 4}) @@ -88,7 +89,7 @@ def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', 
worker_id, {'quant_policy': 4}) @@ -102,7 +103,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 8}) @@ -114,7 +115,7 @@ def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 8}) @@ -127,7 +128,7 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp4(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 8}) @@ -161,7 +162,7 @@ def test_pipeline_chat_pytorch_pr(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id, use_local_model=True) del os.environ['LMDEPLOY_USE_MODELSCOPE'] @@ -175,7 +176,7 @@ def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch_lora', worker_id, {'adapters': { 'adapter0': 'lora/Llama2-Chinese-7b-Chat-LoRA' @@ -190,7 +191,7 @@ def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pipeline_chat_pytorch_with_lora_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch_lora', worker_id, {'adapters': { diff --git 
a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index a65465fe0c..90e9fc61f4 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_cuda_id_by_workerid, get_torch_model_list +from utils.config_utils import set_device_env_variable, get_torch_model_list from utils.pipeline_chat import run_pipeline_vl_chat_test BACKEND = 'pytorch' @@ -16,7 +16,7 @@ @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model')) def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id) @@ -27,7 +27,7 @@ def test_pipeline_chat_tp1(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, model_type='vl_model')) def test_pipeline_chat_tp2(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id) @@ -39,7 +39,7 @@ def test_pipeline_chat_tp2(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, model_type='vl_model')) def test_pipeline_chat_tp4(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id) @@ -52,7 +52,7 @@ def test_pipeline_chat_tp4(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4}) @@ -63,7 +63,7 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp2(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4}) @@ -75,7 +75,7 @@ def test_pipeline_chat_kvint4_tp2(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp4(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4}) @@ -88,7 +88,7 @@ def 
test_pipeline_chat_kvint4_tp4(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) @@ -99,7 +99,7 @@ def test_pipeline_chat_kvint8_tp1(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp2(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) @@ -111,6 +111,6 @@ def test_pipeline_chat_kvint8_tp2(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp4(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 58255fd5bc..1c9d091e56 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list +from utils.config_utils import get_communicator_list, set_device_env_variable, get_turbomind_model_list from utils.pipeline_chat import run_pipeline_chat_test @@ -15,7 +15,7 @@ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) @@ -28,7 +28,7 @@ def test_pipeline_chat_tp1(config, common_case_config, model, communicator, work @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) @@ -42,7 +42,7 @@ def test_pipeline_chat_tp2(config, common_case_config, model, communicator, work @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp4(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + 
set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) @@ -68,7 +68,7 @@ def test_pipeline_chat_tp8(config, common_case_config, model, communicator, work @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 4, 'communicator': communicator @@ -84,7 +84,7 @@ def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 4, @@ -101,7 +101,7 @@ def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 4, @@ -119,7 +119,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 8, 'communicator': communicator @@ -135,7 +135,7 @@ def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 8, @@ -152,7 +152,7 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp4(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = 
str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 8, @@ -186,7 +186,7 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -208,7 +208,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -231,7 +231,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -251,7 +251,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -292,7 +292,7 @@ def test_pipeline_chat_pr(config, common_case_config, model, communicator, worke @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, use_local_model=True) del os.environ['LMDEPLOY_USE_MODELSCOPE'] diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 8e2490413a..c8f1f5c759 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list +from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list, set_device_env_variable from utils.pipeline_chat 
import run_pipeline_vl_chat_test BACKEND = 'turbomind' @@ -17,7 +17,7 @@ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) @@ -29,7 +29,7 @@ def test_pipeline_chat_tp1(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) if ('MiniCPM-V-2_6' in model or 'InternVL2_5-26B' in model or 'InternVL2-26B' in model or 'InternVL3-38B' in model) and communicator == 'native': @@ -45,7 +45,7 @@ def test_pipeline_chat_tp2(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp4(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) @@ -59,7 +59,7 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 4, 'communicator': communicator @@ -74,7 +74,7 @@ def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 4, @@ -90,7 +90,7 @@ def test_pipeline_chat_kvint4_tp2(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 4, @@ -107,7 +107,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, 
BACKEND_KVINT, worker_id, { 'quant_policy': 8, 'communicator': communicator @@ -122,7 +122,7 @@ def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 8, @@ -138,7 +138,7 @@ def test_pipeline_chat_kvint8_tp2(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 8, @@ -157,7 +157,7 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) @@ -173,7 +173,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, @@ -193,7 +193,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) @@ -206,7 +206,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index c53e33bf0f..34ea10acc6 100644 --- a/autotest/utils/config_utils.py +++ 
b/autotest/utils/config_utils.py @@ -76,7 +76,7 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type, quant_policy=quant_policy) - if is_bf16_supported(): + if _is_bf16_supported_by_device(): for case in get_torch_model_list(tp_num=tp_num, quant_policy=quant_policy, model_type=model_type): if case not in case_list: case_list.append(case) @@ -84,7 +84,7 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: def get_communicator_list(): - if is_bf16_supported(): + if _is_bf16_supported_by_device(): return ['native', 'nccl'] return ['nccl'] @@ -142,7 +142,11 @@ def get_cuda_prefix_by_workerid(worker_id, tp_num: int = 1): if cuda_id is None or 'gw' not in worker_id: return None else: - return 'CUDA_VISIBLE_DEVICES=' + cuda_id + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + return 'ASCEND_RT_VISIBLE_DEVICES=' + cuda_id + else: + return 'CUDA_VISIBLE_DEVICES=' + cuda_id def get_cuda_id_by_workerid(worker_id, tp_num: int = 1): @@ -160,7 +164,16 @@ def get_cuda_id_by_workerid(worker_id, tp_num: int = 1): def get_config(): - config_path = os.path.join('autotest/config.yaml') + # Determine config file based on DEVICE environment variable + device = os.environ.get('DEVICE', '') + if device: + config_path = f'autotest/config-{device}.yaml' + # Fallback to default config if device-specific config doesn't exist + if not os.path.exists(config_path): + config_path = 'autotest/config.yaml' + else: + config_path = 'autotest/config.yaml' + with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) return config @@ -223,3 +236,29 @@ def get_workerid(worker_id): def is_quantization_model(name): return 'awq' in name.lower() or '4bits' in name.lower() or 'w4' in name.lower() or 'int4' in name.lower() + + +def _is_bf16_supported_by_device(): + """Check if bf16 is supported based on the current device""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + # For Ascend, bf16 support check would be different + # Placeholder implementation + return True + else: + # For CUDA and default, use the existing check + return is_bf16_supported() + + +def set_device_env_variable(worker_id, tp_num: int = 1): + """Set device environment variable based on the device type""" + device = os.environ.get('DEVICE', 'cuda') # Default to cuda + + if device == 'ascend': + device_id = get_cuda_id_by_workerid(worker_id, tp_num) + if device_id is not None: + os.environ['ASCEND_RT_VISIBLE_DEVICES'] = device_id + else: # Default to cuda + cuda_id = get_cuda_id_by_workerid(worker_id, tp_num) + if cuda_id is not None: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_id diff --git a/autotest/utils/get_run_config.py b/autotest/utils/get_run_config.py index 9674b3ed64..eb1b4c328d 100644 --- a/autotest/utils/get_run_config.py +++ b/autotest/utils/get_run_config.py @@ -1,5 +1,6 @@ import random from time import sleep +import os, subprocess, re import torch @@ -7,18 +8,9 @@ def get_conda_allcate_prefix(config, model): - cuda_prefix = '' - tp_num = get_tp_num(config, model) - if tp_num is None or tp_num == 8: - return cuda_prefix - available_cuda = _get_available_cude() - if len(available_cuda) < tp_num: - raise torch.cuda.OutOfMemoryError - - cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(random.sample(available_cuda, tp_num)) - - torch.cuda.empty_cache() - return cuda_prefix + 
device = os.environ.get('DEVICE', 'cuda') # Default to cuda if not set + handler = _get_device_handler(device) + return handler.get_device_prefix(config, model) def get_tp_config(config, model, need_tp): @@ -60,7 +52,7 @@ def get_command_with_extra(cmd, if extra is not None and len(extra) > 0: cmd = ' '.join([cmd, extra]) - torch.cuda.empty_cache() + _clear_device_cache() return cmd @@ -110,21 +102,6 @@ def get_model_name(model): return model_name.split('-')[0] -def _get_available_cude(): - devices = torch.cuda.device_count() - - available_cuda = [] - for i in range(devices): - if (torch.cuda.utilization(i) > 5): - continue - if ('no processes are running' not in torch.cuda.list_gpu_processes(i)): - continue - - available_cuda.append(str(i)) - - return available_cuda - - def _simple_model_name(model): if '/' in model: model_name = model.split('/')[1] @@ -140,4 +117,142 @@ def close_pipeline(pipe): pipe.close() import gc gc.collect() - torch.cuda.empty_cache() + _clear_device_cache() + + +def _clear_device_cache(): + """Clear cache based on the current device type""" + device = os.environ.get('DEVICE', 'cuda') + handler = _get_device_handler(device) + handler.clear_cache() + + +def _get_device_handler(device): + """Get the appropriate device handler based on device type""" + handlers = { + 'cuda': CudaDeviceHandler(), + 'ascend': AscendDeviceHandler(), + } + + # Return the specific handler if available, otherwise return default cuda handler + return handlers.get(device, handlers['cuda']) + + +class DeviceHandler: + """Base class for device handlers""" + + def get_device_prefix(self, config, model): + """Get device-specific prefix for command execution""" + return '' + + def clear_cache(self): + """Clear device-specific cache""" + pass + + def get_available_devices(self): + """Get list of available devices""" + return [] + + +class CudaDeviceHandler(DeviceHandler): + """Handler for CUDA devices""" + + def get_device_prefix(self, config, model): + cuda_prefix = '' + tp_num = get_tp_num(config, model) + if tp_num is None or tp_num == 8: + return cuda_prefix + available_cuda = self.get_available_devices() + if len(available_cuda) < tp_num: + raise torch.cuda.OutOfMemoryError + + cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(random.sample(available_cuda, tp_num)) + self.clear_cache() + return cuda_prefix + + def clear_cache(self): + torch.cuda.empty_cache() + + def get_available_devices(self): + devices = torch.cuda.device_count() + available_cuda = [] + for i in range(devices): + if (torch.cuda.utilization(i) > 5): + continue + if ('no processes are running' not in torch.cuda.list_gpu_processes(i)): + continue + available_cuda.append(str(i)) + return available_cuda + + +class AscendDeviceHandler(DeviceHandler): + """Handler for Ascend devices""" + + def get_device_prefix(self, config, model): + ascend_prefix = '' + tp_num = get_tp_num(config, model) + if tp_num is None or tp_num == 8: + return ascend_prefix + available_ascend = self.get_available_devices() + if len(available_ascend) < tp_num: + raise RuntimeError("Not enough Ascend devices available") + + ascend_prefix = 'ASCEND_RT_VISIBLE_DEVICES=' + ','.join(random.sample(available_ascend, tp_num)) + self.clear_cache() + return ascend_prefix + + def clear_cache(self): + try: + import torch_npu + torch_npu.npu.empty_cache() + except ImportError: + pass # torch_npu not available + + def get_available_devices(self): + """Get list of available Ascend devices by checking AICPU usage rate""" + available_ascend = [] + try: + # Get the number of 
NPU devices + result = subprocess.run(['npu-smi', 'info', '-l'], + capture_output=True, text=True, timeout=10) + if result.returncode != 0: + return available_ascend + + # Parse the output to get device count + # Looking for lines like "Device Count : X" + device_count = 0 + for line in result.stdout.split('\n'): + if 'Total Count' in line: + match = re.search(r'Total Count\s*:\s*(\d+)', line) + if match: + device_count = int(match.group(1)) + break + + # Check each device's AICPU usage + for i in range(device_count): + try: + result = subprocess.run(['npu-smi', 'info', '-t', 'usages', '-i', str(i)], + capture_output=True, text=True, timeout=10) + if result.returncode != 0: + continue + + # Parse the output to get AICPU Usage Rate + # Looking for lines like "Aicpu Usage Rate(%) : X" + aicpu_usage = 100 # Default to 100% (busy) + for line in result.stdout.split('\n'): + if 'Aicpu Usage Rate(%)' in line: + match = re.search(r'Aicpu Usage Rate\(%\)\s*:\s*(\d+)', line) + if match: + aicpu_usage = int(match.group(1)) + break + + # If AICPU usage is 0, consider the device available + if aicpu_usage == 0: + available_ascend.append(str(i)) + except (subprocess.TimeoutExpired, subprocess.SubprocessError): + continue + + except (subprocess.TimeoutExpired, subprocess.SubprocessError, FileNotFoundError): + # npu-smi command not found or other error + pass + return available_ascend \ No newline at end of file diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 3606e0bbbb..04595829e2 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -3,6 +3,7 @@ from subprocess import PIPE from lmdeploy.utils import is_bf16_supported +from utils.config_utils import _is_bf16_supported_by_device def quantization(config, @@ -30,6 +31,11 @@ def quantization(config, else: return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type + + # Add device option if specified in environment + device = os.environ.get('DEVICE', '') + if device: + quantization_cmd += f' --device npu' if cuda_prefix is not None: quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) @@ -37,7 +43,7 @@ def quantization(config, if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale' - if not is_bf16_supported() or quantization_type == 'gptq': + if not _is_bf16_supported_by_device() or quantization_type == 'gptq': quantization_cmd += ' --batch-size 8' elif str(config.get('env_tag')) == '3090': quantization_cmd += ' --batch-size 8' diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index bba662b0c8..8f345efe9c 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -2,6 +2,7 @@ from subprocess import PIPE, Popen from utils.get_run_config import get_command_with_extra, get_model_name +from utils.config_utils import _is_bf16_supported_by_device from utils.rule_condition_assert import assert_result from lmdeploy.utils import is_bf16_supported @@ -30,6 +31,12 @@ def command_line_test(config, cmd += ' --model-format gptq' if case == 'base_testcase': cmd += ' --chat-template ' + TEMPLATE + + # Add device option if specified in environment + device = os.environ.get('DEVICE', '') + if device: + cmd += f' --device {device}' + return command_test(config, [cmd], model_case, case, case_info, type == 'turbomind', worker_id=worker_id) @@ -57,7 +64,7 @@ def hf_command_line_test(config, cuda_prefix=cuda_prefix) if type == 'pytorch': - if not 
is_bf16_supported(): + if not _is_bf16_supported_by_device(): cmd += ' --dtype float16' if type == 'turbomind': if ('w4' in model_case or ('4bits' in model_case or 'awq' in model_case.lower())): @@ -67,6 +74,12 @@ def hf_command_line_test(config, if case == 'base_testcase': cmd += ' --chat-template ' + TEMPLATE + + # Add device option if specified in environment + device = os.environ.get('DEVICE', '') + if device: + cmd += f' --device {device}' + return command_test(config, [cmd], model_case, '_'.join(['hf', type, case]), case_info, True) @@ -162,4 +175,4 @@ def extract_output(output: str, model: str): if len(output.split('[/INST]')) >= 2: return output.split('[/INST]')[1] - return output + return output \ No newline at end of file diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index b0c0e3b1cd..b484cbcdb2 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -7,8 +7,8 @@ import psutil from openai import OpenAI from pytest_assume.plugin import assume -from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid -from utils.get_run_config import get_command_with_extra +from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid, _is_bf16_supported_by_device +from utils.get_run_config import get_command_with_extra from utils.restful_return_check import assert_chat_completions_batch_return from utils.rule_condition_assert import assert_result @@ -60,6 +60,10 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) need_tp=True, cuda_prefix=cuda_prefix, extra=extra) + + device = os.environ.get('DEVICE', '') + if device: + cmd += f' --device {device}' if backend_type == 'turbomind': if ('w4' in model or '4bits' in model or 'awq' in model.lower()): @@ -68,13 +72,13 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) cmd += ' --model-format gptq' if backend_type == 'pytorch': cmd += ' --backend pytorch' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): cmd += ' --dtype float16' if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): cmd += ' --cache-max-entry-count 0.5' if str(config.get('env_tag')) == '3090': cmd += ' --cache-max-entry-count 0.5' @@ -91,7 +95,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) start_timeout = 300 - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): start_timeout = 600 sleep(5) From 28907ca31ec4427bae796395f7fa2c1b3c2628fc Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 10:50:38 +0800 Subject: [PATCH 02/32] AOTOTEST: fix lint --- .../benchmark/test_throughput_performance.py | 10 +- autotest/conftest.py | 4 +- .../interface/pipeline/test_pipeline_func.py | 180 +++++++++--------- .../pipeline/test_pipeline_longtext_func.py | 12 +- .../chat/test_command_chat_hf_pytorch.py | 8 +- .../chat/test_command_chat_hf_turbomind.py | 40 +--- autotest/tools/pipeline/llm_case.py | 26 ++- autotest/tools/pipeline/mllm_case.py | 27 ++- .../test_pipeline_chat_pytorch_llm.py | 2 +- .../test_pipeline_chat_pytorch_mllm.py | 2 +- .../test_pipeline_chat_turbomind_llm.py | 10 +- .../test_pipeline_chat_turbomind_mllm.py | 28 ++- .../quantization/test_quantization_awq.py | 1 + 
.../quantization/test_quantization_w8a8.py | 1 + .../test_restful_chat_hf_pytorch_llm.py | 1 + autotest/utils/benchmark_utils.py | 8 +- autotest/utils/config_utils.py | 20 +- autotest/utils/get_run_config.py | 66 +++---- autotest/utils/pipeline_chat.py | 6 +- autotest/utils/quantization_utils.py | 7 +- autotest/utils/run_client_chat.py | 22 ++- autotest/utils/run_restful_chat.py | 43 +++-- 22 files changed, 291 insertions(+), 233 deletions(-) diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index 8df4a3b7f5..493f90e0bd 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -1,3 +1,5 @@ +import os + import pytest from utils.benchmark_utils import throughput_test from utils.config_utils import get_benchmark_model_list, get_cuda_id_by_workerid, get_cuda_prefix_by_workerid @@ -92,11 +94,15 @@ def test_throughput_func_tp2(config, run_id, run_config, worker_id): 'tp_num': 1 }]) def test_throughput_prtest_tp1(config, run_id, run_config, worker_id): + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES=' + else: + env_var = 'CUDA_VISIBLE_DEVICES=' result, msg = throughput_test(config, run_id, run_config, - cuda_prefix='CUDA_VISIBLE_DEVICES=' + - str(int(get_cuda_id_by_workerid(worker_id)) + 5), + cuda_prefix=f'{env_var}' + str(int(get_cuda_id_by_workerid(worker_id)) + 5), worker_id=worker_id, is_smoke=True) diff --git a/autotest/conftest.py b/autotest/conftest.py index 8f29975382..dee954d2cb 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -20,7 +20,7 @@ def config(): config_path = config_file else: config_path = config_file - + with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) return env_config @@ -46,6 +46,7 @@ def pytest_addoption(parser): parser.addoption('--run_id', action='store', default='', help='github run_id') parser.addoption('--device', action='store', default='', help='device config suffix') + def pytest_configure(config): # Set DEVICE environment variable before test execution device = config.getoption('--device') @@ -57,6 +58,7 @@ def pytest_configure(config): def run_id(request): return request.config.getoption('--run_id') + @pytest.fixture(scope='session') def device(request): return request.config.getoption('--device') diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 87b87d3bba..8c8b0d45c3 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -1,21 +1,20 @@ -import os from multiprocessing import Process import pydantic import pytest import torch -from utils.config_utils import get_cuda_id_by_workerid +from utils.config_utils import _is_bf16_supported_by_device, set_device_env_variable, unset_device_env_variable +from utils.get_run_config import _clear_device_cache from utils.pipeline_chat import (assert_pipeline_batch_return, assert_pipeline_batch_stream_return, assert_pipeline_common_log, assert_pipeline_single_return, assert_pipeline_single_stream_return, save_pipeline_common_log) from utils.restful_return_check import get_repeat_times from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline -from lmdeploy.utils import is_bf16_supported def init_pipeline(model_path, backend_config): - if not is_bf16_supported() and isinstance(backend_config, PytorchEngineConfig): 
+ if not _is_bf16_supported_by_device() and isinstance(backend_config, PytorchEngineConfig): backend_config.dtype = 'float16' return pipeline(model_path, backend_config=backend_config) @@ -33,18 +32,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -62,18 +61,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_stream_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -89,18 +88,18 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_with_prompt, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -118,18 +117,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_batch_stream_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -146,18 +145,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = 
assert_pipeline_batch_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -175,18 +174,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_stream_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -209,18 +208,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -244,18 +243,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_batch_stream_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -272,18 +271,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response, logprobs_num=10) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, 
tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -302,18 +301,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_stream_return(response, logprobs_num=10) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -329,23 +328,22 @@ def run_pipeline_testcase(config, model, backend, file_name): result = True for i in range(2): - result &= response[i].finish_reason == 'error' - result &= response[i].text == 'internal error happened, status code ResponseType.INPUT_LENGTH_ERROR' + result &= response[i].finish_reason == 'length' result &= response[i].generate_token_len == 0 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -366,18 +364,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result &= response[i].index == i save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_min_new_tokens_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -399,18 +397,18 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): result &= response[i].finish_reason == 'stop' and response[i].generate_token_len < 50 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_stop_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = 
Process(target=run_pipeline_testcase_stop_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -430,18 +428,18 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): result &= '浦' not in response[i].text and ' and' not in response[i].text and ' to ' not in response[i].text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_bad_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_bad_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -468,18 +466,18 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): result = '<|action_start|><|interpreter|>' in response.text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_special_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_special_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -506,18 +504,18 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): result = '<|action_start|><|interpreter|>' not in response.text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_special_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_special_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -536,18 +534,18 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, file_name): result = get_repeat_times(response.text, 'is a name') > 5 or get_repeat_times(response.text, 'Shanghai is') > 5 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_repetition_penalty, args=(config, model, backend, file_name)) p.start() p.join() 
assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -565,18 +563,18 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_repetition_penalty, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -594,18 +592,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -625,18 +623,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response_list[0].text == response_list[1].text and response_list[1].text == response_list[2].text save_pipeline_common_log(config, file_name, result, response_list) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -655,18 +653,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response_list[0].text != response_list[1].text and response_list[1].text != response_list[2].text save_pipeline_common_log(config, file_name, result, response_list) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', 
['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -685,18 +683,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response_list[0].text == response_list[1].text and response_list[1].text == response_list[2].text save_pipeline_common_log(config, file_name, result, response_list) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -713,18 +711,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response[0].text != response[1].text and response[1].text != response[2].text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -745,18 +743,18 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, file_name): result &= response[i].generate_token_len == 6 or response[i].generate_token_len == 5 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_max_new_tokens_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_max_new_tokens, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -777,25 +775,25 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): result &= response[i].generate_token_len == 257 or response[i].generate_token_len == 256 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_ignore_eos_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_ignore_eos, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, 
PytorchEngineConfig]) def test_backend_config_input_validation(config, model, backend, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = init_pipeline(model_path, backend_config=backend_config) @@ -824,16 +822,16 @@ def test_backend_config_input_validation(config, model, backend, worker_id): pipe('Shanghai is', gen_config=gen_config) del pipe - torch.cuda.empty_cache() + _clear_device_cache() if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_validate_turbomind(config, model, backend, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) with pytest.raises(pydantic.ValidationError, match='tp must be a positive integer'): backend_config = backend(tp=0) @@ -864,14 +862,14 @@ def test_backend_config_validate_turbomind(config, model, backend, worker_id): pipeline(model_path, backend_config=backend_config) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @pytest.mark.parametrize('backend', [PytorchEngineConfig]) def test_backend_config_validate_pytorch(config, model, backend, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) with pytest.raises(AssertionError): backend_config = backend(tp=0) @@ -894,7 +892,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): init_pipeline(model_path, backend_config=backend_config) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -902,11 +900,11 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): def test_backend_config_tp(config, model, backend, worker_id): with pytest.raises(AssertionError): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=100) pipe = init_pipeline(model_path, backend_config=backend_config) del pipe - torch.cuda.empty_cache() + _clear_device_cache() if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index 6687eb1d63..90f6a087bf 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from utils.config_utils import get_cuda_id_by_workerid +from utils.config_utils import set_device_env_variable from utils.get_run_config import close_pipeline, get_tp_num from lmdeploy import GenerationConfig, 
PytorchEngineConfig, TurbomindEngineConfig, pipeline @@ -19,7 +19,7 @@ def test_history_issue_tp1(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) stream_infer_basic(config, model, log_name) @@ -28,7 +28,7 @@ def test_history_issue_tp1(config, model, worker_id): def test_history_issue_tp2(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) stream_infer_basic(config, model, log_name) @@ -63,7 +63,7 @@ def stream_infer_basic(config, model, log_name): def test_long_test_passkey_tp1(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) passkey_retrival(config, model, backend, log_name, 1) @@ -74,7 +74,7 @@ def test_long_test_passkey_tp1(config, model, backend, worker_id): def test_long_test_passkey_tp2(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) passkey_retrival(config, model, backend, log_name, 2) @@ -85,7 +85,7 @@ def test_long_test_passkey_tp2(config, model, backend, worker_id): def test_long_test_passkey_tp4(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) passkey_retrival(config, model, backend, log_name, 4, SESSION_LEN_PASSKEY_1M) diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 3c13cb1ebf..4aaddd580c 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -32,6 +32,7 @@ def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2)) def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -280,12 +281,17 @@ def test_hf_pytorch_base_tp2(config, model, cli_case_config, worker_id): @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'mistralai/Mixtral-8x7B-Instruct-v0.1']) def test_hf_pytorch_chat_pr(config, model, cli_case_config): usercase = 'chat_testcase' + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES=' + else: + env_var = 'CUDA_VISIBLE_DEVICES=' result, chat_log, msg = hf_command_line_test(config, usercase, cli_case_config.get(usercase), model, 'pytorch', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6') + cuda_prefix=f'{env_var}5,6') if 
chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 834853f5e9..42ed56d83d 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -315,8 +315,8 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -338,8 +338,8 @@ def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicator, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -413,37 +413,17 @@ def test_hf_turbomind_base_tp2(config, model, communicator, cli_case_config, wor @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config): usercase = 'chat_testcase' - - result, chat_log, msg = hf_command_line_test(config, - usercase, - cli_case_config.get(usercase), - model, - 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6', - extra=f'--communicator {communicator}') - - if chat_log is not None: - allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) - - assert result, msg - - -@pytest.mark.order(10) -@pytest.mark.usefixtures('cli_case_config') -@pytest.mark.hf_turbomind_chat -@pytest.mark.gpu_num_1 -@pytest.mark.pr_test -@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3-8B']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config): - usercase = 'chat_testcase' - + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES=' + else: + env_var = 'CUDA_VISIBLE_DEVICES=' result, chat_log, msg = hf_command_line_test(config, usercase, cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6', + cuda_prefix=f'{env_var}5,6', extra=f'--communicator {communicator}') if chat_log is not None: diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 9879300b87..74a00de128 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -9,8 +9,9 @@ gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) + def _is_bf16_supported_by_device(): - """Check if bf16 is supported based on the current device""" + 
"""Check if bf16 is supported based on the current device.""" device = os.environ.get('DEVICE', 'cuda') if device == 'ascend': # For Ascend, bf16 support check would be different @@ -19,6 +20,22 @@ def _is_bf16_supported_by_device(): else: # For CUDA and default, use the existing check return is_bf16_supported() + + +def _clear_device_cache(): + """Clear cache based on the current device type.""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + try: + import torch_npu + torch_npu.npu.empty_cache() + except ImportError: + pass # torch_npu not available + else: + import torch + torch.cuda.empty_cache() + + def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: @@ -30,6 +47,8 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, device = os.environ.get('DEVICE', '') if device: backend_config.device_type = device + if device == 'ascend': + backend_config.eager_mode = True if 'lora' in backend_type: backend_config.adapters = extra.get('adapters') @@ -74,12 +93,13 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, print(f'[caseresult {case} start]' + json.dumps(response_list, ensure_ascii=False) + f'[caseresult {case} end]\n') - pipe.close() + # TODO fix for ascend + # pipe.close() import gc import torch gc.collect() - torch.cuda.empty_cache() + _clear_device_cache() if __name__ == '__main__': diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 8932a60fcd..e2829466fe 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -1,5 +1,6 @@ import json import os + import fire import numpy as np from PIL import Image @@ -21,8 +22,9 @@ DESC = 'What are the similarities and differences between these two images.' DESC_ZH = '两张图有什么相同和不同的地方.' 
+ def _is_bf16_supported_by_device(): - """Check if bf16 is supported based on the current device""" + """Check if bf16 is supported based on the current device.""" device = os.environ.get('DEVICE', 'cuda') if device == 'ascend': # For Ascend, bf16 support check would be different @@ -31,6 +33,22 @@ def _is_bf16_supported_by_device(): else: # For CUDA and default, use the existing check return is_bf16_supported() + + +def _clear_device_cache(): + """Clear cache based on the current device type.""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + try: + import torch_npu + torch_npu.npu.empty_cache() + except ImportError: + pass # torch_npu not available + else: + import torch + torch.cuda.empty_cache() + + def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp, session_len=32576, cache_max_entry_count=0.6) @@ -46,6 +64,8 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te device = os.environ.get('DEVICE', '') if device: backend_config.device_type = device + if device == 'ascend': + backend_config.eager_mode = True if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') @@ -115,12 +135,13 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'qwen' in model_path.lower(): Qwen_vl_testcase(pipe, resource_path) - pipe.close() + # TODO fix for ascend + # pipe.close() import gc import torch gc.collect() - torch.cuda.empty_cache() + _clear_device_cache() def internvl_vl_testcase(pipe, resource_path, lang='en'): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index c7abafcff5..3738056c56 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import set_device_env_variable, get_torch_model_list +from utils.config_utils import get_torch_model_list, set_device_env_variable from utils.pipeline_chat import run_pipeline_chat_test diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 90e9fc61f4..65948209cd 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import set_device_env_variable, get_torch_model_list +from utils.config_utils import get_torch_model_list, set_device_env_variable from utils.pipeline_chat import run_pipeline_vl_chat_test BACKEND = 'pytorch' diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 31af132b1e..17d4f89505 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_communicator_list, set_device_env_variable, get_turbomind_model_list +from utils.config_utils import get_communicator_list, get_turbomind_model_list, set_device_env_variable from utils.pipeline_chat import run_pipeline_chat_test @@ -226,8 +226,8 @@ def 
test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: @@ -246,8 +246,8 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 2a4a48ac47..2325b4246c 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -1,7 +1,8 @@ import os import pytest -from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list, set_device_env_variable +from utils.config_utils import (get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list, + set_device_env_variable) from utils.pipeline_chat import run_pipeline_vl_chat_test BACKEND = 'turbomind' @@ -150,9 +151,10 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'model', - ['OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2-VL-7B-Instruct-inner-4bits', 'THUDM/glm-4v-9b', + 'THUDM/glm-4v-9b-inner-4bits' +]) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -165,9 +167,10 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'model', - ['OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2-VL-7B-Instruct-inner-4bits', 'THUDM/glm-4v-9b', + 'THUDM/glm-4v-9b-inner-4bits' +]) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -187,7 +190,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 
-@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize('model', ['meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -200,7 +203,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize('model', ['meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -225,6 +228,11 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, ['liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', 'OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL3-8B']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_pr_test(config, model, communicator, worker_id): + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES' + else: + env_var = 'CUDA_VISIBLE_DEVICES' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5) + os.environ[f'{env_var}'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index 7552e6e2aa..afa31d402b 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -8,6 +8,7 @@ @pytest.mark.order(3) @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): diff --git a/autotest/tools/quantization/test_quantization_w8a8.py b/autotest/tools/quantization/test_quantization_w8a8.py index d210acdf1b..9ddc454ae6 100644 --- a/autotest/tools/quantization/test_quantization_w8a8.py +++ b/autotest/tools/quantization/test_quantization_w8a8.py @@ -8,6 +8,7 @@ @pytest.mark.order(2) @pytest.mark.quantization_w8a8 +@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('w8a8')) def test_quantization_w8a8(config, model, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index eaf574c591..57ac524912 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -41,6 +41,7 @@ def test_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) def test_restful_chat_tp2(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index c20a06884b..852c21c047 100644 --- a/autotest/utils/benchmark_utils.py +++ 
b/autotest/utils/benchmark_utils.py @@ -4,11 +4,9 @@ import allure import psutil -from utils.config_utils import get_workerid +from utils.config_utils import _is_bf16_supported_by_device, get_workerid from utils.run_restful_chat import health_check -from lmdeploy.utils import is_bf16_supported - DEFAULT_PORT = 23333 GENERATION_CONFIG = ' -c 8 256 -ct 128 128 2048 128 -pt 1 128 128 2048' GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000' @@ -38,7 +36,7 @@ def generation_test(config, run_config = '' if backend == 'pytorch': command += ' --backend pytorch' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): command += ' --dtype float16' else: if '4bit' in model: @@ -89,7 +87,7 @@ def throughput_test(config, run_id, run_config, cuda_prefix: str = None, worker_ run_config = '--num-prompts 5000' if backend == 'pytorch': command += ' --backend pytorch' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): command += ' --dtype float16' else: if '4bit' in model: diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 34ea10acc6..0df8858b2c 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -173,7 +173,7 @@ def get_config(): config_path = 'autotest/config.yaml' else: config_path = 'autotest/config.yaml' - + with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) return config @@ -239,7 +239,7 @@ def is_quantization_model(name): def _is_bf16_supported_by_device(): - """Check if bf16 is supported based on the current device""" + """Check if bf16 is supported based on the current device.""" device = os.environ.get('DEVICE', 'cuda') if device == 'ascend': # For Ascend, bf16 support check would be different @@ -248,12 +248,12 @@ def _is_bf16_supported_by_device(): else: # For CUDA and default, use the existing check return is_bf16_supported() - + def set_device_env_variable(worker_id, tp_num: int = 1): - """Set device environment variable based on the device type""" + """Set device environment variable based on the device type.""" device = os.environ.get('DEVICE', 'cuda') # Default to cuda - + if device == 'ascend': device_id = get_cuda_id_by_workerid(worker_id, tp_num) if device_id is not None: @@ -262,3 +262,13 @@ def set_device_env_variable(worker_id, tp_num: int = 1): cuda_id = get_cuda_id_by_workerid(worker_id, tp_num) if cuda_id is not None: os.environ['CUDA_VISIBLE_DEVICES'] = cuda_id + + +def unset_device_env_variable(): + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + if 'ASCEND_RT_VISIBLE_DEVICES' in os.environ: + del os.environ['ASCEND_RT_VISIBLE_DEVICES'] + else: + if 'CUDA_VISIBLE_DEVICES' in os.environ: + del os.environ['CUDA_VISIBLE_DEVICES'] diff --git a/autotest/utils/get_run_config.py b/autotest/utils/get_run_config.py index eb1b4c328d..6db2705214 100644 --- a/autotest/utils/get_run_config.py +++ b/autotest/utils/get_run_config.py @@ -1,6 +1,8 @@ +import os import random +import re +import subprocess from time import sleep -import os, subprocess, re import torch @@ -121,42 +123,42 @@ def close_pipeline(pipe): def _clear_device_cache(): - """Clear cache based on the current device type""" + """Clear cache based on the current device type.""" device = os.environ.get('DEVICE', 'cuda') handler = _get_device_handler(device) handler.clear_cache() def _get_device_handler(device): - """Get the appropriate device handler based on device type""" + """Get the appropriate device handler based on device type.""" handlers 
= { 'cuda': CudaDeviceHandler(), 'ascend': AscendDeviceHandler(), } - + # Return the specific handler if available, otherwise return default cuda handler return handlers.get(device, handlers['cuda']) class DeviceHandler: - """Base class for device handlers""" - + """Base class for device handlers.""" + def get_device_prefix(self, config, model): - """Get device-specific prefix for command execution""" + """Get device-specific prefix for command execution.""" return '' - + def clear_cache(self): - """Clear device-specific cache""" + """Clear device-specific cache.""" pass - + def get_available_devices(self): - """Get list of available devices""" + """Get list of available devices.""" return [] class CudaDeviceHandler(DeviceHandler): - """Handler for CUDA devices""" - + """Handler for CUDA devices.""" + def get_device_prefix(self, config, model): cuda_prefix = '' tp_num = get_tp_num(config, model) @@ -169,10 +171,10 @@ def get_device_prefix(self, config, model): cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(random.sample(available_cuda, tp_num)) self.clear_cache() return cuda_prefix - + def clear_cache(self): torch.cuda.empty_cache() - + def get_available_devices(self): devices = torch.cuda.device_count() available_cuda = [] @@ -186,8 +188,8 @@ def get_available_devices(self): class AscendDeviceHandler(DeviceHandler): - """Handler for Ascend devices""" - + """Handler for Ascend devices.""" + def get_device_prefix(self, config, model): ascend_prefix = '' tp_num = get_tp_num(config, model) @@ -195,29 +197,29 @@ def get_device_prefix(self, config, model): return ascend_prefix available_ascend = self.get_available_devices() if len(available_ascend) < tp_num: - raise RuntimeError("Not enough Ascend devices available") + raise RuntimeError('Not enough Ascend devices available') ascend_prefix = 'ASCEND_RT_VISIBLE_DEVICES=' + ','.join(random.sample(available_ascend, tp_num)) self.clear_cache() return ascend_prefix - + def clear_cache(self): try: import torch_npu torch_npu.npu.empty_cache() except ImportError: pass # torch_npu not available - + def get_available_devices(self): - """Get list of available Ascend devices by checking AICPU usage rate""" + """Get list of available Ascend devices by checking AICPU usage + rate.""" available_ascend = [] try: # Get the number of NPU devices - result = subprocess.run(['npu-smi', 'info', '-l'], - capture_output=True, text=True, timeout=10) + result = subprocess.run(['npu-smi', 'info', '-l'], capture_output=True, text=True, timeout=10) if result.returncode != 0: return available_ascend - + # Parse the output to get device count # Looking for lines like "Device Count : X" device_count = 0 @@ -227,15 +229,15 @@ def get_available_devices(self): if match: device_count = int(match.group(1)) break - + # Check each device's AICPU usage for i in range(device_count): try: - result = subprocess.run(['npu-smi', 'info', '-t', 'usages', '-i', str(i)], - capture_output=True, text=True, timeout=10) + result = subprocess.run( + ['npu-smi', 'info', '-t', 'usages', '-i', str(i)], capture_output=True, text=True, timeout=10) if result.returncode != 0: continue - + # Parse the output to get AICPU Usage Rate # Looking for lines like "Aicpu Usage Rate(%) : X" aicpu_usage = 100 # Default to 100% (busy) @@ -245,14 +247,14 @@ def get_available_devices(self): if match: aicpu_usage = int(match.group(1)) break - + # If AICPU usage is 0, consider the device available if aicpu_usage == 0: available_ascend.append(str(i)) except (subprocess.TimeoutExpired, subprocess.SubprocessError): 
continue - + except (subprocess.TimeoutExpired, subprocess.SubprocessError, FileNotFoundError): # npu-smi command not found or other error - pass - return available_ascend \ No newline at end of file + pass + return available_ascend diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index ab87ed0604..6841d36d0b 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -44,13 +44,14 @@ def run_pipeline_chat_test(config, if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') + env = os.environ.copy() with open(pipeline_chat_log, 'w') as f: cmd = f'python3 autotest/tools/pipeline/llm_case.py run_pipeline_chat_test {hf_path} autotest/prompt_case.yaml {tp} {backend_type} {is_smoke} {extra}' # noqa E501 f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8') + response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', env=env) output_text = response.stdout print(output_text) @@ -109,13 +110,14 @@ def run_pipeline_vl_chat_test(config, if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') + env = os.environ.copy() with open(pipeline_chat_log, 'w') as f: cmd = f'python3 autotest/tools/pipeline/mllm_case.py run_pipeline_mllm_test {hf_path} {resource_path} {tp} {backend_type} {is_smoke} {extra}' # noqa E501 f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8') + response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', env=env) output_text = response.stdout print(output_text) diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 04595829e2..02932e7e21 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -2,7 +2,6 @@ import subprocess from subprocess import PIPE -from lmdeploy.utils import is_bf16_supported from utils.config_utils import _is_bf16_supported_by_device @@ -31,11 +30,11 @@ def quantization(config, else: return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type - + # Add device option if specified in environment device = os.environ.get('DEVICE', '') - if device: - quantization_cmd += f' --device npu' + if device == 'ascend': + quantization_cmd += f' --device npu ' if cuda_prefix is not None: quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index 8f345efe9c..889cd3e158 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -1,12 +1,10 @@ import os from subprocess import PIPE, Popen -from utils.get_run_config import get_command_with_extra, get_model_name from utils.config_utils import _is_bf16_supported_by_device +from utils.get_run_config import get_command_with_extra, get_model_name from utils.rule_condition_assert import assert_result -from lmdeploy.utils import is_bf16_supported - TEMPLATE = 'autotest/template.json' @@ -31,12 +29,14 @@ def command_line_test(config, cmd += ' --model-format gptq' if case == 'base_testcase': cmd += ' 
--chat-template ' + TEMPLATE - + # Add device option if specified in environment device = os.environ.get('DEVICE', '') if device: - cmd += f' --device {device}' - + cmd += f' --device {device} ' + if device == 'ascend': + cmd += '--eager-mode ' + return command_test(config, [cmd], model_case, case, case_info, type == 'turbomind', worker_id=worker_id) @@ -74,12 +74,14 @@ def hf_command_line_test(config, if case == 'base_testcase': cmd += ' --chat-template ' + TEMPLATE - + # Add device option if specified in environment device = os.environ.get('DEVICE', '') if device: - cmd += f' --device {device}' - + cmd += f' --device {device} ' + if device == 'ascend': + cmd += '--eager-mode ' + return command_test(config, [cmd], model_case, '_'.join(['hf', type, case]), case_info, True) @@ -175,4 +177,4 @@ def extract_output(output: str, model: str): if len(output.split('[/INST]')) >= 2: return output.split('[/INST]')[1] - return output \ No newline at end of file + return output diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index a7922a0035..a3c3c99b77 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -7,13 +7,12 @@ import psutil from openai import OpenAI from pytest_assume.plugin import assume -from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid, _is_bf16_supported_by_device -from utils.get_run_config import get_command_with_extra +from utils.config_utils import _is_bf16_supported_by_device, get_cuda_prefix_by_workerid, get_workerid +from utils.get_run_config import get_command_with_extra from utils.restful_return_check import assert_chat_completions_batch_return from utils.rule_condition_assert import assert_result from lmdeploy.serve.openai.api_client import APIClient -from lmdeploy.utils import is_bf16_supported BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 @@ -60,10 +59,12 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) need_tp=True, cuda_prefix=cuda_prefix, extra=extra) - + device = os.environ.get('DEVICE', '') if device: - cmd += f' --device {device}' + cmd += f' --device {device} ' + if device == 'ascend': + cmd += '--eager-mode ' if backend_type == 'turbomind': if ('w4' in model or '4bits' in model or 'awq' in model.lower()): @@ -137,7 +138,6 @@ def stop_restful_api(pid, startRes, param): def run_all_step(config, cases_info, worker_id: str = '', port: int = DEFAULT_PORT): http_url = BASE_HTTP_URL + ':' + str(port) - model = get_model(http_url) if model is None: @@ -164,8 +164,8 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): result = True - client = OpenAI(api_key='YOUR_API_KEY', base_url=f'{url}/v1') - model_name = client.models.list().data[0].id + api_client = APIClient(url) + model_name = api_client.available_models[0] messages = [] msg = '' @@ -176,17 +176,18 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): messages.append({'role': 'user', 'content': prompt}) file.writelines('prompt:' + prompt + '\n') - response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, top_p=0.8) + for output in api_client.chat_completions_v1(model=model_name, messages=messages, top_k=1, max_tokens=256): + output_message = output.get('choices')[0].get('message') + messages.append(output_message) - output_content = response.choices[0].message.content - file.writelines('output:' + output_content + '\n') - messages.append({'role': 'assistant', 'content': output_content}) 
+ output_content = output_message.get('content') + file.writelines('output:' + output_content + '\n') - case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) - file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') - if not case_result: - msg += reason - result = result & case_result + case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) + file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') + if not case_result: + msg += reason + result = result & case_result file.close() return result, restful_log, msg @@ -457,9 +458,9 @@ def get_temperature_date(location: str, date: str, unit: str = 'celsius'): """Get temperature at a location and date. Args: - location: The location to get the temperature for, in the format 'City, State, Country'. - date: The date to get the temperature for, in the format 'Year-Month-Day'. - unit: The unit to return the temperature in. Defaults to 'celsius'. (choices: ['celsius', 'fahrenheit']) + location: The location to get the temperature for, in the format "City, State, Country". + date: The date to get the temperature for, in the format "Year-Month-Day". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) Returns: the temperature, the location, the date and the unit in a dict @@ -617,7 +618,7 @@ def run_tools_case(config, port: int = DEFAULT_PORT): }, } }] - messages = [{'role': 'user', 'content': 'What\'s the weather like in Boston today?'}] + messages = [{'role': 'user', 'content': "What's the weather like in Boston today?"}] response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, From fb57adaabcb97ff0ea8a5b28dd83b8ef3507fd32 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 11:14:59 +0800 Subject: [PATCH 03/32] AUTOTEST: add pipeline test timeout --- autotest/tools/pipeline/llm_case.py | 5 ++++- autotest/tools/pipeline/mllm_case.py | 6 ++++-- autotest/utils/pipeline_chat.py | 22 ++++++++++++++++++++-- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 74a00de128..9bfe91a8ef 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -94,7 +94,10 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, f'[caseresult {case} end]\n') # TODO fix for ascend - # pipe.close() + if device == 'ascend': + pass + else: + pipe.close() import gc import torch diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index e2829466fe..e48b443e9c 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -135,8 +135,10 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'qwen' in model_path.lower(): Qwen_vl_testcase(pipe, resource_path) - # TODO fix for ascend - # pipe.close() + if device == 'ascend': + pass + else: + pipe.close() import gc import torch diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 6841d36d0b..a59e84e137 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -51,7 +51,16 @@ def run_pipeline_chat_test(config, f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', 
env=env) + try: + response = subprocess.run([cmd], + shell=True, + capture_output=True, + text=True, + encoding='utf-8', + env=env, + timeout=600) + except subprocess.TimeoutExpired as e: + assert False, f'Test command timed out after 10 minutes: {e.cmd}' output_text = response.stdout print(output_text) @@ -117,7 +126,16 @@ def run_pipeline_vl_chat_test(config, f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', env=env) + try: + response = subprocess.run([cmd], + shell=True, + capture_output=True, + text=True, + encoding='utf-8', + env=env, + timeout=600) + except subprocess.TimeoutExpired as e: + assert False, f'Test command timed out after 10 minutes: {e.cmd}' output_text = response.stdout print(output_text) From 4869f6400ccccc993a5081e268d493b12441c25b Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 11:24:58 +0800 Subject: [PATCH 04/32] AUTOTEST: fix lint flake8 --- autotest/interface/pipeline/test_pipeline_func.py | 1 - autotest/tools/pipeline/llm_case.py | 1 - autotest/tools/pipeline/mllm_case.py | 1 - autotest/utils/quantization_utils.py | 2 +- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 8c8b0d45c3..42f6e95d86 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -2,7 +2,6 @@ import pydantic import pytest -import torch from utils.config_utils import _is_bf16_supported_by_device, set_device_env_variable, unset_device_env_variable from utils.get_run_config import _clear_device_cache from utils.pipeline_chat import (assert_pipeline_batch_return, assert_pipeline_batch_stream_return, diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 9bfe91a8ef..0555dce6f2 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -100,7 +100,6 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, pipe.close() import gc - import torch gc.collect() _clear_device_cache() diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index e48b443e9c..5a649a1cca 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -141,7 +141,6 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te pipe.close() import gc - import torch gc.collect() _clear_device_cache() diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 02932e7e21..78e7d62d1e 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -34,7 +34,7 @@ def quantization(config, # Add device option if specified in environment device = os.environ.get('DEVICE', '') if device == 'ascend': - quantization_cmd += f' --device npu ' + quantization_cmd += ' --device npu ' if cuda_prefix is not None: quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) From d8f85d4981d21fc9e6faf17df285db57ea682b0c Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 16:41:41 +0800 Subject: [PATCH 05/32] Create api_eva.yml --- .github/workflows/api_eva.yml | 187 ++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 .github/workflows/api_eva.yml diff --git 
a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml
new file mode 100644
index 0000000000..9c15c7b825
--- /dev/null
+++ b/.github/workflows/api_eva.yml
@@ -0,0 +1,187 @@
+name: api_evaluate
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set the branch, tag, or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      offline_mode:
+        required: true
+        description: 'Whether to start in offline mode; if true, you should prepare the code and whl package yourself'
+        type: boolean
+        default: false
+      regression_func:
+        required: true
+        description: 'Regression functions to run'
+        type: string
+        default: "['evaluation']"
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
+  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
+  DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL
+
+jobs:
+  linux-build:
+    if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda11.8
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
+
+
+  download_pkgs:
+    needs: linux-build
+    if: ${{!cancelled()}}
+    runs-on: [self-hosted, 140-test]
+    timeout-minutes: 50
+    container:
+      image: openmmlab/lmdeploy:latest
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Copy repository
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r .
${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: [self-hosted, 140-test] + timeout-minutes: 120 # 2hours + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + python3 -m pip install sentence_transformers==2.2.2 --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt + python3 -m pip install -e . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
+ - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir From 82188f7c9a4984a04ec3d7385691dad8303180e2 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 16:38:01 +0800 Subject: [PATCH 06/32] WORKFLOW: add ascend workflow --- .github/workflows/daily_ete_test_ascend.yml | 223 ++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 .github/workflows/daily_ete_test_ascend.yml diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml new file mode 100644 index 0000000000..214fa87390 --- /dev/null +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -0,0 +1,223 @@ +name: daily_ete_test + +on: + push: + branches: + - hw_runner + +env: + REPORT_DIR: /test/test-reports/${{ github.run_id }} + COV_PARAM: --cov /usr/local/python3.10.5/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /test/lmdeploy_hw + LOG_PATH: /test/log + OFFLINE_REQUIREMENTS: /test/lmdeploy_hw/requirements_ascend.txt + # Default values for former workflow_dispatch inputs + BACKEND: "['turbomind', 'pytorch']" + MODEL: "['llm','mllm']" + FUNCTION: '["pipeline", "restful", "chat"]' + OFFLINE_MODE: false + REGRESSION_FUNC: "['quant', 'pipeline', 'restful', 'chat']" + TMPDIR: /mnt/deeplink/docker-tmp + RAY_TMPDIR: /mnt/deeplink/docker-tmp + + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 150 + container: + image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest + options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + volumes: + - /usr/local/Ascend/driver:/usr/local/Ascend/driver + - 
/usr/local/sbin:/usr/local/sbin + - /var/log/npu/slog:/var/log/npu/slog + - /var/log/npu/profiling:/var/log/npu/profiling + - /var/log/npu/dump:/var/log/npu/dump + - /var/log/npu:/usr/slog + - /etc/hccn.conf:/etc/hccn.conf + - /root/qa_test:/test + - /mnt:/mnt + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + python3 -m pip install transformers==4.53.1 + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf ${{env.LOG_PATH}}/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: contains(fromJSON(env.BACKEND), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: contains(fromJSON(env.BACKEND), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + # - name: Clear workfile + # if: always() + # run: | + # chmod -R 777 $REPORT_DIR + # export workdir=$(pwd) + # cd .. + # rm -rf $workdir + # mkdir $workdir + # chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools')}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(env.BACKEND) }} + model: ${{ fromJSON(env.MODEL) }} + function: ${{ fromJSON(env.FUNCTION) }} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: local_case + container: + image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest + options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + volumes: + - /usr/local/Ascend/driver:/usr/local/Ascend/driver + - /usr/local/sbin:/usr/local/sbin + - /var/log/npu/slog:/var/log/npu/slog + - /var/log/npu/profiling:/var/log/npu/profiling + - /var/log/npu/dump:/var/log/npu/dump + - /var/log/npu:/usr/slog + - /etc/hccn.conf:/etc/hccn.conf + - /root/qa_test:/test + - /mnt:/mnt + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf ${{env.LOG_PATH}}/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py 
-m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + # - name: Clear workfile + # if: always() + # run: | + # chmod -R 777 $REPORT_DIR + # export workdir=$(pwd) + # cd .. + # rm -rf $workdir + # mkdir $workdir + # chmod -R 777 $workdir + + # get_coverage_report: + # if: ${{!cancelled()}} + # runs-on: [self-hosted, linux-a100] + # needs: [test_tools, test_restful, test_pipeline, test_benchmark] + # timeout-minutes: 5 + # container: + # image: openmmlab/lmdeploy:latest-cu11 + # options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + # volumes: + # - /nvme/github-actions/pip-cache:/root/.cache/pip + # - /nvme/github-actions/packages:/root/packages + # - /nvme/qa_test_models:/nvme/qa_test_models + # - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + # steps: + # - name: Copy repository and Artifacts + # run: cp -r ${{env.TEST_CODE_PATH}}/. . + # - name: Install lmdeploy + # run: | + # python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + # python3 -m pip install -r requirements/test.txt + # - name: Get coverage report + # run: | + # pip install coverage + # coverage combine ${{env.REPORT_DIR}} + # coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + # coverage report -m + # mv .coverage ${{env.REPORT_DIR}}/.coverage + # - name: Clear workfile + # if: always() + # run: | + # chmod -R 777 $REPORT_DIR + # export workdir=$(pwd) + # cd .. 
+ # rm -rf $workdir + # mkdir $workdir + # chmod -R 777 $workdir From ddaa36c35323a971e0707492693542a3855ea8f9 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:01:21 +0800 Subject: [PATCH 07/32] WORKFLOW: update ascend runner --- .github/workflows/daily_ete_test_ascend.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 214fa87390..f17b0bb225 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -21,11 +21,10 @@ env: TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp - +jobs: test_quantization: - needs: download_pkgs if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} - runs-on: [self-hosted, linux-a100] + runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest @@ -85,7 +84,7 @@ env: test_tools: if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools')}} - runs-on: [self-hosted, linux-a100] + runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 strategy: @@ -104,7 +103,6 @@ env: include: - backend: turbomind model: llm - function: local_case container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -187,7 +185,7 @@ env: # get_coverage_report: # if: ${{!cancelled()}} - # runs-on: [self-hosted, linux-a100] + # runs-on: [self-hosted, ascend-013] # needs: [test_tools, test_restful, test_pipeline, test_benchmark] # timeout-minutes: 5 # container: From 2cda270d8496c0964291a8ed08825b68abc35f03 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:10:46 +0800 Subject: [PATCH 08/32] fix yaml --- .github/workflows/daily_ete_test_ascend.yml | 152 +++++++------------- 1 file changed, 49 insertions(+), 103 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index f17b0bb225..f0a51690eb 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -13,17 +13,17 @@ env: LOG_PATH: /test/log OFFLINE_REQUIREMENTS: /test/lmdeploy_hw/requirements_ascend.txt # Default values for former workflow_dispatch inputs - BACKEND: "['turbomind', 'pytorch']" - MODEL: "['llm','mllm']" + BACKEND: '["turbomind", "pytorch"]' + MODEL: '["llm","mllm"]' FUNCTION: '["pipeline", "restful", "chat"]' OFFLINE_MODE: false - REGRESSION_FUNC: "['quant', 'pipeline', 'restful', 'chat']" + REGRESSION_FUNC: '["quant", "pipeline", "restful", "chat"]' TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp jobs: test_quantization: - if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} + if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: @@ -42,10 +42,10 @@ jobs: steps: - name: Copy repository and Artifacts run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
+ cp -r ${{ env.TEST_CODE_PATH }}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps @@ -57,33 +57,24 @@ jobs: lmdeploy check_env rm -rf allure-results # remove tmp log in testcase - rm -rf ${{env.LOG_PATH}}/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + rm -rf ${{ env.LOG_PATH }}/* + mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p + ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true - if: contains(fromJSON(env.BACKEND), 'turbomind') + if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true - if: contains(fromJSON(env.BACKEND), 'pytorch') + if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - # - name: Clear workfile - # if: always() - # run: | - # chmod -R 777 $REPORT_DIR - # export workdir=$(pwd) - # cd .. - # rm -rf $workdir - # mkdir $workdir - # chmod -R 777 $workdir + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') test_tools: - if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools')}} + if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 @@ -103,6 +94,7 @@ jobs: include: - backend: turbomind model: llm + function: chat container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -119,10 +111,10 @@ jobs: steps: - name: Copy repository and Artifacts run: | - cp -r ${{env.TEST_CODE_PATH}}/. . + cp -r ${{ env.TEST_CODE_PATH }}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps @@ -134,88 +126,42 @@ jobs: cp -r /root/lora . 
rm -rf allure-results # remove tmp log in testcase - rm -rf ${{env.LOG_PATH}}/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + rm -rf ${{ env.LOG_PATH }}/* + mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p + ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: matrix.function == 'pipeline' + if: ${{ matrix.function == 'pipeline' }} run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest 
autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: matrix.function == 'restful' + if: ${{ matrix.function == 'restful' }} run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - # - name: Clear workfile - # if: always() - # run: | - # chmod -R 777 $REPORT_DIR - # export workdir=$(pwd) - # cd .. - # rm -rf $workdir - # mkdir $workdir - # chmod -R 777 $workdir - - # get_coverage_report: - # if: ${{!cancelled()}} - # runs-on: [self-hosted, ascend-013] - # needs: [test_tools, test_restful, test_pipeline, test_benchmark] - # timeout-minutes: 5 - # container: - # image: openmmlab/lmdeploy:latest-cu11 - # options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - # volumes: - # - /nvme/github-actions/pip-cache:/root/.cache/pip - # - /nvme/github-actions/packages:/root/packages - # - /nvme/qa_test_models:/nvme/qa_test_models - # - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - # steps: - # - name: Copy repository and Artifacts - # run: cp -r ${{env.TEST_CODE_PATH}}/. . 
- # - name: Install lmdeploy - # run: | - # python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - # python3 -m pip install -r requirements/test.txt - # - name: Get coverage report - # run: | - # pip install coverage - # coverage combine ${{env.REPORT_DIR}} - # coverage xml -o ${{env.REPORT_DIR}}/coverage.xml - # coverage report -m - # mv .coverage ${{env.REPORT_DIR}}/.coverage - # - name: Clear workfile - # if: always() - # run: | - # chmod -R 777 $REPORT_DIR - # export workdir=$(pwd) - # cd .. - # rm -rf $workdir - # mkdir $workdir - # chmod -R 777 $workdir + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') \ No newline at end of file From 755cee605c0d2d3d36e4916b209f5daaaeb057b5 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:31:35 +0800 Subject: [PATCH 09/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index f0a51690eb..248d008130 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -62,39 +62,39 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true - if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} + # if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} run: | pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true - if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} + # if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} run: | pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') test_tools: - if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} + # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 strategy: fail-fast: false - matrix: - backend: ${{ fromJSON(env.BACKEND) }} - model: ${{ fromJSON(env.MODEL) }} - function: ${{ fromJSON(env.FUNCTION) }} - exclude: - - 
backend: turbomind - model: mllm - function: chat - - backend: pytorch - model: mllm - function: chat - include: - - backend: turbomind - model: llm - function: chat + # matrix: + # backend: ${{ fromJSON(env.BACKEND) }} + # model: ${{ fromJSON(env.MODEL) }} + # function: ${{ fromJSON(env.FUNCTION) }} + # exclude: + # - backend: turbomind + # model: mllm + # function: chat + # - backend: pytorch + # model: mllm + # function: chat + # include: + # - backend: turbomind + # model: llm + # function: chat container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -131,7 +131,7 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true - if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} + # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -143,7 +143,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: ${{ matrix.function == 'pipeline' }} + # if: ${{ matrix.function == 'pipeline' }} run: | pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -155,7 +155,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: ${{ matrix.function == 'restful' }} + # if: ${{ matrix.function == 'restful' }} run: | pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true From 28f4df644d9e07328c6b3ec4c42c8a5053ecbbaf Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:32:36 +0800 Subject: [PATCH 10/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 248d008130..0321c935bc 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -23,7 +23,7 @@ env: jobs: test_quantization: - if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} + # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: From 0ef80d682009c8455b29c89b5b91834f56424aaa Mon Sep 17 00:00:00 2001 From: littlegy 
<787321726@qq.com> Date: Thu, 4 Sep 2025 17:42:19 +0800 Subject: [PATCH 11/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 0321c935bc..1ced07212d 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -23,6 +23,7 @@ env: jobs: test_quantization: + if: ${{ !cancelled() }} # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 @@ -62,18 +63,21 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} run: | pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} run: | pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') test_tools: + if: ${{ !cancelled() }} # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} runs-on: [self-hosted, ascend-013] needs: test_quantization @@ -131,6 +135,7 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true @@ -143,6 +148,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ matrix.function == 'pipeline' }} run: | pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true @@ -155,6 +161,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ matrix.function == 'restful' }} run: | pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true From c6c618312a68b0e5df11d775a1f7b9bd896613a8 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 18:52:17 +0800 Subject: [PATCH 12/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 1ced07212d..5be7e0baf2 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -29,15 +29,28 @@ jobs: timeout-minutes: 150 container: image: 
crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + options: >- + --net=host + --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 + --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 + --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc + -e PIP_CACHE_DIR=/root/.cache/pip + --shm-size=150g + --pull missing + --memory=256g + --cpus=48 + --security-opt=no-new-privileges:true + --health-cmd="npu-smi info" + --health-interval=30s + --restart=on-failure:5 volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver - - /usr/local/sbin:/usr/local/sbin + - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro + - /usr/local/sbin:/usr/local/sbin:ro - /var/log/npu/slog:/var/log/npu/slog - /var/log/npu/profiling:/var/log/npu/profiling - /var/log/npu/dump:/var/log/npu/dump - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf + - /etc/hccn.conf:/etc/hccn.conf:ro - /root/qa_test:/test - /mnt:/mnt steps: From 39b6dc3d997a9b47b909bbdf7853ee2d72c075b6 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 19:14:46 +0800 Subject: [PATCH 13/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 5be7e0baf2..978dccb503 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -29,20 +29,7 @@ jobs: timeout-minutes: 150 container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: >- - --net=host - --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 - --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 - --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc - -e PIP_CACHE_DIR=/root/.cache/pip - --shm-size=150g - --pull missing - --memory=256g - --cpus=48 - --security-opt=no-new-privileges:true - --health-cmd="npu-smi info" - --health-interval=30s - --restart=on-failure:5 + options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" volumes: - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - /usr/local/sbin:/usr/local/sbin:ro From 88c84614eec350d614a63ca82e8d4a32150d531d Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 19:16:50 +0800 Subject: [PATCH 14/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 978dccb503..c079364511 100644 --- 
a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -29,7 +29,7 @@ jobs: timeout-minutes: 150 container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" volumes: - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - /usr/local/sbin:/usr/local/sbin:ro @@ -101,7 +101,7 @@ jobs: # function: chat container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" volumes: - /usr/local/Ascend/driver:/usr/local/Ascend/driver - /usr/local/sbin:/usr/local/sbin From 82cec68c02207e33c37eff0df0993b15df35a361 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 5 Sep 2025 10:23:02 +0800 Subject: [PATCH 15/32] update ascend --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index c079364511..dc262bdfbb 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -49,7 +49,7 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 - name: Check env @@ -121,7 +121,7 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | From 49b3d59670dc4de773f7715b78f03d0719ed9aee Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 5 Sep 2025 10:35:08 +0800 Subject: [PATCH 16/32] update ascend --- .github/workflows/daily_ete_test_ascend.yml | 2 -- 1 file changed, 2 
deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index dc262bdfbb..b5e9886725 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -49,7 +49,6 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 - name: Check env @@ -121,7 +120,6 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | From 4b591d4d73b04788a3d724bb98afd1ec44f88e7e Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 5 Sep 2025 10:56:18 +0800 Subject: [PATCH 17/32] update ascend --- .github/workflows/daily_ete_test_ascend.yml | 24 ++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index b5e9886725..cab1cb4e3b 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -136,37 +136,37 @@ jobs: if: ${{ !cancelled() }} # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true if: ${{ !cancelled() }} # if: ${{ matrix.function == 'pipeline' }} run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' 
-n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true if: ${{ !cancelled() }} # if: ${{ matrix.function == 'restful' }} run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest 
autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') \ No newline at end of file From ee49e02cfe87bb8f8472d26dd3301d242091b4fc Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 14:34:06 +0800 Subject: [PATCH 18/32] AUTOTEST: update hw yml --- .github/workflows/daily_ete_test_ascend.yml | 161 ++++++++++++++------ 1 file changed, 113 insertions(+), 48 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index cab1cb4e3b..b977c740ca 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -1,30 +1,83 @@ -name: daily_ete_test +name: daily_ete_test_ascend on: - push: - branches: - - hw_runner + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, vllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" env: REPORT_DIR: /test/test-reports/${{ github.run_id }} COV_PARAM: --cov /usr/local/python3.10.5/lib/python3.10/site-packages/lmdeploy FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /test/lmdeploy_hw + TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} LOG_PATH: /test/log - OFFLINE_REQUIREMENTS: /test/lmdeploy_hw/requirements_ascend.txt - # Default values for former workflow_dispatch inputs - BACKEND: '["turbomind", "pytorch"]' - MODEL: '["llm","mllm"]' - FUNCTION: '["pipeline", "restful", "chat"]' - OFFLINE_MODE: false - REGRESSION_FUNC: '["quant", "pipeline", "restful", "chat"]' TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp jobs: + download_pkgs: + if: ${{!cancelled()}} + runs-on: [self-hosted, ascend-013] + timeout-minutes: 50 + container: + image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest + options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + volumes: + - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro + - 
/usr/local/sbin:/usr/local/sbin:ro + - /var/log/npu/slog:/var/log/npu/slog + - /var/log/npu/profiling:/var/log/npu/profiling + - /var/log/npu/dump:/var/log/npu/dump + - /var/log/npu:/usr/slog + - /etc/hccn.conf:/etc/hccn.conf:ro + - /root/qa_test:/test + - /mnt:/mnt + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{ !cancelled() }} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{ !cancelled() }} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} test_quantization: if: ${{ !cancelled() }} - # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: @@ -43,11 +96,11 @@ jobs: steps: - name: Copy repository and Artifacts run: | - cp -r ${{ env.TEST_CODE_PATH }}/. . + cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - - name: Install lmdeploy + python3 -m pip install -r requirements_ascend.txt + - name: Install lmdeploy - offline run: | python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 @@ -62,42 +115,48 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} + if: contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} + if: contains(fromJSON(github.event.inputs.backend), 'pytorch') run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir test_tools: - if: ${{ !cancelled() }} - # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} + if: ${{!cancelled() && contains(fromJSON(github.event.inputs.regression_func), 'tools')}} runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 strategy: fail-fast: false - # matrix: - # backend: ${{ fromJSON(env.BACKEND) }} - # model: ${{ fromJSON(env.MODEL) }} - # function: ${{ fromJSON(env.FUNCTION) }} - # exclude: - # - backend: turbomind - # model: mllm - # function: chat - # - backend: pytorch - # model: mllm - # function: chat - # include: - # - backend: turbomind - # model: llm - # function: chat + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: local_case container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -133,8 +192,7 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} + if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -146,8 +204,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ matrix.function == 'pipeline' }} + if: ${{ matrix.function == 'pipeline' }} run: | pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -159,8 +216,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ matrix.function == 'restful' }} + if: ${{ matrix.function == 'restful' }} run: | pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -169,4 +225,13 @@ jobs: pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend 
--alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') \ No newline at end of file + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir From 85bac1ec85aac0577f2c2ceeaf5973ce5f2797d9 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 14:39:36 +0800 Subject: [PATCH 19/32] AUTOTEST: fix hw yml --- .github/workflows/daily_ete_test_ascend.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index b977c740ca..5a7a637acd 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -28,16 +28,11 @@ on: description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false regression_func: required: true description: 'regression functions' type: string - default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" + default: "['tools']" env: REPORT_DIR: /test/test-reports/${{ github.run_id }} @@ -77,6 +72,7 @@ jobs: if: ${{ !cancelled() }} run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . 
${{env.TEST_CODE_PATH}} test_quantization: + needs: download_pkgs if: ${{ !cancelled() }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 From 06568e2fb31ac6a76300bec38dd7fddbcf898afb Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 14:43:58 +0800 Subject: [PATCH 20/32] AUTOTEST: add ascend device --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 5a7a637acd..9e28735df6 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -113,13 +113,13 @@ jobs: continue-on-error: true if: contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true if: contains(fromJSON(github.event.inputs.backend), 'pytorch') run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() From c1ae0b1fbacd3c8661f611aead5598cf4de13bde Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 15:27:18 +0800 Subject: [PATCH 21/32] CI: fix yml --- .github/workflows/daily_ete_test_ascend.yml | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 9e28735df6..26f6a01551 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -28,6 +28,11 @@ on: description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false regression_func: required: true description: 'regression functions' @@ -93,13 +98,14 @@ jobs: - name: Copy repository and Artifacts run: | cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r requirements_ascend.txt - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} run: | - python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 + python3 -m pip install -r requirements_ascend.txt + - name: Install lmdeploy - test + run: | + python3 -m pip install -r requirements/test.txt - name: Check env run: | python3 -m pip list @@ -170,17 +176,18 @@ jobs: - name: Copy repository and Artifacts run: | cp -r ${{ env.TEST_CODE_PATH }}/. . 
- - name: Install lmdeploy - dependency + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} run: | - python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - - name: Install lmdeploy + python3 -m pip install transformers==4.53.1 + python3 -m pip install -r requirements_ascend.txt + - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt + python3 -m pip install -r requirements/test.txt - name: Check env run: | python3 -m pip list lmdeploy check_env - cp -r /root/lora . rm -rf allure-results # remove tmp log in testcase rm -rf ${{ env.LOG_PATH }}/* From 55b2f76ca52f11d26ef43d27ba6a33b6cb6b80e1 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 15:59:42 +0800 Subject: [PATCH 22/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 26f6a01551..534c1e6ec5 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -66,6 +66,7 @@ jobs: - /etc/hccn.conf:/etc/hccn.conf:ro - /root/qa_test:/test - /mnt:/mnt + - /root/.cache/pip:/root/.cache/pip steps: - name: Clone repository uses: actions/checkout@v2 @@ -94,6 +95,7 @@ jobs: - /etc/hccn.conf:/etc/hccn.conf:ro - /root/qa_test:/test - /mnt:/mnt + - /root/.cache/pip:/root/.cache/pip steps: - name: Copy repository and Artifacts run: | @@ -172,6 +174,7 @@ jobs: - /etc/hccn.conf:/etc/hccn.conf - /root/qa_test:/test - /mnt:/mnt + - /root/.cache/pip:/root/.cache/pip steps: - name: Copy repository and Artifacts run: | From b012c5a375f8084c327e1148867291d372e2888b Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 16:06:13 +0800 Subject: [PATCH 23/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 534c1e6ec5..ebc734dcad 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -107,7 +107,7 @@ jobs: python3 -m pip install -r requirements_ascend.txt - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt + python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Check env run: | python3 -m pip list @@ -186,7 +186,7 @@ jobs: python3 -m pip install -r requirements_ascend.txt - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt + python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Check env run: | python3 -m pip list From 43e66662819d6424f7715a8676bdf6a498e7b321 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 16:20:08 +0800 Subject: [PATCH 24/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index ebc734dcad..4f060abcc0 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -28,11 +28,11 @@ on: description: 'Set testcase function filter: chat, restful, pipeline. 
Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false + # offline_mode: + # required: true + # description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + # type: boolean + # default: false regression_func: required: true description: 'regression functions' @@ -101,10 +101,9 @@ jobs: run: | cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} + # if: ${{inputs.offline_mode}} run: | - python3 -m pip install transformers==4.53.1 - python3 -m pip install -r requirements_ascend.txt + python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Install lmdeploy - test run: | python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ @@ -180,10 +179,9 @@ jobs: run: | cp -r ${{ env.TEST_CODE_PATH }}/. . - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} + # if: ${{inputs.offline_mode}} run: | - python3 -m pip install transformers==4.53.1 - python3 -m pip install -r requirements_ascend.txt + python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Install lmdeploy - test run: | python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ From b3dcf405c7e49e4344ce584359557db519177dfa Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 16:35:31 +0800 Subject: [PATCH 25/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 4f060abcc0..181221d3ec 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -47,6 +47,7 @@ env: LOG_PATH: /test/log TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp + HF_ENDPOINT: https://hf-mirror.com jobs: download_pkgs: From dd8bac76c1bde5117d8d29a6d2befe84e402b932 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 16:48:43 +0800 Subject: [PATCH 26/32] TEST: update ascend test --- .github/workflows/daily_ete_test_ascend.yml | 109 +++--------------- autotest/config-ascend.yaml | 6 +- .../chat/test_command_chat_hf_pytorch.py | 3 + autotest/tools/pipeline/llm_case.py | 6 +- .../test_pipeline_chat_pytorch_llm.py | 3 + .../test_pipeline_chat_pytorch_mllm.py | 3 + .../quantization/test_quantization_awq.py | 1 - .../quantization/test_quantization_w8a8.py | 1 - .../test_restful_chat_hf_pytorch_llm.py | 3 + .../test_restful_chat_hf_pytorch_mllm.py | 3 + 10 files changed, 38 insertions(+), 100 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 181221d3ec..24be9c9452 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -17,22 +17,17 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string - default: "['turbomind', 'pytorch']" + default: "['pytorch']" model: required: true description: 'Set testcase module filter: llm, vllm. 
Default contains all models' type: string - default: "['llm','mllm']" + default: "['llm']" function: required: true description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' - # offline_mode: - # required: true - # description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - # type: boolean - # default: false regression_func: required: true description: 'regression functions' @@ -41,13 +36,12 @@ on: env: REPORT_DIR: /test/test-reports/${{ github.run_id }} - COV_PARAM: --cov /usr/local/python3.10.5/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /usr/local/python3.10.17/lib/python3.10/site-packages/lmdeploy FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} - LOG_PATH: /test/log + LOG_PATH: /test/log/${{ github.run_id }} TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp - HF_ENDPOINT: https://hf-mirror.com jobs: download_pkgs: @@ -78,71 +72,11 @@ jobs: - name: Copy repository if: ${{ !cancelled() }} run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - test_quantization: - needs: download_pkgs - if: ${{ !cancelled() }} - runs-on: [self-hosted, ascend-013] - timeout-minutes: 150 - container: - image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" - volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - - /usr/local/sbin:/usr/local/sbin:ro - - /var/log/npu/slog:/var/log/npu/slog - - /var/log/npu/profiling:/var/log/npu/profiling - - /var/log/npu/dump:/var/log/npu/dump - - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf:ro - - /root/qa_test:/test - - /mnt:/mnt - - /root/.cache/pip:/root/.cache/pip - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
- - name: Install lmdeploy - offline - # if: ${{inputs.offline_mode}} - run: | - python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Install lmdeploy - test - run: | - python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf ${{ env.LOG_PATH }}/* - mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p - ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - - name: Test lmdeploy - quantization w4a16 - continue-on-error: true - if: contains(fromJSON(github.event.inputs.backend), 'turbomind') - run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - quantization w8a8 - continue-on-error: true - if: contains(fromJSON(github.event.inputs.backend), 'pytorch') - run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir test_tools: if: ${{!cancelled() && contains(fromJSON(github.event.inputs.regression_func), 'tools')}} runs-on: [self-hosted, ascend-013] - needs: test_quantization + needs: download_pkgs timeout-minutes: 300 strategy: fail-fast: false @@ -157,10 +91,6 @@ jobs: - backend: pytorch model: mllm function: chat - include: - - backend: turbomind - model: llm - function: local_case container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -180,12 +110,11 @@ jobs: run: | cp -r ${{ env.TEST_CODE_PATH }}/. . 
- name: Install lmdeploy - offline - # if: ${{inputs.offline_mode}} run: | - python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ + python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ + python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Check env run: | python3 -m pip list @@ -199,37 +128,37 @@ jobs: continue-on-error: true if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true if: ${{ matrix.function == 'pipeline' }} run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv 
.coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true if: ${{ matrix.function == 'restful' }} run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml index bc70824420..1b0588387f 100644 --- a/autotest/config-ascend.yaml +++ b/autotest/config-ascend.yaml @@ -7,7 +7,7 @@ dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split env_tag: a100 tp_config: - Qwen3-0.6B: 2 + Qwen2.5-32B-Instruct: 2 turbomind_chat_model: @@ -43,9 +43,9 @@ 
turbomind_quatization: pytorch_quatization: awq: - - /Qwen3-0.6B + - meta-llama/Meta-Llama-3-8B-Instruct w8a8: - - /Qwen3-0.6B + - meta-llama/Meta-Llama-3-8B-Instruct no_kvint4: - /Qwen3-0.6B no_kvint8: diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 4aaddd580c..5dbcb6256a 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -11,6 +11,7 @@ @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1)) def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -52,6 +53,7 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4)) def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -71,6 +73,7 @@ def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_8 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=8)) def test_hf_pytorch_chat_tp8(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 0555dce6f2..14285f3c91 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -93,11 +93,7 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, print(f'[caseresult {case} start]' + json.dumps(response_list, ensure_ascii=False) + f'[caseresult {case} end]\n') - # TODO fix for ascend - if device == 'ascend': - pass - else: - pipe.close() + pipe.close() import gc gc.collect() diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index 3738056c56..b9a6939675 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -10,6 +10,7 @@ @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -36,6 +37,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, exclude_dup=True)) def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id): @@ -49,6 +51,7 @@ def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_8 +@pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=8, exclude_dup=True)) def test_pipeline_chat_pytorch_tp8(config, common_case_config, model, worker_id): diff --git 
a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 65948209cd..2902deeb65 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -13,6 +13,7 @@ @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model')) def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: @@ -24,6 +25,7 @@ def test_pipeline_chat_tp1(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, model_type='vl_model')) def test_pipeline_chat_tp2(config, model, worker_id): if 'gw' in worker_id: @@ -36,6 +38,7 @@ def test_pipeline_chat_tp2(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, model_type='vl_model')) def test_pipeline_chat_tp4(config, model, worker_id): if 'gw' in worker_id: diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index afa31d402b..7552e6e2aa 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -8,7 +8,6 @@ @pytest.mark.order(3) @pytest.mark.test_3090 -@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): diff --git a/autotest/tools/quantization/test_quantization_w8a8.py b/autotest/tools/quantization/test_quantization_w8a8.py index 9ddc454ae6..d210acdf1b 100644 --- a/autotest/tools/quantization/test_quantization_w8a8.py +++ b/autotest/tools/quantization/test_quantization_w8a8.py @@ -8,7 +8,6 @@ @pytest.mark.order(2) @pytest.mark.quantization_w8a8 -@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('w8a8')) def test_quantization_w8a8(config, model, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index 57ac524912..6c48007565 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -29,6 +29,7 @@ def getModelList(tp_num): @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -54,6 +55,7 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) def test_restful_chat_tp4(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -66,6 +68,7 @@ def test_restful_chat_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_8 +@pytest.mark.test_ascend 
@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) def test_restful_chat_tp8(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py index 82d7a7bf7a..63c700d7aa 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -29,6 +29,7 @@ def getModelList(tp_num): @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -40,6 +41,7 @@ def test_restful_chat_tp1(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) def test_restful_chat_tp2(config, worker_id): if get_workerid(worker_id) is None: @@ -51,6 +53,7 @@ def test_restful_chat_tp2(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) def test_restful_chat_tp4(config, worker_id): if get_workerid(worker_id) is None: From 9bda18536cff3e50ee44e4bab1849180a3990faa Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 16:49:52 +0800 Subject: [PATCH 27/32] TEST: rm api eval --- .github/workflows/api_eva.yml | 187 ---------------------------------- 1 file changed, 187 deletions(-) delete mode 100644 .github/workflows/api_eva.yml diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml deleted file mode 100644 index 9c15c7b825..0000000000 --- a/.github/workflows/api_eva.yml +++ /dev/null @@ -1,187 +0,0 @@ -name: api_evalate - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. 
Default is "main"' - type: string - default: 'main' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false - regression_func: - required: true - description: 'regression functions' - type: string - default: "['evaluation']" - -env: - HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }} - OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL - -jobs: - linux-build: - if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - - download_pkgs: - needs: linux-build - if: ${{!cancelled()}} - runs-on: [self-hosted, 140-test] - timeout-minutes: 50 - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/qa_test_models:/nvme/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Copy Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - name: Copy Artifacts - offline - if: ${{inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - - - test_evaluation: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} - runs-on: [self-hosted, 140-test] - timeout-minutes: 120 # 2hours - strategy: - fail-fast: false - matrix: - evaluate_type: ['chat', 'base'] - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared - - /mnt/bigdisk:/mnt/bigdisk - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install sentence_transformers==2.2.2 --no-deps - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - - name: Install opencompass - run: | - git clone --depth=1 https://github.com/open-compass/opencompass.git - cd opencompass - cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt - python3 -m pip install -e . - echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Setup paths for evaluation - run: | - ln -s /root/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
- - name: Evaluate models - if: matrix.evaluate_type == 'chat' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - - name: Evaluate base models - if: matrix.evaluate_type == 'base' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true - - name: Clear workspace - if: always() - run: | - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir From 9ce68649bdaf9531c5ac57ff56f4b7fe0ffc7ca1 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 17:10:40 +0800 Subject: [PATCH 28/32] TEST: update chat test --- autotest/utils/run_restful_chat.py | 31 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index a3c3c99b77..04033f2482 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -164,8 +164,8 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): result = True - api_client = APIClient(url) - model_name = api_client.available_models[0] + client = OpenAI(api_key='YOUR_API_KEY', base_url=f'{url}/v1') + model_name = client.models.list().data[0].id messages = [] msg = '' @@ -176,18 +176,17 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): messages.append({'role': 'user', 'content': prompt}) file.writelines('prompt:' + prompt + '\n') - for output in api_client.chat_completions_v1(model=model_name, messages=messages, top_k=1, max_tokens=256): - output_message = output.get('choices')[0].get('message') - messages.append(output_message) + response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, top_p=0.8) - output_content = output_message.get('content') - file.writelines('output:' + output_content + '\n') + output_content = response.choices[0].message.content + file.writelines('output:' + output_content + '\n') + messages.append({'role': 'assistant', 'content': output_content}) - case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) - file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') - if not case_result: - msg += reason - result = result & case_result + case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) + file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') + if not case_result: + msg += reason + result = result & case_result file.close() 
return result, restful_log, msg @@ -458,9 +457,9 @@ def get_temperature_date(location: str, date: str, unit: str = 'celsius'): """Get temperature at a location and date. Args: - location: The location to get the temperature for, in the format "City, State, Country". - date: The date to get the temperature for, in the format "Year-Month-Day". - unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + location: The location to get the temperature for, in the format 'City, State, Country'. + date: The date to get the temperature for, in the format 'Year-Month-Day'. + unit: The unit to return the temperature in. Defaults to 'celsius'. (choices: ['celsius', 'fahrenheit']) Returns: the temperature, the location, the date and the unit in a dict @@ -618,7 +617,7 @@ def run_tools_case(config, port: int = DEFAULT_PORT): }, } }] - messages = [{'role': 'user', 'content': "What's the weather like in Boston today?"}] + messages = [{'role': 'user', 'content': 'What\'s the weather like in Boston today?'}] response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, From 4e62a33ccf1a3f3b7e70391907b63c60439d512a Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 17:23:16 +0800 Subject: [PATCH 29/32] TEST: fix tmp dir --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 24be9c9452..0e4f1c30c0 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -40,8 +40,8 @@ env: FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} LOG_PATH: /test/log/${{ github.run_id }} - TMPDIR: /mnt/deeplink/docker-tmp - RAY_TMPDIR: /mnt/deeplink/docker-tmp + TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp + RAY_TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp/ray jobs: download_pkgs: From 858cb5aff2755116024a9958fa2adbcbde93791f Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 12 Sep 2025 15:21:50 +0800 Subject: [PATCH 30/32] TEST: fix lint --- autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 19c12e4e79..44ded4473f 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -180,7 +180,6 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, is_smoke=True) - @pytest.mark.order(6) @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @@ -201,7 +200,6 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, is_smoke=True) - @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 From a78654354420961815b9191e310752fcbf74ae28 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Mon, 15 Sep 2025 17:46:20 +0800 Subject: [PATCH 31/32] TEST: update ascend config --- autotest/config-ascend.yaml | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml index 1b0588387f..d30c880463 100644 --- a/autotest/config-ascend.yaml +++ b/autotest/config-ascend.yaml @@ -9,38 
+9,15 @@ env_tag: a100 tp_config: Qwen2.5-32B-Instruct: 2 - -turbomind_chat_model: - - /Qwen3-0.6B - - pytorch_chat_model: - /Qwen3-0.6B -turbomind_vl_model: - - /Qwen3-0.6B - pytorch_vl_model: - /Qwen3-0.6B - -turbomind_base_model: - - /Qwen3-0.6B - pytorch_base_model: - /Qwen3-0.6B -turbomind_quatization: - no_awq: - - /Qwen3-0.6B - - gptq: - - /Qwen3-0.6B - no_kvint4: - - /Qwen3-0.6B - no_kvint8: - - /Qwen3-0.6B - pytorch_quatization: awq: - meta-llama/Meta-Llama-3-8B-Instruct From 5e06c9f0c3e9fe6035e5bf8f607dd9ea27acd8e1 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Tue, 16 Sep 2025 18:26:19 +0800 Subject: [PATCH 32/32] TEST: rm ascend config --- .github/workflows/daily_ete_test_ascend.yml | 171 -------------------- autotest/config-ascend.yaml | 35 ---- 2 files changed, 206 deletions(-) delete mode 100644 .github/workflows/daily_ete_test_ascend.yml delete mode 100644 autotest/config-ascend.yaml diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml deleted file mode 100644 index 0e4f1c30c0..0000000000 --- a/.github/workflows/daily_ete_test_ascend.yml +++ /dev/null @@ -1,171 +0,0 @@ -name: daily_ete_test_ascend - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - backend: - required: true - description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' - type: string - default: "['pytorch']" - model: - required: true - description: 'Set testcase module filter: llm, vllm. Default contains all models' - type: string - default: "['llm']" - function: - required: true - description: 'Set testcase function filter: chat, restful, pipeline. 
Default contains all functions' - type: string - default: '["pipeline", "restful", "chat"]' - regression_func: - required: true - description: 'regression functions' - type: string - default: "['tools']" - -env: - REPORT_DIR: /test/test-reports/${{ github.run_id }} - COV_PARAM: --cov /usr/local/python3.10.17/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} - LOG_PATH: /test/log/${{ github.run_id }} - TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp - RAY_TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp/ray - -jobs: - download_pkgs: - if: ${{!cancelled()}} - runs-on: [self-hosted, ascend-013] - timeout-minutes: 50 - container: - image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" - volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - - /usr/local/sbin:/usr/local/sbin:ro - - /var/log/npu/slog:/var/log/npu/slog - - /var/log/npu/profiling:/var/log/npu/profiling - - /var/log/npu/dump:/var/log/npu/dump - - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf:ro - - /root/qa_test:/test - - /mnt:/mnt - - /root/.cache/pip:/root/.cache/pip - steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{ !cancelled() }} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - if: ${{ !cancelled() }} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - - test_tools: - if: ${{!cancelled() && contains(fromJSON(github.event.inputs.regression_func), 'tools')}} - runs-on: [self-hosted, ascend-013] - needs: download_pkgs - timeout-minutes: 300 - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} - function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} - exclude: - - backend: turbomind - model: mllm - function: chat - - backend: pytorch - model: mllm - function: chat - container: - image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" - volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver - - /usr/local/sbin:/usr/local/sbin - - /var/log/npu/slog:/var/log/npu/slog - - /var/log/npu/profiling:/var/log/npu/profiling - - /var/log/npu/dump:/var/log/npu/dump - - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf - - /root/qa_test:/test - - /mnt:/mnt - - /root/.cache/pip:/root/.cache/pip - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{ env.TEST_CODE_PATH }}/. . 
- - name: Install lmdeploy - offline - run: | - python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Install lmdeploy - test - run: | - python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf ${{ env.LOG_PATH }}/* - mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p - ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - - name: Test lmdeploy - chat - continue-on-error: true - if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} - run: | - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline - continue-on-error: true - if: ${{ matrix.function == 'pipeline' }} - run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful - continue-on-error: true - if: ${{ matrix.function == 'restful' }} - run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and 
test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml deleted file mode 100644 index d30c880463..0000000000 --- a/autotest/config-ascend.yaml +++ /dev/null @@ -1,35 +0,0 @@ -model_path: /mnt/deeplink/group01/deeplink-test/weight -resource_path: /nvme/qa_test_models/resource -dst_path: /nvme/qa_test_models/autotest_model -log_path: /test/log -benchmark_path: /nvme/qa_test_models/benchmark-reports -dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json -env_tag: a100 - -tp_config: - Qwen2.5-32B-Instruct: 2 - -pytorch_chat_model: - - /Qwen3-0.6B - -pytorch_vl_model: - - /Qwen3-0.6B - -pytorch_base_model: - - /Qwen3-0.6B - -pytorch_quatization: - awq: - - meta-llama/Meta-Llama-3-8B-Instruct - w8a8: - - meta-llama/Meta-Llama-3-8B-Instruct - no_kvint4: - - /Qwen3-0.6B - no_kvint8: - - /Qwen3-0.6B - -longtext_model: - - /Qwen3-0.6B - -benchmark_model: - - /Qwen3-0.6B
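
For local reproduction, the pytest invocations that the removed daily_ete_test_ascend.yml workflow ran can still be issued by hand. The following is a minimal sketch, assuming the test_ascend marker, the --device pytest option and autotest/config-ascend.yaml are still present in the checkout (i.e. prior to PATCH 32/32); the report directory below is a placeholder, not the CI path.

    # placeholder for the CI report dir ($REPORT_DIR under /test/test-reports)
    export REPORT_DIR=/tmp/ascend-test-reports

    # single-card pipeline cases, 8 xdist workers, as in the workflow's pipeline step
    pytest autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py \
      -m 'gpu_num_1 and test_ascend' -n 8 --device ascend \
      --alluredir="$REPORT_DIR"

    # two-card pipeline cases, 4 xdist workers
    pytest autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py \
      -m 'gpu_num_2 and test_ascend' -n 4 --device ascend \
      --alluredir="$REPORT_DIR"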