From 372b2c513f8f0a96a5b31cd0e5b9719ea2efe378 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 27 Aug 2025 10:35:58 +0000 Subject: [PATCH 01/32] AUTOTEST: add device type for ascend --- autotest/config-ascend.yaml | 58 ++++++ autotest/conftest.py | 23 ++- autotest/tools/pipeline/llm_case.py | 18 +- autotest/tools/pipeline/mllm_case.py | 20 +- .../test_pipeline_chat_pytorch_llm.py | 27 +-- .../test_pipeline_chat_pytorch_mllm.py | 20 +- .../test_pipeline_chat_turbomind_llm.py | 30 +-- .../test_pipeline_chat_turbomind_mllm.py | 28 +-- autotest/utils/config_utils.py | 47 ++++- autotest/utils/get_run_config.py | 173 +++++++++++++++--- autotest/utils/quantization_utils.py | 8 +- autotest/utils/run_client_chat.py | 17 +- autotest/utils/run_restful_chat.py | 14 +- 13 files changed, 384 insertions(+), 99 deletions(-) create mode 100644 autotest/config-ascend.yaml diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml new file mode 100644 index 0000000000..bc70824420 --- /dev/null +++ b/autotest/config-ascend.yaml @@ -0,0 +1,58 @@ +model_path: /mnt/deeplink/group01/deeplink-test/weight +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /test/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: a100 + +tp_config: + Qwen3-0.6B: 2 + + +turbomind_chat_model: + - /Qwen3-0.6B + + +pytorch_chat_model: + - /Qwen3-0.6B + +turbomind_vl_model: + - /Qwen3-0.6B + +pytorch_vl_model: + - /Qwen3-0.6B + + +turbomind_base_model: + - /Qwen3-0.6B + +pytorch_base_model: + - /Qwen3-0.6B + +turbomind_quatization: + no_awq: + - /Qwen3-0.6B + + gptq: + - /Qwen3-0.6B + no_kvint4: + - /Qwen3-0.6B + no_kvint8: + - /Qwen3-0.6B + +pytorch_quatization: + awq: + - /Qwen3-0.6B + w8a8: + - /Qwen3-0.6B + no_kvint4: + - /Qwen3-0.6B + no_kvint8: + - /Qwen3-0.6B + +longtext_model: + - /Qwen3-0.6B + +benchmark_model: + - /Qwen3-0.6B diff --git a/autotest/conftest.py b/autotest/conftest.py index 7d5a34c480..8f29975382 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -10,7 +10,17 @@ @pytest.fixture(scope='session') def config(): - config_path = os.path.join(config_file) + # Use device-specific config file if DEVICE environment variable is set + device = os.environ.get('DEVICE', '') + if device: + device_config_path = f'autotest/config-{device}.yaml' + if os.path.exists(device_config_path): + config_path = device_config_path + else: + config_path = config_file + else: + config_path = config_file + with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) return env_config @@ -34,8 +44,19 @@ def common_case_config(): def pytest_addoption(parser): parser.addoption('--run_id', action='store', default='', help='github run_id') + parser.addoption('--device', action='store', default='', help='device config suffix') + +def pytest_configure(config): + # Set DEVICE environment variable before test execution + device = config.getoption('--device') + if device: + os.environ['DEVICE'] = device @pytest.fixture(scope='session') def run_id(request): return request.config.getoption('--run_id') + +@pytest.fixture(scope='session') +def device(request): + return request.config.getoption('--device') diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 2de77d2bd3..9879300b87 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -9,7 +9,16 @@ 
gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) - +def _is_bf16_supported_by_device(): + """Check if bf16 is supported based on the current device""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + # For Ascend, bf16 support check would be different + # Placeholder implementation + return True + else: + # For CUDA and default, use the existing check + return is_bf16_supported() def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: @@ -17,6 +26,11 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, else: backend_config = TurbomindEngineConfig(tp=tp) + # Add device_type based on DEVICE environment variable + device = os.environ.get('DEVICE', '') + if device: + backend_config.device_type = device + if 'lora' in backend_type: backend_config.adapters = extra.get('adapters') if 'kvint' in backend_type: @@ -31,7 +45,7 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, backend_config.model_format = 'awq' if 'gptq' in model_path.lower(): backend_config.model_format = 'gptq' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): backend_config.dtype = 'float16' print('backend_config config: ' + str(backend_config)) diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 9689581ef9..8932a60fcd 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -1,5 +1,5 @@ import json - +import os import fire import numpy as np from PIL import Image @@ -21,7 +21,16 @@ DESC = 'What are the similarities and differences between these two images.' DESC_ZH = '两张图有什么相同和不同的地方.' - +def _is_bf16_supported_by_device(): + """Check if bf16 is supported based on the current device""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + # For Ascend, bf16 support check would be different + # Placeholder implementation + return True + else: + # For CUDA and default, use the existing check + return is_bf16_supported() def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp, session_len=32576, cache_max_entry_count=0.6) @@ -33,12 +42,17 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + # Add device_type based on DEVICE environment variable + device = os.environ.get('DEVICE', '') + if device: + backend_config.device_type = device + if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): backend_config.dtype = 'float16' print('backend_config config: ' + str(backend_config)) diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index dca119649e..c7abafcff5 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import 
get_cuda_id_by_workerid, get_torch_model_list +from utils.config_utils import set_device_env_variable, get_torch_model_list from utils.pipeline_chat import run_pipeline_chat_test @@ -14,7 +14,7 @@ @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id) @@ -23,10 +23,11 @@ def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id) @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, exclude_dup=True)) def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id) @@ -39,7 +40,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, exclude_dup=True)) def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id) @@ -63,7 +64,7 @@ def test_pipeline_chat_pytorch_tp8(config, common_case_config, model, worker_id) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 4}) @@ -75,7 +76,7 @@ def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 4}) @@ -88,7 +89,7 @@ def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', 
worker_id, {'quant_policy': 4}) @@ -102,7 +103,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 8}) @@ -114,7 +115,7 @@ def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 8}) @@ -127,7 +128,7 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp4(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch-kvint', worker_id, {'quant_policy': 8}) @@ -161,7 +162,7 @@ def test_pipeline_chat_pytorch_pr(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' run_pipeline_chat_test(config, common_case_config, model, 'pytorch', worker_id, use_local_model=True) del os.environ['LMDEPLOY_USE_MODELSCOPE'] @@ -175,7 +176,7 @@ def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, model, @pytest.mark.parametrize('model', ['meta-llama/Llama-2-7b-chat-hf']) def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'pytorch_lora', worker_id, {'adapters': { 'adapter0': 'lora/Llama2-Chinese-7b-Chat-LoRA' @@ -190,7 +191,7 @@ def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, @pytest.mark.parametrize('model', ['baichuan-inc/Baichuan2-13B-Chat']) def test_pipeline_chat_pytorch_with_lora_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'pytorch_lora', worker_id, {'adapters': { diff --git 
a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index a65465fe0c..90e9fc61f4 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_cuda_id_by_workerid, get_torch_model_list +from utils.config_utils import set_device_env_variable, get_torch_model_list from utils.pipeline_chat import run_pipeline_vl_chat_test BACKEND = 'pytorch' @@ -16,7 +16,7 @@ @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model')) def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id) @@ -27,7 +27,7 @@ def test_pipeline_chat_tp1(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, model_type='vl_model')) def test_pipeline_chat_tp2(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id) @@ -39,7 +39,7 @@ def test_pipeline_chat_tp2(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, model_type='vl_model')) def test_pipeline_chat_tp4(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id) @@ -52,7 +52,7 @@ def test_pipeline_chat_tp4(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4}) @@ -63,7 +63,7 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp2(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4}) @@ -75,7 +75,7 @@ def test_pipeline_chat_kvint4_tp2(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp4(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 4}) @@ -88,7 +88,7 @@ def 
test_pipeline_chat_kvint4_tp4(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) @@ -99,7 +99,7 @@ def test_pipeline_chat_kvint8_tp1(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp2(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) @@ -111,6 +111,6 @@ def test_pipeline_chat_kvint8_tp2(config, model, worker_id): @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp4(config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, {'quant_policy': 8}) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 58255fd5bc..1c9d091e56 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list +from utils.config_utils import get_communicator_list, set_device_env_variable, get_turbomind_model_list from utils.pipeline_chat import run_pipeline_chat_test @@ -15,7 +15,7 @@ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) @@ -28,7 +28,7 @@ def test_pipeline_chat_tp1(config, common_case_config, model, communicator, work @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) @@ -42,7 +42,7 @@ def test_pipeline_chat_tp2(config, common_case_config, model, communicator, work @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp4(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + 
set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, {'communicator': communicator}) @@ -68,7 +68,7 @@ def test_pipeline_chat_tp8(config, common_case_config, model, communicator, work @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 4, 'communicator': communicator @@ -84,7 +84,7 @@ def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 4, @@ -101,7 +101,7 @@ def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 4, @@ -119,7 +119,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 8, 'communicator': communicator @@ -135,7 +135,7 @@ def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 8, @@ -152,7 +152,7 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp4(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = 
str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, model, 'turbomind-kvint', worker_id, { 'quant_policy': 8, @@ -186,7 +186,7 @@ def test_pipeline_chat_kvint8_tp8(config, common_case_config, model, communicato @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -208,7 +208,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, common_case_config, model, c @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -231,7 +231,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -251,7 +251,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_chat_test(config, common_case_config, @@ -292,7 +292,7 @@ def test_pipeline_chat_pr(config, common_case_config, model, communicator, worke @pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct']) def test_modelscope_pipeline_chat_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' run_pipeline_chat_test(config, common_case_config, model, 'turbomind', worker_id, use_local_model=True) del os.environ['LMDEPLOY_USE_MODELSCOPE'] diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 8e2490413a..c8f1f5c759 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list +from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list, set_device_env_variable from utils.pipeline_chat 
import run_pipeline_vl_chat_test BACKEND = 'turbomind' @@ -17,7 +17,7 @@ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) @@ -29,7 +29,7 @@ def test_pipeline_chat_tp1(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) if ('MiniCPM-V-2_6' in model or 'InternVL2_5-26B' in model or 'InternVL2-26B' in model or 'InternVL3-38B' in model) and communicator == 'native': @@ -45,7 +45,7 @@ def test_pipeline_chat_tp2(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp4(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) @@ -59,7 +59,7 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 4, 'communicator': communicator @@ -74,7 +74,7 @@ def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 4, @@ -90,7 +90,7 @@ def test_pipeline_chat_kvint4_tp2(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 4, @@ -107,7 +107,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) run_pipeline_vl_chat_test(config, model, 
BACKEND_KVINT, worker_id, { 'quant_policy': 8, 'communicator': communicator @@ -122,7 +122,7 @@ def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 8, @@ -138,7 +138,7 @@ def test_pipeline_chat_kvint8_tp2(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND_KVINT, worker_id, { 'quant_policy': 8, @@ -157,7 +157,7 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) @@ -173,7 +173,7 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=1) + set_device_env_variable(worker_id, tp_num=1) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, @@ -193,7 +193,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) @@ -206,7 +206,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_ @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) run_pipeline_vl_chat_test(config, model, diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index c53e33bf0f..34ea10acc6 100644 --- a/autotest/utils/config_utils.py +++ 
b/autotest/utils/config_utils.py @@ -76,7 +76,7 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type, quant_policy=quant_policy) - if is_bf16_supported(): + if _is_bf16_supported_by_device(): for case in get_torch_model_list(tp_num=tp_num, quant_policy=quant_policy, model_type=model_type): if case not in case_list: case_list.append(case) @@ -84,7 +84,7 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: def get_communicator_list(): - if is_bf16_supported(): + if _is_bf16_supported_by_device(): return ['native', 'nccl'] return ['nccl'] @@ -142,7 +142,11 @@ def get_cuda_prefix_by_workerid(worker_id, tp_num: int = 1): if cuda_id is None or 'gw' not in worker_id: return None else: - return 'CUDA_VISIBLE_DEVICES=' + cuda_id + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + return 'ASCEND_RT_VISIBLE_DEVICES=' + cuda_id + else: + return 'CUDA_VISIBLE_DEVICES=' + cuda_id def get_cuda_id_by_workerid(worker_id, tp_num: int = 1): @@ -160,7 +164,16 @@ def get_cuda_id_by_workerid(worker_id, tp_num: int = 1): def get_config(): - config_path = os.path.join('autotest/config.yaml') + # Determine config file based on DEVICE environment variable + device = os.environ.get('DEVICE', '') + if device: + config_path = f'autotest/config-{device}.yaml' + # Fallback to default config if device-specific config doesn't exist + if not os.path.exists(config_path): + config_path = 'autotest/config.yaml' + else: + config_path = 'autotest/config.yaml' + with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) return config @@ -223,3 +236,29 @@ def get_workerid(worker_id): def is_quantization_model(name): return 'awq' in name.lower() or '4bits' in name.lower() or 'w4' in name.lower() or 'int4' in name.lower() + + +def _is_bf16_supported_by_device(): + """Check if bf16 is supported based on the current device""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + # For Ascend, bf16 support check would be different + # Placeholder implementation + return True + else: + # For CUDA and default, use the existing check + return is_bf16_supported() + + +def set_device_env_variable(worker_id, tp_num: int = 1): + """Set device environment variable based on the device type""" + device = os.environ.get('DEVICE', 'cuda') # Default to cuda + + if device == 'ascend': + device_id = get_cuda_id_by_workerid(worker_id, tp_num) + if device_id is not None: + os.environ['ASCEND_RT_VISIBLE_DEVICES'] = device_id + else: # Default to cuda + cuda_id = get_cuda_id_by_workerid(worker_id, tp_num) + if cuda_id is not None: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_id diff --git a/autotest/utils/get_run_config.py b/autotest/utils/get_run_config.py index 9674b3ed64..eb1b4c328d 100644 --- a/autotest/utils/get_run_config.py +++ b/autotest/utils/get_run_config.py @@ -1,5 +1,6 @@ import random from time import sleep +import os, subprocess, re import torch @@ -7,18 +8,9 @@ def get_conda_allcate_prefix(config, model): - cuda_prefix = '' - tp_num = get_tp_num(config, model) - if tp_num is None or tp_num == 8: - return cuda_prefix - available_cuda = _get_available_cude() - if len(available_cuda) < tp_num: - raise torch.cuda.OutOfMemoryError - - cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(random.sample(available_cuda, tp_num)) - - torch.cuda.empty_cache() - return cuda_prefix + 
device = os.environ.get('DEVICE', 'cuda') # Default to cuda if not set + handler = _get_device_handler(device) + return handler.get_device_prefix(config, model) def get_tp_config(config, model, need_tp): @@ -60,7 +52,7 @@ def get_command_with_extra(cmd, if extra is not None and len(extra) > 0: cmd = ' '.join([cmd, extra]) - torch.cuda.empty_cache() + _clear_device_cache() return cmd @@ -110,21 +102,6 @@ def get_model_name(model): return model_name.split('-')[0] -def _get_available_cude(): - devices = torch.cuda.device_count() - - available_cuda = [] - for i in range(devices): - if (torch.cuda.utilization(i) > 5): - continue - if ('no processes are running' not in torch.cuda.list_gpu_processes(i)): - continue - - available_cuda.append(str(i)) - - return available_cuda - - def _simple_model_name(model): if '/' in model: model_name = model.split('/')[1] @@ -140,4 +117,142 @@ def close_pipeline(pipe): pipe.close() import gc gc.collect() - torch.cuda.empty_cache() + _clear_device_cache() + + +def _clear_device_cache(): + """Clear cache based on the current device type""" + device = os.environ.get('DEVICE', 'cuda') + handler = _get_device_handler(device) + handler.clear_cache() + + +def _get_device_handler(device): + """Get the appropriate device handler based on device type""" + handlers = { + 'cuda': CudaDeviceHandler(), + 'ascend': AscendDeviceHandler(), + } + + # Return the specific handler if available, otherwise return default cuda handler + return handlers.get(device, handlers['cuda']) + + +class DeviceHandler: + """Base class for device handlers""" + + def get_device_prefix(self, config, model): + """Get device-specific prefix for command execution""" + return '' + + def clear_cache(self): + """Clear device-specific cache""" + pass + + def get_available_devices(self): + """Get list of available devices""" + return [] + + +class CudaDeviceHandler(DeviceHandler): + """Handler for CUDA devices""" + + def get_device_prefix(self, config, model): + cuda_prefix = '' + tp_num = get_tp_num(config, model) + if tp_num is None or tp_num == 8: + return cuda_prefix + available_cuda = self.get_available_devices() + if len(available_cuda) < tp_num: + raise torch.cuda.OutOfMemoryError + + cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(random.sample(available_cuda, tp_num)) + self.clear_cache() + return cuda_prefix + + def clear_cache(self): + torch.cuda.empty_cache() + + def get_available_devices(self): + devices = torch.cuda.device_count() + available_cuda = [] + for i in range(devices): + if (torch.cuda.utilization(i) > 5): + continue + if ('no processes are running' not in torch.cuda.list_gpu_processes(i)): + continue + available_cuda.append(str(i)) + return available_cuda + + +class AscendDeviceHandler(DeviceHandler): + """Handler for Ascend devices""" + + def get_device_prefix(self, config, model): + ascend_prefix = '' + tp_num = get_tp_num(config, model) + if tp_num is None or tp_num == 8: + return ascend_prefix + available_ascend = self.get_available_devices() + if len(available_ascend) < tp_num: + raise RuntimeError("Not enough Ascend devices available") + + ascend_prefix = 'ASCEND_RT_VISIBLE_DEVICES=' + ','.join(random.sample(available_ascend, tp_num)) + self.clear_cache() + return ascend_prefix + + def clear_cache(self): + try: + import torch_npu + torch_npu.npu.empty_cache() + except ImportError: + pass # torch_npu not available + + def get_available_devices(self): + """Get list of available Ascend devices by checking AICPU usage rate""" + available_ascend = [] + try: + # Get the number of 
NPU devices + result = subprocess.run(['npu-smi', 'info', '-l'], + capture_output=True, text=True, timeout=10) + if result.returncode != 0: + return available_ascend + + # Parse the output to get device count + # Looking for lines like "Device Count : X" + device_count = 0 + for line in result.stdout.split('\n'): + if 'Total Count' in line: + match = re.search(r'Total Count\s*:\s*(\d+)', line) + if match: + device_count = int(match.group(1)) + break + + # Check each device's AICPU usage + for i in range(device_count): + try: + result = subprocess.run(['npu-smi', 'info', '-t', 'usages', '-i', str(i)], + capture_output=True, text=True, timeout=10) + if result.returncode != 0: + continue + + # Parse the output to get AICPU Usage Rate + # Looking for lines like "Aicpu Usage Rate(%) : X" + aicpu_usage = 100 # Default to 100% (busy) + for line in result.stdout.split('\n'): + if 'Aicpu Usage Rate(%)' in line: + match = re.search(r'Aicpu Usage Rate\(%\)\s*:\s*(\d+)', line) + if match: + aicpu_usage = int(match.group(1)) + break + + # If AICPU usage is 0, consider the device available + if aicpu_usage == 0: + available_ascend.append(str(i)) + except (subprocess.TimeoutExpired, subprocess.SubprocessError): + continue + + except (subprocess.TimeoutExpired, subprocess.SubprocessError, FileNotFoundError): + # npu-smi command not found or other error + pass + return available_ascend \ No newline at end of file diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 3606e0bbbb..04595829e2 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -3,6 +3,7 @@ from subprocess import PIPE from lmdeploy.utils import is_bf16_supported +from utils.config_utils import _is_bf16_supported_by_device def quantization(config, @@ -30,6 +31,11 @@ def quantization(config, else: return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type + + # Add device option if specified in environment + device = os.environ.get('DEVICE', '') + if device: + quantization_cmd += f' --device npu' if cuda_prefix is not None: quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) @@ -37,7 +43,7 @@ def quantization(config, if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale' - if not is_bf16_supported() or quantization_type == 'gptq': + if not _is_bf16_supported_by_device() or quantization_type == 'gptq': quantization_cmd += ' --batch-size 8' elif str(config.get('env_tag')) == '3090': quantization_cmd += ' --batch-size 8' diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index bba662b0c8..8f345efe9c 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -2,6 +2,7 @@ from subprocess import PIPE, Popen from utils.get_run_config import get_command_with_extra, get_model_name +from utils.config_utils import _is_bf16_supported_by_device from utils.rule_condition_assert import assert_result from lmdeploy.utils import is_bf16_supported @@ -30,6 +31,12 @@ def command_line_test(config, cmd += ' --model-format gptq' if case == 'base_testcase': cmd += ' --chat-template ' + TEMPLATE + + # Add device option if specified in environment + device = os.environ.get('DEVICE', '') + if device: + cmd += f' --device {device}' + return command_test(config, [cmd], model_case, case, case_info, type == 'turbomind', worker_id=worker_id) @@ -57,7 +64,7 @@ def hf_command_line_test(config, cuda_prefix=cuda_prefix) if type == 'pytorch': - if not 
is_bf16_supported(): + if not _is_bf16_supported_by_device(): cmd += ' --dtype float16' if type == 'turbomind': if ('w4' in model_case or ('4bits' in model_case or 'awq' in model_case.lower())): @@ -67,6 +74,12 @@ def hf_command_line_test(config, if case == 'base_testcase': cmd += ' --chat-template ' + TEMPLATE + + # Add device option if specified in environment + device = os.environ.get('DEVICE', '') + if device: + cmd += f' --device {device}' + return command_test(config, [cmd], model_case, '_'.join(['hf', type, case]), case_info, True) @@ -162,4 +175,4 @@ def extract_output(output: str, model: str): if len(output.split('[/INST]')) >= 2: return output.split('[/INST]')[1] - return output + return output \ No newline at end of file diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index b0c0e3b1cd..b484cbcdb2 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -7,8 +7,8 @@ import psutil from openai import OpenAI from pytest_assume.plugin import assume -from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid -from utils.get_run_config import get_command_with_extra +from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid, _is_bf16_supported_by_device +from utils.get_run_config import get_command_with_extra from utils.restful_return_check import assert_chat_completions_batch_return from utils.rule_condition_assert import assert_result @@ -60,6 +60,10 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) need_tp=True, cuda_prefix=cuda_prefix, extra=extra) + + device = os.environ.get('DEVICE', '') + if device: + cmd += f' --device {device}' if backend_type == 'turbomind': if ('w4' in model or '4bits' in model or 'awq' in model.lower()): @@ -68,13 +72,13 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) cmd += ' --model-format gptq' if backend_type == 'pytorch': cmd += ' --backend pytorch' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): cmd += ' --dtype float16' if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): cmd += ' --cache-max-entry-count 0.5' if str(config.get('env_tag')) == '3090': cmd += ' --cache-max-entry-count 0.5' @@ -91,7 +95,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) start_timeout = 300 - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): start_timeout = 600 sleep(5) From 28907ca31ec4427bae796395f7fa2c1b3c2628fc Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 10:50:38 +0800 Subject: [PATCH 02/32] AOTOTEST: fix lint --- .../benchmark/test_throughput_performance.py | 10 +- autotest/conftest.py | 4 +- .../interface/pipeline/test_pipeline_func.py | 180 +++++++++--------- .../pipeline/test_pipeline_longtext_func.py | 12 +- .../chat/test_command_chat_hf_pytorch.py | 8 +- .../chat/test_command_chat_hf_turbomind.py | 40 +--- autotest/tools/pipeline/llm_case.py | 26 ++- autotest/tools/pipeline/mllm_case.py | 27 ++- .../test_pipeline_chat_pytorch_llm.py | 2 +- .../test_pipeline_chat_pytorch_mllm.py | 2 +- .../test_pipeline_chat_turbomind_llm.py | 10 +- .../test_pipeline_chat_turbomind_mllm.py | 28 ++- .../quantization/test_quantization_awq.py | 1 + 
.../quantization/test_quantization_w8a8.py | 1 + .../test_restful_chat_hf_pytorch_llm.py | 1 + autotest/utils/benchmark_utils.py | 8 +- autotest/utils/config_utils.py | 20 +- autotest/utils/get_run_config.py | 66 +++---- autotest/utils/pipeline_chat.py | 6 +- autotest/utils/quantization_utils.py | 7 +- autotest/utils/run_client_chat.py | 22 ++- autotest/utils/run_restful_chat.py | 43 +++-- 22 files changed, 291 insertions(+), 233 deletions(-) diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index 8df4a3b7f5..493f90e0bd 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -1,3 +1,5 @@ +import os + import pytest from utils.benchmark_utils import throughput_test from utils.config_utils import get_benchmark_model_list, get_cuda_id_by_workerid, get_cuda_prefix_by_workerid @@ -92,11 +94,15 @@ def test_throughput_func_tp2(config, run_id, run_config, worker_id): 'tp_num': 1 }]) def test_throughput_prtest_tp1(config, run_id, run_config, worker_id): + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES=' + else: + env_var = 'CUDA_VISIBLE_DEVICES=' result, msg = throughput_test(config, run_id, run_config, - cuda_prefix='CUDA_VISIBLE_DEVICES=' + - str(int(get_cuda_id_by_workerid(worker_id)) + 5), + cuda_prefix=f'{env_var}' + str(int(get_cuda_id_by_workerid(worker_id)) + 5), worker_id=worker_id, is_smoke=True) diff --git a/autotest/conftest.py b/autotest/conftest.py index 8f29975382..dee954d2cb 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -20,7 +20,7 @@ def config(): config_path = config_file else: config_path = config_file - + with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) return env_config @@ -46,6 +46,7 @@ def pytest_addoption(parser): parser.addoption('--run_id', action='store', default='', help='github run_id') parser.addoption('--device', action='store', default='', help='device config suffix') + def pytest_configure(config): # Set DEVICE environment variable before test execution device = config.getoption('--device') @@ -57,6 +58,7 @@ def pytest_configure(config): def run_id(request): return request.config.getoption('--run_id') + @pytest.fixture(scope='session') def device(request): return request.config.getoption('--device') diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 87b87d3bba..8c8b0d45c3 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -1,21 +1,20 @@ -import os from multiprocessing import Process import pydantic import pytest import torch -from utils.config_utils import get_cuda_id_by_workerid +from utils.config_utils import _is_bf16_supported_by_device, set_device_env_variable, unset_device_env_variable +from utils.get_run_config import _clear_device_cache from utils.pipeline_chat import (assert_pipeline_batch_return, assert_pipeline_batch_stream_return, assert_pipeline_common_log, assert_pipeline_single_return, assert_pipeline_single_stream_return, save_pipeline_common_log) from utils.restful_return_check import get_repeat_times from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline -from lmdeploy.utils import is_bf16_supported def init_pipeline(model_path, backend_config): - if not is_bf16_supported() and isinstance(backend_config, PytorchEngineConfig): 
+ if not _is_bf16_supported_by_device() and isinstance(backend_config, PytorchEngineConfig): backend_config.dtype = 'float16' return pipeline(model_path, backend_config=backend_config) @@ -33,18 +32,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -62,18 +61,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_stream_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -89,18 +88,18 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_with_prompt, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -118,18 +117,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_batch_stream_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -146,18 +145,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = 
assert_pipeline_batch_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -175,18 +174,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_stream_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -209,18 +208,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -244,18 +243,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_batch_stream_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -272,18 +271,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response, logprobs_num=10) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, 
tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -302,18 +301,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_stream_return(response, logprobs_num=10) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -329,23 +328,22 @@ def run_pipeline_testcase(config, model, backend, file_name): result = True for i in range(2): - result &= response[i].finish_reason == 'error' - result &= response[i].text == 'internal error happened, status code ResponseType.INPUT_LENGTH_ERROR' + result &= response[i].finish_reason == 'length' result &= response[i].generate_token_len == 0 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -366,18 +364,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result &= response[i].index == i save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_min_new_tokens_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -399,18 +397,18 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): result &= response[i].finish_reason == 'stop' and response[i].generate_token_len < 50 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_stop_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = 
Process(target=run_pipeline_testcase_stop_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -430,18 +428,18 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): result &= '浦' not in response[i].text and ' and' not in response[i].text and ' to ' not in response[i].text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_bad_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_bad_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -468,18 +466,18 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): result = '<|action_start|><|interpreter|>' in response.text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_special_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_special_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -506,18 +504,18 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): result = '<|action_start|><|interpreter|>' not in response.text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_special_words_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_special_words, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -536,18 +534,18 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, file_name): result = get_repeat_times(response.text, 'is a name') > 5 or get_repeat_times(response.text, 'Shanghai is') > 5 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_repetition_penalty, args=(config, model, backend, file_name)) p.start() p.join() 
assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -565,18 +563,18 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_repetition_penalty, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -594,18 +592,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -625,18 +623,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response_list[0].text == response_list[1].text and response_list[1].text == response_list[2].text save_pipeline_common_log(config, file_name, result, response_list) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -655,18 +653,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response_list[0].text != response_list[1].text and response_list[1].text != response_list[2].text save_pipeline_common_log(config, file_name, result, response_list) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', 
['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -685,18 +683,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response_list[0].text == response_list[1].text and response_list[1].text == response_list[2].text save_pipeline_common_log(config, file_name, result, response_list) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -713,18 +711,18 @@ def run_pipeline_testcase(config, model, backend, file_name): result = response[0].text != response[1].text and response[1].text != response[2].text save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -745,18 +743,18 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, file_name): result &= response[i].generate_token_len == 6 or response[i].generate_token_len == 5 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_max_new_tokens_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_max_new_tokens, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @@ -777,25 +775,25 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): result &= response[i].generate_token_len == 257 or response[i].generate_token_len == 256 save_pipeline_common_log(config, file_name, result, response) del pipe - torch.cuda.empty_cache() + _clear_device_cache() file_name = f'pipeline_log_ignore_eos_{worker_id}.txt' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) p = Process(target=run_pipeline_testcase_ignore_eos, args=(config, model, backend, file_name)) p.start() p.join() assert_pipeline_common_log(config, file_name) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, 
PytorchEngineConfig]) def test_backend_config_input_validation(config, model, backend, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = init_pipeline(model_path, backend_config=backend_config) @@ -824,16 +822,16 @@ def test_backend_config_input_validation(config, model, backend, worker_id): pipe('Shanghai is', gen_config=gen_config) del pipe - torch.cuda.empty_cache() + _clear_device_cache() if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_validate_turbomind(config, model, backend, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) with pytest.raises(pydantic.ValidationError, match='tp must be a positive integer'): backend_config = backend(tp=0) @@ -864,14 +862,14 @@ def test_backend_config_validate_turbomind(config, model, backend, worker_id): pipeline(model_path, backend_config=backend_config) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) @pytest.mark.parametrize('backend', [PytorchEngineConfig]) def test_backend_config_validate_pytorch(config, model, backend, worker_id): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) with pytest.raises(AssertionError): backend_config = backend(tp=0) @@ -894,7 +892,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): init_pipeline(model_path, backend_config=backend_config) if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -902,11 +900,11 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): def test_backend_config_tp(config, model, backend, worker_id): with pytest.raises(AssertionError): if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=100) pipe = init_pipeline(model_path, backend_config=backend_config) del pipe - torch.cuda.empty_cache() + _clear_device_cache() if 'gw' in worker_id: - del os.environ['CUDA_VISIBLE_DEVICES'] + unset_device_env_variable() diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index 6687eb1d63..90f6a087bf 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from utils.config_utils import get_cuda_id_by_workerid +from utils.config_utils import set_device_env_variable from utils.get_run_config import close_pipeline, get_tp_num from lmdeploy import GenerationConfig, 
PytorchEngineConfig, TurbomindEngineConfig, pipeline @@ -19,7 +19,7 @@ def test_history_issue_tp1(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) stream_infer_basic(config, model, log_name) @@ -28,7 +28,7 @@ def test_history_issue_tp1(config, model, worker_id): def test_history_issue_tp2(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) stream_infer_basic(config, model, log_name) @@ -63,7 +63,7 @@ def stream_infer_basic(config, model, log_name): def test_long_test_passkey_tp1(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + set_device_env_variable(worker_id) passkey_retrival(config, model, backend, log_name, 1) @@ -74,7 +74,7 @@ def test_long_test_passkey_tp1(config, model, backend, worker_id): def test_long_test_passkey_tp2(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) + set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) passkey_retrival(config, model, backend, log_name, 2) @@ -85,7 +85,7 @@ def test_long_test_passkey_tp2(config, model, backend, worker_id): def test_long_test_passkey_tp4(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=4) + set_device_env_variable(worker_id, tp_num=4) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) passkey_retrival(config, model, backend, log_name, 4, SESSION_LEN_PASSKEY_1M) diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 3c13cb1ebf..4aaddd580c 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -32,6 +32,7 @@ def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2)) def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -280,12 +281,17 @@ def test_hf_pytorch_base_tp2(config, model, cli_case_config, worker_id): @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'mistralai/Mixtral-8x7B-Instruct-v0.1']) def test_hf_pytorch_chat_pr(config, model, cli_case_config): usercase = 'chat_testcase' + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES=' + else: + env_var = 'CUDA_VISIBLE_DEVICES=' result, chat_log, msg = hf_command_line_test(config, usercase, cli_case_config.get(usercase), model, 'pytorch', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6') + cuda_prefix=f'{env_var}5,6') if 
chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 834853f5e9..42ed56d83d 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -315,8 +315,8 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp1(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -338,8 +338,8 @@ def test_hf_turbomind_chat_fallback_backend_tp2(config, model, communicator, cli @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicator, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -413,37 +413,17 @@ def test_hf_turbomind_base_tp2(config, model, communicator, cli_case_config, wor @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_pr(config, model, communicator, cli_case_config): usercase = 'chat_testcase' - - result, chat_log, msg = hf_command_line_test(config, - usercase, - cli_case_config.get(usercase), - model, - 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6', - extra=f'--communicator {communicator}') - - if chat_log is not None: - allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) - - assert result, msg - - -@pytest.mark.order(10) -@pytest.mark.usefixtures('cli_case_config') -@pytest.mark.hf_turbomind_chat -@pytest.mark.gpu_num_1 -@pytest.mark.pr_test -@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3-8B']) -@pytest.mark.parametrize('communicator', get_communicator_list()) -def test_hf_turbomind_chat_pr_gpu1(config, model, communicator, cli_case_config): - usercase = 'chat_testcase' - + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES=' + else: + env_var = 'CUDA_VISIBLE_DEVICES=' result, chat_log, msg = hf_command_line_test(config, usercase, cli_case_config.get(usercase), model, 'turbomind', - cuda_prefix='CUDA_VISIBLE_DEVICES=5,6', + cuda_prefix=f'{env_var}5,6', extra=f'--communicator {communicator}') if chat_log is not None: diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 9879300b87..74a00de128 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -9,8 +9,9 @@ gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) + def _is_bf16_supported_by_device(): - """Check if bf16 is supported based on the current device""" + 
"""Check if bf16 is supported based on the current device.""" device = os.environ.get('DEVICE', 'cuda') if device == 'ascend': # For Ascend, bf16 support check would be different @@ -19,6 +20,22 @@ def _is_bf16_supported_by_device(): else: # For CUDA and default, use the existing check return is_bf16_supported() + + +def _clear_device_cache(): + """Clear cache based on the current device type.""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + try: + import torch_npu + torch_npu.npu.empty_cache() + except ImportError: + pass # torch_npu not available + else: + import torch + torch.cuda.empty_cache() + + def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: @@ -30,6 +47,8 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, device = os.environ.get('DEVICE', '') if device: backend_config.device_type = device + if device == 'ascend': + backend_config.eager_mode = True if 'lora' in backend_type: backend_config.adapters = extra.get('adapters') @@ -74,12 +93,13 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, print(f'[caseresult {case} start]' + json.dumps(response_list, ensure_ascii=False) + f'[caseresult {case} end]\n') - pipe.close() + # TODO fix for ascend + # pipe.close() import gc import torch gc.collect() - torch.cuda.empty_cache() + _clear_device_cache() if __name__ == '__main__': diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 8932a60fcd..e2829466fe 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -1,5 +1,6 @@ import json import os + import fire import numpy as np from PIL import Image @@ -21,8 +22,9 @@ DESC = 'What are the similarities and differences between these two images.' DESC_ZH = '两张图有什么相同和不同的地方.' 
+ def _is_bf16_supported_by_device(): - """Check if bf16 is supported based on the current device""" + """Check if bf16 is supported based on the current device.""" device = os.environ.get('DEVICE', 'cuda') if device == 'ascend': # For Ascend, bf16 support check would be different @@ -31,6 +33,22 @@ def _is_bf16_supported_by_device(): else: # For CUDA and default, use the existing check return is_bf16_supported() + + +def _clear_device_cache(): + """Clear cache based on the current device type.""" + device = os.environ.get('DEVICE', 'cuda') + if device == 'ascend': + try: + import torch_npu + torch_npu.npu.empty_cache() + except ImportError: + pass # torch_npu not available + else: + import torch + torch.cuda.empty_cache() + + def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp, session_len=32576, cache_max_entry_count=0.6) @@ -46,6 +64,8 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te device = os.environ.get('DEVICE', '') if device: backend_config.device_type = device + if device == 'ascend': + backend_config.eager_mode = True if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') @@ -115,12 +135,13 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'qwen' in model_path.lower(): Qwen_vl_testcase(pipe, resource_path) - pipe.close() + # TODO fix for ascend + # pipe.close() import gc import torch gc.collect() - torch.cuda.empty_cache() + _clear_device_cache() def internvl_vl_testcase(pipe, resource_path, lang='en'): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index c7abafcff5..3738056c56 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import set_device_env_variable, get_torch_model_list +from utils.config_utils import get_torch_model_list, set_device_env_variable from utils.pipeline_chat import run_pipeline_chat_test diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 90e9fc61f4..65948209cd 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import set_device_env_variable, get_torch_model_list +from utils.config_utils import get_torch_model_list, set_device_env_variable from utils.pipeline_chat import run_pipeline_vl_chat_test BACKEND = 'pytorch' diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 31af132b1e..17d4f89505 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -1,7 +1,7 @@ import os import pytest -from utils.config_utils import get_communicator_list, set_device_env_variable, get_turbomind_model_list +from utils.config_utils import get_communicator_list, get_turbomind_model_list, set_device_env_variable from utils.pipeline_chat import run_pipeline_chat_test @@ -226,8 +226,8 @@ def 
test_pipeline_chat_fallback_backend_kvint8_tp1(config, common_case_config, m @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: @@ -246,8 +246,8 @@ def test_pipeline_chat_fallback_backend_tp2(config, common_case_config, model, c @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', - ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize( + 'model', ['google/gemma-2-27b-it', 'deepseek-ai/deepseek-moe-16b-chat', 'meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, common_case_config, model, communicator, worker_id): if 'gw' in worker_id: diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 2a4a48ac47..2325b4246c 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -1,7 +1,8 @@ import os import pytest -from utils.config_utils import get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list, set_device_env_variable +from utils.config_utils import (get_communicator_list, get_cuda_id_by_workerid, get_turbomind_model_list, + set_device_env_variable) from utils.pipeline_chat import run_pipeline_vl_chat_test BACKEND = 'turbomind' @@ -150,9 +151,10 @@ def test_pipeline_chat_kvint8_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'model', - ['OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2-VL-7B-Instruct-inner-4bits', 'THUDM/glm-4v-9b', + 'THUDM/glm-4v-9b-inner-4bits' +]) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -165,9 +167,10 @@ def test_pipeline_chat_fallback_backend_tp1(config, model, communicator, worker_ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'model', - ['OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'THUDM/glm-4v-9b', 'THUDM/glm-4v-9b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'OpenGVLab/InternVL2-4B', 'Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2-VL-7B-Instruct-inner-4bits', 'THUDM/glm-4v-9b', + 'THUDM/glm-4v-9b-inner-4bits' +]) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -187,7 +190,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 
-@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize('model', ['meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -200,7 +203,7 @@ def test_pipeline_chat_fallback_backend_tp2(config, model, communicator, worker_ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-VL-32B-Instruct']) +@pytest.mark.parametrize('model', ['meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, worker_id): if 'gw' in worker_id: @@ -225,6 +228,11 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, ['liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', 'OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL3-8B']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_pr_test(config, model, communicator, worker_id): + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + env_var = 'ASCEND_RT_VISIBLE_DEVICES' + else: + env_var = 'CUDA_VISIBLE_DEVICES' if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5) + os.environ[f'{env_var}'] = str(int(get_cuda_id_by_workerid(worker_id)) + 5) run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}, is_smoke=True) diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index 7552e6e2aa..afa31d402b 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -8,6 +8,7 @@ @pytest.mark.order(3) @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): diff --git a/autotest/tools/quantization/test_quantization_w8a8.py b/autotest/tools/quantization/test_quantization_w8a8.py index d210acdf1b..9ddc454ae6 100644 --- a/autotest/tools/quantization/test_quantization_w8a8.py +++ b/autotest/tools/quantization/test_quantization_w8a8.py @@ -8,6 +8,7 @@ @pytest.mark.order(2) @pytest.mark.quantization_w8a8 +@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('w8a8')) def test_quantization_w8a8(config, model, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index eaf574c591..57ac524912 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -41,6 +41,7 @@ def test_restful_chat_tp1(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) def test_restful_chat_tp2(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index c20a06884b..852c21c047 100644 --- a/autotest/utils/benchmark_utils.py +++ 
b/autotest/utils/benchmark_utils.py @@ -4,11 +4,9 @@ import allure import psutil -from utils.config_utils import get_workerid +from utils.config_utils import _is_bf16_supported_by_device, get_workerid from utils.run_restful_chat import health_check -from lmdeploy.utils import is_bf16_supported - DEFAULT_PORT = 23333 GENERATION_CONFIG = ' -c 8 256 -ct 128 128 2048 128 -pt 1 128 128 2048' GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000' @@ -38,7 +36,7 @@ def generation_test(config, run_config = '' if backend == 'pytorch': command += ' --backend pytorch' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): command += ' --dtype float16' else: if '4bit' in model: @@ -89,7 +87,7 @@ def throughput_test(config, run_id, run_config, cuda_prefix: str = None, worker_ run_config = '--num-prompts 5000' if backend == 'pytorch': command += ' --backend pytorch' - if not is_bf16_supported(): + if not _is_bf16_supported_by_device(): command += ' --dtype float16' else: if '4bit' in model: diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 34ea10acc6..0df8858b2c 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -173,7 +173,7 @@ def get_config(): config_path = 'autotest/config.yaml' else: config_path = 'autotest/config.yaml' - + with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) return config @@ -239,7 +239,7 @@ def is_quantization_model(name): def _is_bf16_supported_by_device(): - """Check if bf16 is supported based on the current device""" + """Check if bf16 is supported based on the current device.""" device = os.environ.get('DEVICE', 'cuda') if device == 'ascend': # For Ascend, bf16 support check would be different @@ -248,12 +248,12 @@ def _is_bf16_supported_by_device(): else: # For CUDA and default, use the existing check return is_bf16_supported() - + def set_device_env_variable(worker_id, tp_num: int = 1): - """Set device environment variable based on the device type""" + """Set device environment variable based on the device type.""" device = os.environ.get('DEVICE', 'cuda') # Default to cuda - + if device == 'ascend': device_id = get_cuda_id_by_workerid(worker_id, tp_num) if device_id is not None: @@ -262,3 +262,13 @@ def set_device_env_variable(worker_id, tp_num: int = 1): cuda_id = get_cuda_id_by_workerid(worker_id, tp_num) if cuda_id is not None: os.environ['CUDA_VISIBLE_DEVICES'] = cuda_id + + +def unset_device_env_variable(): + device_type = os.environ.get('DEVICE', 'cuda') + if device_type == 'ascend': + if 'ASCEND_RT_VISIBLE_DEVICES' in os.environ: + del os.environ['ASCEND_RT_VISIBLE_DEVICES'] + else: + if 'CUDA_VISIBLE_DEVICES' in os.environ: + del os.environ['CUDA_VISIBLE_DEVICES'] diff --git a/autotest/utils/get_run_config.py b/autotest/utils/get_run_config.py index eb1b4c328d..6db2705214 100644 --- a/autotest/utils/get_run_config.py +++ b/autotest/utils/get_run_config.py @@ -1,6 +1,8 @@ +import os import random +import re +import subprocess from time import sleep -import os, subprocess, re import torch @@ -121,42 +123,42 @@ def close_pipeline(pipe): def _clear_device_cache(): - """Clear cache based on the current device type""" + """Clear cache based on the current device type.""" device = os.environ.get('DEVICE', 'cuda') handler = _get_device_handler(device) handler.clear_cache() def _get_device_handler(device): - """Get the appropriate device handler based on device type""" + """Get the appropriate device handler based on device type.""" handlers 
= { 'cuda': CudaDeviceHandler(), 'ascend': AscendDeviceHandler(), } - + # Return the specific handler if available, otherwise return default cuda handler return handlers.get(device, handlers['cuda']) class DeviceHandler: - """Base class for device handlers""" - + """Base class for device handlers.""" + def get_device_prefix(self, config, model): - """Get device-specific prefix for command execution""" + """Get device-specific prefix for command execution.""" return '' - + def clear_cache(self): - """Clear device-specific cache""" + """Clear device-specific cache.""" pass - + def get_available_devices(self): - """Get list of available devices""" + """Get list of available devices.""" return [] class CudaDeviceHandler(DeviceHandler): - """Handler for CUDA devices""" - + """Handler for CUDA devices.""" + def get_device_prefix(self, config, model): cuda_prefix = '' tp_num = get_tp_num(config, model) @@ -169,10 +171,10 @@ def get_device_prefix(self, config, model): cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(random.sample(available_cuda, tp_num)) self.clear_cache() return cuda_prefix - + def clear_cache(self): torch.cuda.empty_cache() - + def get_available_devices(self): devices = torch.cuda.device_count() available_cuda = [] @@ -186,8 +188,8 @@ def get_available_devices(self): class AscendDeviceHandler(DeviceHandler): - """Handler for Ascend devices""" - + """Handler for Ascend devices.""" + def get_device_prefix(self, config, model): ascend_prefix = '' tp_num = get_tp_num(config, model) @@ -195,29 +197,29 @@ def get_device_prefix(self, config, model): return ascend_prefix available_ascend = self.get_available_devices() if len(available_ascend) < tp_num: - raise RuntimeError("Not enough Ascend devices available") + raise RuntimeError('Not enough Ascend devices available') ascend_prefix = 'ASCEND_RT_VISIBLE_DEVICES=' + ','.join(random.sample(available_ascend, tp_num)) self.clear_cache() return ascend_prefix - + def clear_cache(self): try: import torch_npu torch_npu.npu.empty_cache() except ImportError: pass # torch_npu not available - + def get_available_devices(self): - """Get list of available Ascend devices by checking AICPU usage rate""" + """Get list of available Ascend devices by checking AICPU usage + rate.""" available_ascend = [] try: # Get the number of NPU devices - result = subprocess.run(['npu-smi', 'info', '-l'], - capture_output=True, text=True, timeout=10) + result = subprocess.run(['npu-smi', 'info', '-l'], capture_output=True, text=True, timeout=10) if result.returncode != 0: return available_ascend - + # Parse the output to get device count # Looking for lines like "Device Count : X" device_count = 0 @@ -227,15 +229,15 @@ def get_available_devices(self): if match: device_count = int(match.group(1)) break - + # Check each device's AICPU usage for i in range(device_count): try: - result = subprocess.run(['npu-smi', 'info', '-t', 'usages', '-i', str(i)], - capture_output=True, text=True, timeout=10) + result = subprocess.run( + ['npu-smi', 'info', '-t', 'usages', '-i', str(i)], capture_output=True, text=True, timeout=10) if result.returncode != 0: continue - + # Parse the output to get AICPU Usage Rate # Looking for lines like "Aicpu Usage Rate(%) : X" aicpu_usage = 100 # Default to 100% (busy) @@ -245,14 +247,14 @@ def get_available_devices(self): if match: aicpu_usage = int(match.group(1)) break - + # If AICPU usage is 0, consider the device available if aicpu_usage == 0: available_ascend.append(str(i)) except (subprocess.TimeoutExpired, subprocess.SubprocessError): 
continue - + except (subprocess.TimeoutExpired, subprocess.SubprocessError, FileNotFoundError): # npu-smi command not found or other error - pass - return available_ascend \ No newline at end of file + pass + return available_ascend diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index ab87ed0604..6841d36d0b 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -44,13 +44,14 @@ def run_pipeline_chat_test(config, if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') + env = os.environ.copy() with open(pipeline_chat_log, 'w') as f: cmd = f'python3 autotest/tools/pipeline/llm_case.py run_pipeline_chat_test {hf_path} autotest/prompt_case.yaml {tp} {backend_type} {is_smoke} {extra}' # noqa E501 f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8') + response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', env=env) output_text = response.stdout print(output_text) @@ -109,13 +110,14 @@ def run_pipeline_vl_chat_test(config, if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') + env = os.environ.copy() with open(pipeline_chat_log, 'w') as f: cmd = f'python3 autotest/tools/pipeline/mllm_case.py run_pipeline_mllm_test {hf_path} {resource_path} {tp} {backend_type} {is_smoke} {extra}' # noqa E501 f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8') + response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', env=env) output_text = response.stdout print(output_text) diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 04595829e2..02932e7e21 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -2,7 +2,6 @@ import subprocess from subprocess import PIPE -from lmdeploy.utils import is_bf16_supported from utils.config_utils import _is_bf16_supported_by_device @@ -31,11 +30,11 @@ def quantization(config, else: return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type - + # Add device option if specified in environment device = os.environ.get('DEVICE', '') - if device: - quantization_cmd += f' --device npu' + if device == 'ascend': + quantization_cmd += f' --device npu ' if cuda_prefix is not None: quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index 8f345efe9c..889cd3e158 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -1,12 +1,10 @@ import os from subprocess import PIPE, Popen -from utils.get_run_config import get_command_with_extra, get_model_name from utils.config_utils import _is_bf16_supported_by_device +from utils.get_run_config import get_command_with_extra, get_model_name from utils.rule_condition_assert import assert_result -from lmdeploy.utils import is_bf16_supported - TEMPLATE = 'autotest/template.json' @@ -31,12 +29,14 @@ def command_line_test(config, cmd += ' --model-format gptq' if case == 'base_testcase': cmd += ' 
--chat-template ' + TEMPLATE - + # Add device option if specified in environment device = os.environ.get('DEVICE', '') if device: - cmd += f' --device {device}' - + cmd += f' --device {device} ' + if device == 'ascend': + cmd += '--eager-mode ' + return command_test(config, [cmd], model_case, case, case_info, type == 'turbomind', worker_id=worker_id) @@ -74,12 +74,14 @@ def hf_command_line_test(config, if case == 'base_testcase': cmd += ' --chat-template ' + TEMPLATE - + # Add device option if specified in environment device = os.environ.get('DEVICE', '') if device: - cmd += f' --device {device}' - + cmd += f' --device {device} ' + if device == 'ascend': + cmd += '--eager-mode ' + return command_test(config, [cmd], model_case, '_'.join(['hf', type, case]), case_info, True) @@ -175,4 +177,4 @@ def extract_output(output: str, model: str): if len(output.split('[/INST]')) >= 2: return output.split('[/INST]')[1] - return output \ No newline at end of file + return output diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index a7922a0035..a3c3c99b77 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -7,13 +7,12 @@ import psutil from openai import OpenAI from pytest_assume.plugin import assume -from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid, _is_bf16_supported_by_device -from utils.get_run_config import get_command_with_extra +from utils.config_utils import _is_bf16_supported_by_device, get_cuda_prefix_by_workerid, get_workerid +from utils.get_run_config import get_command_with_extra from utils.restful_return_check import assert_chat_completions_batch_return from utils.rule_condition_assert import assert_result from lmdeploy.serve.openai.api_client import APIClient -from lmdeploy.utils import is_bf16_supported BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 @@ -60,10 +59,12 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) need_tp=True, cuda_prefix=cuda_prefix, extra=extra) - + device = os.environ.get('DEVICE', '') if device: - cmd += f' --device {device}' + cmd += f' --device {device} ' + if device == 'ascend': + cmd += '--eager-mode ' if backend_type == 'turbomind': if ('w4' in model or '4bits' in model or 'awq' in model.lower()): @@ -137,7 +138,6 @@ def stop_restful_api(pid, startRes, param): def run_all_step(config, cases_info, worker_id: str = '', port: int = DEFAULT_PORT): http_url = BASE_HTTP_URL + ':' + str(port) - model = get_model(http_url) if model is None: @@ -164,8 +164,8 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): result = True - client = OpenAI(api_key='YOUR_API_KEY', base_url=f'{url}/v1') - model_name = client.models.list().data[0].id + api_client = APIClient(url) + model_name = api_client.available_models[0] messages = [] msg = '' @@ -176,17 +176,18 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): messages.append({'role': 'user', 'content': prompt}) file.writelines('prompt:' + prompt + '\n') - response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, top_p=0.8) + for output in api_client.chat_completions_v1(model=model_name, messages=messages, top_k=1, max_tokens=256): + output_message = output.get('choices')[0].get('message') + messages.append(output_message) - output_content = response.choices[0].message.content - file.writelines('output:' + output_content + '\n') - messages.append({'role': 'assistant', 'content': output_content}) 
+ output_content = output_message.get('content') + file.writelines('output:' + output_content + '\n') - case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) - file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') - if not case_result: - msg += reason - result = result & case_result + case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) + file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') + if not case_result: + msg += reason + result = result & case_result file.close() return result, restful_log, msg @@ -457,9 +458,9 @@ def get_temperature_date(location: str, date: str, unit: str = 'celsius'): """Get temperature at a location and date. Args: - location: The location to get the temperature for, in the format 'City, State, Country'. - date: The date to get the temperature for, in the format 'Year-Month-Day'. - unit: The unit to return the temperature in. Defaults to 'celsius'. (choices: ['celsius', 'fahrenheit']) + location: The location to get the temperature for, in the format "City, State, Country". + date: The date to get the temperature for, in the format "Year-Month-Day". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) Returns: the temperature, the location, the date and the unit in a dict @@ -617,7 +618,7 @@ def run_tools_case(config, port: int = DEFAULT_PORT): }, } }] - messages = [{'role': 'user', 'content': 'What\'s the weather like in Boston today?'}] + messages = [{'role': 'user', 'content': "What's the weather like in Boston today?"}] response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, From fb57adaabcb97ff0ea8a5b28dd83b8ef3507fd32 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 11:14:59 +0800 Subject: [PATCH 03/32] AUTOTEST: add pipeline test timeout --- autotest/tools/pipeline/llm_case.py | 5 ++++- autotest/tools/pipeline/mllm_case.py | 6 ++++-- autotest/utils/pipeline_chat.py | 22 ++++++++++++++++++++-- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 74a00de128..9bfe91a8ef 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -94,7 +94,10 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, f'[caseresult {case} end]\n') # TODO fix for ascend - # pipe.close() + if device == 'ascend': + pass + else: + pipe.close() import gc import torch diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index e2829466fe..e48b443e9c 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -135,8 +135,10 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'qwen' in model_path.lower(): Qwen_vl_testcase(pipe, resource_path) - # TODO fix for ascend - # pipe.close() + if device == 'ascend': + pass + else: + pipe.close() import gc import torch diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 6841d36d0b..a59e84e137 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -51,7 +51,16 @@ def run_pipeline_chat_test(config, f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', 
env=env) + try: + response = subprocess.run([cmd], + shell=True, + capture_output=True, + text=True, + encoding='utf-8', + env=env, + timeout=600) + except subprocess.TimeoutExpired as e: + assert False, f'Test command timed out after 10 minutes: {e.cmd}' output_text = response.stdout print(output_text) @@ -117,7 +126,16 @@ def run_pipeline_vl_chat_test(config, f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) # quantization - response = subprocess.run([cmd], shell=True, capture_output=True, text=True, encoding='utf-8', env=env) + try: + response = subprocess.run([cmd], + shell=True, + capture_output=True, + text=True, + encoding='utf-8', + env=env, + timeout=600) + except subprocess.TimeoutExpired as e: + assert False, f'Test command timed out after 10 minutes: {e.cmd}' output_text = response.stdout print(output_text) From 4869f6400ccccc993a5081e268d493b12441c25b Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 11:24:58 +0800 Subject: [PATCH 04/32] AUTOTEST: fix lint flake8 --- autotest/interface/pipeline/test_pipeline_func.py | 1 - autotest/tools/pipeline/llm_case.py | 1 - autotest/tools/pipeline/mllm_case.py | 1 - autotest/utils/quantization_utils.py | 2 +- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 8c8b0d45c3..42f6e95d86 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -2,7 +2,6 @@ import pydantic import pytest -import torch from utils.config_utils import _is_bf16_supported_by_device, set_device_env_variable, unset_device_env_variable from utils.get_run_config import _clear_device_cache from utils.pipeline_chat import (assert_pipeline_batch_return, assert_pipeline_batch_stream_return, diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 9bfe91a8ef..0555dce6f2 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -100,7 +100,6 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, pipe.close() import gc - import torch gc.collect() _clear_device_cache() diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index e48b443e9c..5a649a1cca 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -141,7 +141,6 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te pipe.close() import gc - import torch gc.collect() _clear_device_cache() diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 02932e7e21..78e7d62d1e 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -34,7 +34,7 @@ def quantization(config, # Add device option if specified in environment device = os.environ.get('DEVICE', '') if device == 'ascend': - quantization_cmd += f' --device npu ' + quantization_cmd += ' --device npu ' if cuda_prefix is not None: quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) From d8f85d4981d21fc9e6faf17df285db57ea682b0c Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 29 Aug 2025 16:41:41 +0800 Subject: [PATCH 05/32] Create api_eva.yml --- .github/workflows/api_eva.yml | 187 ++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 .github/workflows/api_eva.yml diff --git 
a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml
new file mode 100644
index 0000000000..9c15c7b825
--- /dev/null
+++ b/.github/workflows/api_eva.yml
@@ -0,0 +1,187 @@
+name: api_evaluate
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set the branch, tag, or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      offline_mode:
+        required: true
+        description: 'Whether to start in offline mode; if true, you should prepare the code and whl package yourself'
+        type: boolean
+        default: false
+      regression_func:
+        required: true
+        description: 'Regression functions to run'
+        type: string
+        default: "['evaluation']"
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
+  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
+  DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL
+
+jobs:
+  linux-build:
+    if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda11.8
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
+
+
+  download_pkgs:
+    needs: linux-build
+    if: ${{!cancelled()}}
+    runs-on: [self-hosted, 140-test]
+    timeout-minutes: 50
+    container:
+      image: openmmlab/lmdeploy:latest
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Copy repository
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r .
${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: [self-hosted, 140-test] + timeout-minutes: 120 # 2hours + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + python3 -m pip install sentence_transformers==2.2.2 --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt + python3 -m pip install -e . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
+ - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir From 82188f7c9a4984a04ec3d7385691dad8303180e2 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 16:38:01 +0800 Subject: [PATCH 06/32] WORKFLOW: add ascend workflow --- .github/workflows/daily_ete_test_ascend.yml | 223 ++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 .github/workflows/daily_ete_test_ascend.yml diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml new file mode 100644 index 0000000000..214fa87390 --- /dev/null +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -0,0 +1,223 @@ +name: daily_ete_test + +on: + push: + branches: + - hw_runner + +env: + REPORT_DIR: /test/test-reports/${{ github.run_id }} + COV_PARAM: --cov /usr/local/python3.10.5/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /test/lmdeploy_hw + LOG_PATH: /test/log + OFFLINE_REQUIREMENTS: /test/lmdeploy_hw/requirements_ascend.txt + # Default values for former workflow_dispatch inputs + BACKEND: "['turbomind', 'pytorch']" + MODEL: "['llm','mllm']" + FUNCTION: '["pipeline", "restful", "chat"]' + OFFLINE_MODE: false + REGRESSION_FUNC: "['quant', 'pipeline', 'restful', 'chat']" + TMPDIR: /mnt/deeplink/docker-tmp + RAY_TMPDIR: /mnt/deeplink/docker-tmp + + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 150 + container: + image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest + options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + volumes: + - /usr/local/Ascend/driver:/usr/local/Ascend/driver + - 
/usr/local/sbin:/usr/local/sbin + - /var/log/npu/slog:/var/log/npu/slog + - /var/log/npu/profiling:/var/log/npu/profiling + - /var/log/npu/dump:/var/log/npu/dump + - /var/log/npu:/usr/slog + - /etc/hccn.conf:/etc/hccn.conf + - /root/qa_test:/test + - /mnt:/mnt + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + python3 -m pip install transformers==4.53.1 + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf ${{env.LOG_PATH}}/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: contains(fromJSON(env.BACKEND), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: contains(fromJSON(env.BACKEND), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + # - name: Clear workfile + # if: always() + # run: | + # chmod -R 777 $REPORT_DIR + # export workdir=$(pwd) + # cd .. + # rm -rf $workdir + # mkdir $workdir + # chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools')}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(env.BACKEND) }} + model: ${{ fromJSON(env.MODEL) }} + function: ${{ fromJSON(env.FUNCTION) }} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: local_case + container: + image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest + options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + volumes: + - /usr/local/Ascend/driver:/usr/local/Ascend/driver + - /usr/local/sbin:/usr/local/sbin + - /var/log/npu/slog:/var/log/npu/slog + - /var/log/npu/profiling:/var/log/npu/profiling + - /var/log/npu/dump:/var/log/npu/dump + - /var/log/npu:/usr/slog + - /etc/hccn.conf:/etc/hccn.conf + - /root/qa_test:/test + - /mnt:/mnt + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf ${{env.LOG_PATH}}/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py 
-m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + # - name: Clear workfile + # if: always() + # run: | + # chmod -R 777 $REPORT_DIR + # export workdir=$(pwd) + # cd .. + # rm -rf $workdir + # mkdir $workdir + # chmod -R 777 $workdir + + # get_coverage_report: + # if: ${{!cancelled()}} + # runs-on: [self-hosted, linux-a100] + # needs: [test_tools, test_restful, test_pipeline, test_benchmark] + # timeout-minutes: 5 + # container: + # image: openmmlab/lmdeploy:latest-cu11 + # options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + # volumes: + # - /nvme/github-actions/pip-cache:/root/.cache/pip + # - /nvme/github-actions/packages:/root/packages + # - /nvme/qa_test_models:/nvme/qa_test_models + # - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + # steps: + # - name: Copy repository and Artifacts + # run: cp -r ${{env.TEST_CODE_PATH}}/. . + # - name: Install lmdeploy + # run: | + # python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + # python3 -m pip install -r requirements/test.txt + # - name: Get coverage report + # run: | + # pip install coverage + # coverage combine ${{env.REPORT_DIR}} + # coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + # coverage report -m + # mv .coverage ${{env.REPORT_DIR}}/.coverage + # - name: Clear workfile + # if: always() + # run: | + # chmod -R 777 $REPORT_DIR + # export workdir=$(pwd) + # cd .. 
+ # rm -rf $workdir + # mkdir $workdir + # chmod -R 777 $workdir From ddaa36c35323a971e0707492693542a3855ea8f9 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:01:21 +0800 Subject: [PATCH 07/32] WORKFLOW: update ascend runner --- .github/workflows/daily_ete_test_ascend.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 214fa87390..f17b0bb225 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -21,11 +21,10 @@ env: TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp - +jobs: test_quantization: - needs: download_pkgs if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} - runs-on: [self-hosted, linux-a100] + runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest @@ -85,7 +84,7 @@ env: test_tools: if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools')}} - runs-on: [self-hosted, linux-a100] + runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 strategy: @@ -104,7 +103,6 @@ env: include: - backend: turbomind model: llm - function: local_case container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -187,7 +185,7 @@ env: # get_coverage_report: # if: ${{!cancelled()}} - # runs-on: [self-hosted, linux-a100] + # runs-on: [self-hosted, ascend-013] # needs: [test_tools, test_restful, test_pipeline, test_benchmark] # timeout-minutes: 5 # container: From 2cda270d8496c0964291a8ed08825b68abc35f03 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:10:46 +0800 Subject: [PATCH 08/32] fix yaml --- .github/workflows/daily_ete_test_ascend.yml | 152 +++++++------------- 1 file changed, 49 insertions(+), 103 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index f17b0bb225..f0a51690eb 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -13,17 +13,17 @@ env: LOG_PATH: /test/log OFFLINE_REQUIREMENTS: /test/lmdeploy_hw/requirements_ascend.txt # Default values for former workflow_dispatch inputs - BACKEND: "['turbomind', 'pytorch']" - MODEL: "['llm','mllm']" + BACKEND: '["turbomind", "pytorch"]' + MODEL: '["llm","mllm"]' FUNCTION: '["pipeline", "restful", "chat"]' OFFLINE_MODE: false - REGRESSION_FUNC: "['quant', 'pipeline', 'restful', 'chat']" + REGRESSION_FUNC: '["quant", "pipeline", "restful", "chat"]' TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp jobs: test_quantization: - if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} + if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: @@ -42,10 +42,10 @@ jobs: steps: - name: Copy repository and Artifacts run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
+ cp -r ${{ env.TEST_CODE_PATH }}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps @@ -57,33 +57,24 @@ jobs: lmdeploy check_env rm -rf allure-results # remove tmp log in testcase - rm -rf ${{env.LOG_PATH}}/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + rm -rf ${{ env.LOG_PATH }}/* + mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p + ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true - if: contains(fromJSON(env.BACKEND), 'turbomind') + if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true - if: contains(fromJSON(env.BACKEND), 'pytorch') + if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - # - name: Clear workfile - # if: always() - # run: | - # chmod -R 777 $REPORT_DIR - # export workdir=$(pwd) - # cd .. - # rm -rf $workdir - # mkdir $workdir - # chmod -R 777 $workdir + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') test_tools: - if: ${{!cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools')}} + if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 @@ -103,6 +94,7 @@ jobs: include: - backend: turbomind model: llm + function: chat container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -119,10 +111,10 @@ jobs: steps: - name: Copy repository and Artifacts run: | - cp -r ${{env.TEST_CODE_PATH}}/. . + cp -r ${{ env.TEST_CODE_PATH }}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps @@ -134,88 +126,42 @@ jobs: cp -r /root/lora . 
rm -rf allure-results # remove tmp log in testcase - rm -rf ${{env.LOG_PATH}}/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + rm -rf ${{ env.LOG_PATH }}/* + mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p + ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: matrix.function == 'pipeline' + if: ${{ matrix.function == 'pipeline' }} run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest 
autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: matrix.function == 'restful' + if: ${{ matrix.function == 'restful' }} run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - # - name: Clear workfile - # if: always() - # run: | - # chmod -R 777 $REPORT_DIR - # export workdir=$(pwd) - # cd .. - # rm -rf $workdir - # mkdir $workdir - # chmod -R 777 $workdir - - # get_coverage_report: - # if: ${{!cancelled()}} - # runs-on: [self-hosted, ascend-013] - # needs: [test_tools, test_restful, test_pipeline, test_benchmark] - # timeout-minutes: 5 - # container: - # image: openmmlab/lmdeploy:latest-cu11 - # options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - # volumes: - # - /nvme/github-actions/pip-cache:/root/.cache/pip - # - /nvme/github-actions/packages:/root/packages - # - /nvme/qa_test_models:/nvme/qa_test_models - # - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - # steps: - # - name: Copy repository and Artifacts - # run: cp -r ${{env.TEST_CODE_PATH}}/. . 
- # - name: Install lmdeploy - # run: | - # python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - # python3 -m pip install -r requirements/test.txt - # - name: Get coverage report - # run: | - # pip install coverage - # coverage combine ${{env.REPORT_DIR}} - # coverage xml -o ${{env.REPORT_DIR}}/coverage.xml - # coverage report -m - # mv .coverage ${{env.REPORT_DIR}}/.coverage - # - name: Clear workfile - # if: always() - # run: | - # chmod -R 777 $REPORT_DIR - # export workdir=$(pwd) - # cd .. - # rm -rf $workdir - # mkdir $workdir - # chmod -R 777 $workdir + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') \ No newline at end of file From 755cee605c0d2d3d36e4916b209f5daaaeb057b5 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:31:35 +0800 Subject: [PATCH 09/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index f0a51690eb..248d008130 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -62,39 +62,39 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true - if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} + # if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} run: | pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true - if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} + # if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} run: | pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') test_tools: - if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} + # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 strategy: fail-fast: false - matrix: - backend: ${{ fromJSON(env.BACKEND) }} - model: ${{ fromJSON(env.MODEL) }} - function: ${{ fromJSON(env.FUNCTION) }} - exclude: - - 
backend: turbomind - model: mllm - function: chat - - backend: pytorch - model: mllm - function: chat - include: - - backend: turbomind - model: llm - function: chat + # matrix: + # backend: ${{ fromJSON(env.BACKEND) }} + # model: ${{ fromJSON(env.MODEL) }} + # function: ${{ fromJSON(env.FUNCTION) }} + # exclude: + # - backend: turbomind + # model: mllm + # function: chat + # - backend: pytorch + # model: mllm + # function: chat + # include: + # - backend: turbomind + # model: llm + # function: chat container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -131,7 +131,7 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true - if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} + # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -143,7 +143,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: ${{ matrix.function == 'pipeline' }} + # if: ${{ matrix.function == 'pipeline' }} run: | pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -155,7 +155,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: ${{ matrix.function == 'restful' }} + # if: ${{ matrix.function == 'restful' }} run: | pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true From 28f4df644d9e07328c6b3ec4c42c8a5053ecbbaf Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 17:32:36 +0800 Subject: [PATCH 10/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 248d008130..0321c935bc 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -23,7 +23,7 @@ env: jobs: test_quantization: - if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} + # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: From 0ef80d682009c8455b29c89b5b91834f56424aaa Mon Sep 17 00:00:00 2001 From: littlegy 
<787321726@qq.com> Date: Thu, 4 Sep 2025 17:42:19 +0800 Subject: [PATCH 11/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 0321c935bc..1ced07212d 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -23,6 +23,7 @@ env: jobs: test_quantization: + if: ${{ !cancelled() }} # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 @@ -62,18 +63,21 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} run: | pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} run: | pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') test_tools: + if: ${{ !cancelled() }} # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} runs-on: [self-hosted, ascend-013] needs: test_quantization @@ -131,6 +135,7 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true @@ -143,6 +148,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ matrix.function == 'pipeline' }} run: | pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true @@ -155,6 +161,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true + if: ${{ !cancelled() }} # if: ${{ matrix.function == 'restful' }} run: | pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true From c6c618312a68b0e5df11d775a1f7b9bd896613a8 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 18:52:17 +0800 Subject: [PATCH 12/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 1ced07212d..5be7e0baf2 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -29,15 +29,28 @@ jobs: timeout-minutes: 150 container: image: 
crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + options: >- + --net=host + --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 + --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 + --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc + -e PIP_CACHE_DIR=/root/.cache/pip + --shm-size=150g + --pull missing + --memory=256g + --cpus=48 + --security-opt=no-new-privileges:true + --health-cmd="npu-smi info" + --health-interval=30s + --restart=on-failure:5 volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver - - /usr/local/sbin:/usr/local/sbin + - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro + - /usr/local/sbin:/usr/local/sbin:ro - /var/log/npu/slog:/var/log/npu/slog - /var/log/npu/profiling:/var/log/npu/profiling - /var/log/npu/dump:/var/log/npu/dump - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf + - /etc/hccn.conf:/etc/hccn.conf:ro - /root/qa_test:/test - /mnt:/mnt steps: From 39b6dc3d997a9b47b909bbdf7853ee2d72c075b6 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 19:14:46 +0800 Subject: [PATCH 13/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 5be7e0baf2..978dccb503 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -29,20 +29,7 @@ jobs: timeout-minutes: 150 container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: >- - --net=host - --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 - --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 - --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc - -e PIP_CACHE_DIR=/root/.cache/pip - --shm-size=150g - --pull missing - --memory=256g - --cpus=48 - --security-opt=no-new-privileges:true - --health-cmd="npu-smi info" - --health-interval=30s - --restart=on-failure:5 + options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" volumes: - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - /usr/local/sbin:/usr/local/sbin:ro From 88c84614eec350d614a63ca82e8d4a32150d531d Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 4 Sep 2025 19:16:50 +0800 Subject: [PATCH 14/32] fix yaml ii --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 978dccb503..c079364511 100644 --- 
a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -29,7 +29,7 @@ jobs: timeout-minutes: 150 container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" volumes: - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - /usr/local/sbin:/usr/local/sbin:ro @@ -101,7 +101,7 @@ jobs: # function: chat container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--net=host --privileged=true --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" volumes: - /usr/local/Ascend/driver:/usr/local/Ascend/driver - /usr/local/sbin:/usr/local/sbin From 82cec68c02207e33c37eff0df0993b15df35a361 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 5 Sep 2025 10:23:02 +0800 Subject: [PATCH 15/32] update ascend --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index c079364511..dc262bdfbb 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -49,7 +49,7 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 - name: Check env @@ -121,7 +121,7 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | From 49b3d59670dc4de773f7715b78f03d0719ed9aee Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 5 Sep 2025 10:35:08 +0800 Subject: [PATCH 16/32] update ascend --- .github/workflows/daily_ete_test_ascend.yml | 2 -- 1 file changed, 2 
deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index dc262bdfbb..b5e9886725 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -49,7 +49,6 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 - name: Check env @@ -121,7 +120,6 @@ jobs: python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - name: Install lmdeploy run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | From 4b591d4d73b04788a3d724bb98afd1ec44f88e7e Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 5 Sep 2025 10:56:18 +0800 Subject: [PATCH 17/32] update ascend --- .github/workflows/daily_ete_test_ascend.yml | 24 ++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index b5e9886725..cab1cb4e3b 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -136,37 +136,37 @@ jobs: if: ${{ !cancelled() }} # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true if: ${{ !cancelled() }} # if: ${{ matrix.function == 'pipeline' }} run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' 
-n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true if: ${{ !cancelled() }} # if: ${{ matrix.function == 'restful' }} run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest 
autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') \ No newline at end of file From ee49e02cfe87bb8f8472d26dd3301d242091b4fc Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 14:34:06 +0800 Subject: [PATCH 18/32] AUTOTEST: update hw yml --- .github/workflows/daily_ete_test_ascend.yml | 161 ++++++++++++++------ 1 file changed, 113 insertions(+), 48 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index cab1cb4e3b..b977c740ca 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -1,30 +1,83 @@ -name: daily_ete_test +name: daily_ete_test_ascend on: - push: - branches: - - hw_runner + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, vllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" env: REPORT_DIR: /test/test-reports/${{ github.run_id }} COV_PARAM: --cov /usr/local/python3.10.5/lib/python3.10/site-packages/lmdeploy FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /test/lmdeploy_hw + TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} LOG_PATH: /test/log - OFFLINE_REQUIREMENTS: /test/lmdeploy_hw/requirements_ascend.txt - # Default values for former workflow_dispatch inputs - BACKEND: '["turbomind", "pytorch"]' - MODEL: '["llm","mllm"]' - FUNCTION: '["pipeline", "restful", "chat"]' - OFFLINE_MODE: false - REGRESSION_FUNC: '["quant", "pipeline", "restful", "chat"]' TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp jobs: + download_pkgs: + if: ${{!cancelled()}} + runs-on: [self-hosted, ascend-013] + timeout-minutes: 50 + container: + image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest + options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" + volumes: + - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro + - 
/usr/local/sbin:/usr/local/sbin:ro + - /var/log/npu/slog:/var/log/npu/slog + - /var/log/npu/profiling:/var/log/npu/profiling + - /var/log/npu/dump:/var/log/npu/dump + - /var/log/npu:/usr/slog + - /etc/hccn.conf:/etc/hccn.conf:ro + - /root/qa_test:/test + - /mnt:/mnt + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{ !cancelled() }} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{ !cancelled() }} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} test_quantization: if: ${{ !cancelled() }} - # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'quant') }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 container: @@ -43,11 +96,11 @@ jobs: steps: - name: Copy repository and Artifacts run: | - cp -r ${{ env.TEST_CODE_PATH }}/. . + cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - - name: Install lmdeploy + python3 -m pip install -r requirements_ascend.txt + - name: Install lmdeploy - offline run: | python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 @@ -62,42 +115,48 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ contains(fromJSON(env.BACKEND), 'turbomind') }} + if: contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{ env.REPORT_DIR }} --clean-alluredir ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ contains(fromJSON(env.BACKEND), 'pytorch') }} + if: contains(fromJSON(github.event.inputs.backend), 'pytorch') run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir test_tools: - if: ${{ !cancelled() }} - # if: ${{ !cancelled() && contains(fromJSON(env.REGRESSION_FUNC), 'tools') }} + if: ${{!cancelled() && contains(fromJSON(github.event.inputs.regression_func), 'tools')}} runs-on: [self-hosted, ascend-013] needs: test_quantization timeout-minutes: 300 strategy: fail-fast: false - # matrix: - # backend: ${{ fromJSON(env.BACKEND) }} - # model: ${{ fromJSON(env.MODEL) }} - # function: ${{ fromJSON(env.FUNCTION) }} - # exclude: - # - backend: turbomind - # model: mllm - # function: chat - # - backend: pytorch - # model: mllm - # function: chat - # include: - # - backend: turbomind - # model: llm - # function: chat + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: local_case container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -133,8 +192,7 @@ jobs: ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - name: Test lmdeploy - chat continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} + if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -146,8 +204,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ matrix.function == 'pipeline' }} + if: ${{ matrix.function == 'pipeline' }} run: | pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -159,8 +216,7 @@ jobs: mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: ${{ !cancelled() }} - # if: ${{ matrix.function == 'restful' }} + if: ${{ matrix.function == 'restful' }} run: | pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -169,4 +225,13 @@ jobs: pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend 
--alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') \ No newline at end of file + mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir From 85bac1ec85aac0577f2c2ceeaf5973ce5f2797d9 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 14:39:36 +0800 Subject: [PATCH 19/32] AUTOTEST: fix hw yml --- .github/workflows/daily_ete_test_ascend.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index b977c740ca..5a7a637acd 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -28,16 +28,11 @@ on: description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false regression_func: required: true description: 'regression functions' type: string - default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" + default: "['tools']" env: REPORT_DIR: /test/test-reports/${{ github.run_id }} @@ -77,6 +72,7 @@ jobs: if: ${{ !cancelled() }} run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . 
${{env.TEST_CODE_PATH}} test_quantization: + needs: download_pkgs if: ${{ !cancelled() }} runs-on: [self-hosted, ascend-013] timeout-minutes: 150 From 06568e2fb31ac6a76300bec38dd7fddbcf898afb Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 14:43:58 +0800 Subject: [PATCH 20/32] AUTOTEST: add ascend device --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 5a7a637acd..9e28735df6 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -113,13 +113,13 @@ jobs: continue-on-error: true if: contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true if: contains(fromJSON(github.event.inputs.backend), 'pytorch') run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() From c1ae0b1fbacd3c8661f611aead5598cf4de13bde Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 15:27:18 +0800 Subject: [PATCH 21/32] CI: fix yml --- .github/workflows/daily_ete_test_ascend.yml | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 9e28735df6..26f6a01551 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -28,6 +28,11 @@ on: description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false regression_func: required: true description: 'regression functions' @@ -93,13 +98,14 @@ jobs: - name: Copy repository and Artifacts run: | cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r requirements_ascend.txt - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} run: | - python3 -m pip install -r requirements/test.txt python3 -m pip install transformers==4.53.1 + python3 -m pip install -r requirements_ascend.txt + - name: Install lmdeploy - test + run: | + python3 -m pip install -r requirements/test.txt - name: Check env run: | python3 -m pip list @@ -170,17 +176,18 @@ jobs: - name: Copy repository and Artifacts run: | cp -r ${{ env.TEST_CODE_PATH }}/. . 
- - name: Install lmdeploy - dependency + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} run: | - python3 -m pip install -r ${{ env.OFFLINE_REQUIREMENTS }} - - name: Install lmdeploy + python3 -m pip install transformers==4.53.1 + python3 -m pip install -r requirements_ascend.txt + - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt + python3 -m pip install -r requirements/test.txt - name: Check env run: | python3 -m pip list lmdeploy check_env - cp -r /root/lora . rm -rf allure-results # remove tmp log in testcase rm -rf ${{ env.LOG_PATH }}/* From 55b2f76ca52f11d26ef43d27ba6a33b6cb6b80e1 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 15:59:42 +0800 Subject: [PATCH 22/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 26f6a01551..534c1e6ec5 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -66,6 +66,7 @@ jobs: - /etc/hccn.conf:/etc/hccn.conf:ro - /root/qa_test:/test - /mnt:/mnt + - /root/.cache/pip:/root/.cache/pip steps: - name: Clone repository uses: actions/checkout@v2 @@ -94,6 +95,7 @@ jobs: - /etc/hccn.conf:/etc/hccn.conf:ro - /root/qa_test:/test - /mnt:/mnt + - /root/.cache/pip:/root/.cache/pip steps: - name: Copy repository and Artifacts run: | @@ -172,6 +174,7 @@ jobs: - /etc/hccn.conf:/etc/hccn.conf - /root/qa_test:/test - /mnt:/mnt + - /root/.cache/pip:/root/.cache/pip steps: - name: Copy repository and Artifacts run: | From b012c5a375f8084c327e1148867291d372e2888b Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 16:06:13 +0800 Subject: [PATCH 23/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 534c1e6ec5..ebc734dcad 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -107,7 +107,7 @@ jobs: python3 -m pip install -r requirements_ascend.txt - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt + python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Check env run: | python3 -m pip list @@ -186,7 +186,7 @@ jobs: python3 -m pip install -r requirements_ascend.txt - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt + python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Check env run: | python3 -m pip list From 43e66662819d6424f7715a8676bdf6a498e7b321 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 16:20:08 +0800 Subject: [PATCH 24/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index ebc734dcad..4f060abcc0 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -28,11 +28,11 @@ on: description: 'Set testcase function filter: chat, restful, pipeline. 
Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false + # offline_mode: + # required: true + # description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + # type: boolean + # default: false regression_func: required: true description: 'regression functions' @@ -101,10 +101,9 @@ jobs: run: | cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} + # if: ${{inputs.offline_mode}} run: | - python3 -m pip install transformers==4.53.1 - python3 -m pip install -r requirements_ascend.txt + python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Install lmdeploy - test run: | python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ @@ -180,10 +179,9 @@ jobs: run: | cp -r ${{ env.TEST_CODE_PATH }}/. . - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} + # if: ${{inputs.offline_mode}} run: | - python3 -m pip install transformers==4.53.1 - python3 -m pip install -r requirements_ascend.txt + python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Install lmdeploy - test run: | python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ From b3dcf405c7e49e4344ce584359557db519177dfa Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Wed, 10 Sep 2025 16:35:31 +0800 Subject: [PATCH 25/32] CI: add pip cache --- .github/workflows/daily_ete_test_ascend.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 4f060abcc0..181221d3ec 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -47,6 +47,7 @@ env: LOG_PATH: /test/log TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp + HF_ENDPOINT: https://hf-mirror.com jobs: download_pkgs: From dd8bac76c1bde5117d8d29a6d2befe84e402b932 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 16:48:43 +0800 Subject: [PATCH 26/32] TEST: update ascend test --- .github/workflows/daily_ete_test_ascend.yml | 109 +++--------------- autotest/config-ascend.yaml | 6 +- .../chat/test_command_chat_hf_pytorch.py | 3 + autotest/tools/pipeline/llm_case.py | 6 +- .../test_pipeline_chat_pytorch_llm.py | 3 + .../test_pipeline_chat_pytorch_mllm.py | 3 + .../quantization/test_quantization_awq.py | 1 - .../quantization/test_quantization_w8a8.py | 1 - .../test_restful_chat_hf_pytorch_llm.py | 3 + .../test_restful_chat_hf_pytorch_mllm.py | 3 + 10 files changed, 38 insertions(+), 100 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 181221d3ec..24be9c9452 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -17,22 +17,17 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string - default: "['turbomind', 'pytorch']" + default: "['pytorch']" model: required: true description: 'Set testcase module filter: llm, vllm. 
Default contains all models' type: string - default: "['llm','mllm']" + default: "['llm']" function: required: true description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' - # offline_mode: - # required: true - # description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - # type: boolean - # default: false regression_func: required: true description: 'regression functions' @@ -41,13 +36,12 @@ on: env: REPORT_DIR: /test/test-reports/${{ github.run_id }} - COV_PARAM: --cov /usr/local/python3.10.5/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /usr/local/python3.10.17/lib/python3.10/site-packages/lmdeploy FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} - LOG_PATH: /test/log + LOG_PATH: /test/log/${{ github.run_id }} TMPDIR: /mnt/deeplink/docker-tmp RAY_TMPDIR: /mnt/deeplink/docker-tmp - HF_ENDPOINT: https://hf-mirror.com jobs: download_pkgs: @@ -78,71 +72,11 @@ jobs: - name: Copy repository if: ${{ !cancelled() }} run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - test_quantization: - needs: download_pkgs - if: ${{ !cancelled() }} - runs-on: [self-hosted, ascend-013] - timeout-minutes: 150 - container: - image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" - volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - - /usr/local/sbin:/usr/local/sbin:ro - - /var/log/npu/slog:/var/log/npu/slog - - /var/log/npu/profiling:/var/log/npu/profiling - - /var/log/npu/dump:/var/log/npu/dump - - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf:ro - - /root/qa_test:/test - - /mnt:/mnt - - /root/.cache/pip:/root/.cache/pip - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
- - name: Install lmdeploy - offline - # if: ${{inputs.offline_mode}} - run: | - python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Install lmdeploy - test - run: | - python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf ${{ env.LOG_PATH }}/* - mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p - ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - - name: Test lmdeploy - quantization w4a16 - continue-on-error: true - if: contains(fromJSON(github.event.inputs.backend), 'turbomind') - run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - quantization w8a8 - continue-on-error: true - if: contains(fromJSON(github.event.inputs.backend), 'pytorch') - run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --device ascend --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir test_tools: if: ${{!cancelled() && contains(fromJSON(github.event.inputs.regression_func), 'tools')}} runs-on: [self-hosted, ascend-013] - needs: test_quantization + needs: download_pkgs timeout-minutes: 300 strategy: fail-fast: false @@ -157,10 +91,6 @@ jobs: - backend: pytorch model: mllm function: chat - include: - - backend: turbomind - model: llm - function: local_case container: image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" @@ -180,12 +110,11 @@ jobs: run: | cp -r ${{ env.TEST_CODE_PATH }}/. . 
- name: Install lmdeploy - offline - # if: ${{inputs.offline_mode}} run: | - python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ + python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Install lmdeploy - test run: | - python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ + python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - name: Check env run: | python3 -m pip list @@ -199,37 +128,37 @@ jobs: continue-on-error: true if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} run: | - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true if: ${{ matrix.function == 'pipeline' }} run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv 
.coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true if: ${{ matrix.function == 'restful' }} run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and not pr_test' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and not pr_test' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and not pr_test' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and not pr_test' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml index bc70824420..1b0588387f 100644 --- a/autotest/config-ascend.yaml +++ b/autotest/config-ascend.yaml @@ -7,7 +7,7 @@ dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split env_tag: a100 tp_config: - Qwen3-0.6B: 2 + Qwen2.5-32B-Instruct: 2 turbomind_chat_model: @@ -43,9 +43,9 @@ 
turbomind_quatization: pytorch_quatization: awq: - - /Qwen3-0.6B + - meta-llama/Meta-Llama-3-8B-Instruct w8a8: - - /Qwen3-0.6B + - meta-llama/Meta-Llama-3-8B-Instruct no_kvint4: - /Qwen3-0.6B no_kvint8: diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 4aaddd580c..5dbcb6256a 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -11,6 +11,7 @@ @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1)) def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -52,6 +53,7 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4)) def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -71,6 +73,7 @@ def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_8 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=8)) def test_hf_pytorch_chat_tp8(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 0555dce6f2..14285f3c91 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -93,11 +93,7 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, print(f'[caseresult {case} start]' + json.dumps(response_list, ensure_ascii=False) + f'[caseresult {case} end]\n') - # TODO fix for ascend - if device == 'ascend': - pass - else: - pipe.close() + pipe.close() import gc gc.collect() diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index 3738056c56..b9a6939675 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -10,6 +10,7 @@ @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -36,6 +37,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, exclude_dup=True)) def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id): @@ -49,6 +51,7 @@ def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_8 +@pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=8, exclude_dup=True)) def test_pipeline_chat_pytorch_tp8(config, common_case_config, model, worker_id): diff --git 
a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 65948209cd..2902deeb65 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -13,6 +13,7 @@ @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model')) def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: @@ -24,6 +25,7 @@ def test_pipeline_chat_tp1(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=2, model_type='vl_model')) def test_pipeline_chat_tp2(config, model, worker_id): if 'gw' in worker_id: @@ -36,6 +38,7 @@ def test_pipeline_chat_tp2(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('model', get_torch_model_list(tp_num=4, model_type='vl_model')) def test_pipeline_chat_tp4(config, model, worker_id): if 'gw' in worker_id: diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index afa31d402b..7552e6e2aa 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -8,7 +8,6 @@ @pytest.mark.order(3) @pytest.mark.test_3090 -@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): diff --git a/autotest/tools/quantization/test_quantization_w8a8.py b/autotest/tools/quantization/test_quantization_w8a8.py index 9ddc454ae6..d210acdf1b 100644 --- a/autotest/tools/quantization/test_quantization_w8a8.py +++ b/autotest/tools/quantization/test_quantization_w8a8.py @@ -8,7 +8,6 @@ @pytest.mark.order(2) @pytest.mark.quantization_w8a8 -@pytest.mark.test_ascend @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('w8a8')) def test_quantization_w8a8(config, model, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index 57ac524912..6c48007565 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -29,6 +29,7 @@ def getModelList(tp_num): @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -54,6 +55,7 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) def test_restful_chat_tp4(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -66,6 +68,7 @@ def test_restful_chat_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_8 +@pytest.mark.test_ascend 
@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) def test_restful_chat_tp8(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py index 82d7a7bf7a..63c700d7aa 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -29,6 +29,7 @@ def getModelList(tp_num): @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 @pytest.mark.test_3090 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -40,6 +41,7 @@ def test_restful_chat_tp1(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_2 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) def test_restful_chat_tp2(config, worker_id): if get_workerid(worker_id) is None: @@ -51,6 +53,7 @@ def test_restful_chat_tp2(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_4 +@pytest.mark.test_ascend @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) def test_restful_chat_tp4(config, worker_id): if get_workerid(worker_id) is None: From 9bda18536cff3e50ee44e4bab1849180a3990faa Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 16:49:52 +0800 Subject: [PATCH 27/32] TEST: rm api eval --- .github/workflows/api_eva.yml | 187 ---------------------------------- 1 file changed, 187 deletions(-) delete mode 100644 .github/workflows/api_eva.yml diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml deleted file mode 100644 index 9c15c7b825..0000000000 --- a/.github/workflows/api_eva.yml +++ /dev/null @@ -1,187 +0,0 @@ -name: api_evalate - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. 
Default is "main"' - type: string - default: 'main' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false - regression_func: - required: true - description: 'regression functions' - type: string - default: "['evaluation']" - -env: - HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }} - OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL - -jobs: - linux-build: - if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda11.8 - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - - download_pkgs: - needs: linux-build - if: ${{!cancelled()}} - runs-on: [self-hosted, 140-test] - timeout-minutes: 50 - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/qa_test_models:/nvme/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Copy Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - name: Copy Artifacts - offline - if: ${{inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - - - test_evaluation: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} - runs-on: [self-hosted, 140-test] - timeout-minutes: 120 # 2hours - strategy: - fail-fast: false - matrix: - evaluate_type: ['chat', 'base'] - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared - - /mnt/bigdisk:/mnt/bigdisk - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install sentence_transformers==2.2.2 --no-deps - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - - name: Install opencompass - run: | - git clone --depth=1 https://github.com/open-compass/opencompass.git - cd opencompass - cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt - python3 -m pip install -e . - echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Setup paths for evaluation - run: | - ln -s /root/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
- - name: Evaluate models - if: matrix.evaluate_type == 'chat' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - - name: Evaluate base models - if: matrix.evaluate_type == 'base' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true - - name: Clear workspace - if: always() - run: | - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir From 9ce68649bdaf9531c5ac57ff56f4b7fe0ffc7ca1 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 17:10:40 +0800 Subject: [PATCH 28/32] TEST: update chat test --- autotest/utils/run_restful_chat.py | 31 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index a3c3c99b77..04033f2482 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -164,8 +164,8 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): result = True - api_client = APIClient(url) - model_name = api_client.available_models[0] + client = OpenAI(api_key='YOUR_API_KEY', base_url=f'{url}/v1') + model_name = client.models.list().data[0].id messages = [] msg = '' @@ -176,18 +176,17 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): messages.append({'role': 'user', 'content': prompt}) file.writelines('prompt:' + prompt + '\n') - for output in api_client.chat_completions_v1(model=model_name, messages=messages, top_k=1, max_tokens=256): - output_message = output.get('choices')[0].get('message') - messages.append(output_message) + response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, top_p=0.8) - output_content = output_message.get('content') - file.writelines('output:' + output_content + '\n') + output_content = response.choices[0].message.content + file.writelines('output:' + output_content + '\n') + messages.append({'role': 'assistant', 'content': output_content}) - case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) - file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') - if not case_result: - msg += reason - result = result & case_result + case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) + file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') + if not case_result: + msg += reason + result = result & case_result file.close() 
return result, restful_log, msg @@ -458,9 +457,9 @@ def get_temperature_date(location: str, date: str, unit: str = 'celsius'): """Get temperature at a location and date. Args: - location: The location to get the temperature for, in the format "City, State, Country". - date: The date to get the temperature for, in the format "Year-Month-Day". - unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + location: The location to get the temperature for, in the format 'City, State, Country'. + date: The date to get the temperature for, in the format 'Year-Month-Day'. + unit: The unit to return the temperature in. Defaults to 'celsius'. (choices: ['celsius', 'fahrenheit']) Returns: the temperature, the location, the date and the unit in a dict @@ -618,7 +617,7 @@ def run_tools_case(config, port: int = DEFAULT_PORT): }, } }] - messages = [{'role': 'user', 'content': "What's the weather like in Boston today?"}] + messages = [{'role': 'user', 'content': 'What\'s the weather like in Boston today?'}] response = client.chat.completions.create(model=model_name, messages=messages, temperature=0.01, From 4e62a33ccf1a3f3b7e70391907b63c60439d512a Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 11 Sep 2025 17:23:16 +0800 Subject: [PATCH 29/32] TEST: fix tmp dir --- .github/workflows/daily_ete_test_ascend.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml index 24be9c9452..0e4f1c30c0 100644 --- a/.github/workflows/daily_ete_test_ascend.yml +++ b/.github/workflows/daily_ete_test_ascend.yml @@ -40,8 +40,8 @@ env: FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} LOG_PATH: /test/log/${{ github.run_id }} - TMPDIR: /mnt/deeplink/docker-tmp - RAY_TMPDIR: /mnt/deeplink/docker-tmp + TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp + RAY_TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp/ray jobs: download_pkgs: From 858cb5aff2755116024a9958fa2adbcbde93791f Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 12 Sep 2025 15:21:50 +0800 Subject: [PATCH 30/32] TEST: fix lint --- autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 19c12e4e79..44ded4473f 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -180,7 +180,6 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, communicator, is_smoke=True) - @pytest.mark.order(6) @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @@ -201,7 +200,6 @@ def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, is_smoke=True) - @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 From a78654354420961815b9191e310752fcbf74ae28 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Mon, 15 Sep 2025 17:46:20 +0800 Subject: [PATCH 31/32] TEST: update ascend config --- autotest/config-ascend.yaml | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml index 1b0588387f..d30c880463 100644 --- a/autotest/config-ascend.yaml +++ b/autotest/config-ascend.yaml @@ -9,38 
+9,15 @@ env_tag: a100 tp_config: Qwen2.5-32B-Instruct: 2 - -turbomind_chat_model: - - /Qwen3-0.6B - - pytorch_chat_model: - /Qwen3-0.6B -turbomind_vl_model: - - /Qwen3-0.6B - pytorch_vl_model: - /Qwen3-0.6B - -turbomind_base_model: - - /Qwen3-0.6B - pytorch_base_model: - /Qwen3-0.6B -turbomind_quatization: - no_awq: - - /Qwen3-0.6B - - gptq: - - /Qwen3-0.6B - no_kvint4: - - /Qwen3-0.6B - no_kvint8: - - /Qwen3-0.6B - pytorch_quatization: awq: - meta-llama/Meta-Llama-3-8B-Instruct From 5e06c9f0c3e9fe6035e5bf8f607dd9ea27acd8e1 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Tue, 16 Sep 2025 18:26:19 +0800 Subject: [PATCH 32/32] TEST: rm ascend config --- .github/workflows/daily_ete_test_ascend.yml | 171 -------------------- autotest/config-ascend.yaml | 35 ---- 2 files changed, 206 deletions(-) delete mode 100644 .github/workflows/daily_ete_test_ascend.yml delete mode 100644 autotest/config-ascend.yaml diff --git a/.github/workflows/daily_ete_test_ascend.yml b/.github/workflows/daily_ete_test_ascend.yml deleted file mode 100644 index 0e4f1c30c0..0000000000 --- a/.github/workflows/daily_ete_test_ascend.yml +++ /dev/null @@ -1,171 +0,0 @@ -name: daily_ete_test_ascend - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - backend: - required: true - description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' - type: string - default: "['pytorch']" - model: - required: true - description: 'Set testcase module filter: llm, vllm. Default contains all models' - type: string - default: "['llm']" - function: - required: true - description: 'Set testcase function filter: chat, restful, pipeline. 
Default contains all functions' - type: string - default: '["pipeline", "restful", "chat"]' - regression_func: - required: true - description: 'regression functions' - type: string - default: "['tools']" - -env: - REPORT_DIR: /test/test-reports/${{ github.run_id }} - COV_PARAM: --cov /usr/local/python3.10.17/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'push' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /test/test_pkg/lmdeploy/${{ github.run_id }} - LOG_PATH: /test/log/${{ github.run_id }} - TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp - RAY_TMPDIR: /mnt/deeplink/docker-tmp/qa_tmp/ray - -jobs: - download_pkgs: - if: ${{!cancelled()}} - runs-on: [self-hosted, ascend-013] - timeout-minutes: 50 - container: - image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" - volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - - /usr/local/sbin:/usr/local/sbin:ro - - /var/log/npu/slog:/var/log/npu/slog - - /var/log/npu/profiling:/var/log/npu/profiling - - /var/log/npu/dump:/var/log/npu/dump - - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf:ro - - /root/qa_test:/test - - /mnt:/mnt - - /root/.cache/pip:/root/.cache/pip - steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{ !cancelled() }} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - if: ${{ !cancelled() }} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - - test_tools: - if: ${{!cancelled() && contains(fromJSON(github.event.inputs.regression_func), 'tools')}} - runs-on: [self-hosted, ascend-013] - needs: download_pkgs - timeout-minutes: 300 - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} - function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} - exclude: - - backend: turbomind - model: mllm - function: chat - - backend: pytorch - model: mllm - function: chat - container: - image: crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:910b-latest - options: "--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -e PIP_CACHE_DIR=/root/.cache/pip --shm-size=150g --pull never" - volumes: - - /usr/local/Ascend/driver:/usr/local/Ascend/driver - - /usr/local/sbin:/usr/local/sbin - - /var/log/npu/slog:/var/log/npu/slog - - /var/log/npu/profiling:/var/log/npu/profiling - - /var/log/npu/dump:/var/log/npu/dump - - /var/log/npu:/usr/slog - - /etc/hccn.conf:/etc/hccn.conf - - /root/qa_test:/test - - /mnt:/mnt - - /root/.cache/pip:/root/.cache/pip - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{ env.TEST_CODE_PATH }}/. . 
- - name: Install lmdeploy - offline - run: | - python3 -m pip install -r requirements_ascend.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Install lmdeploy - test - run: | - python3 -m pip install -r requirements/test.txt -i https://mirrors.aliyun.com/pypi/simple/ - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf ${{ env.LOG_PATH }}/* - mkdir ${{ env.REPORT_DIR }}/.pytest_cache -p - ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest - - name: Test lmdeploy - chat - continue-on-error: true - if: ${{ (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' }} - run: | - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{ matrix.backend }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline - continue-on-error: true - if: ${{ matrix.function == 'pipeline' }} - run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful - continue-on-error: true - if: ${{ matrix.function == 'restful' }} - run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_1 and test_ascend' -n 8 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_2 and 
test_ascend' -n 4 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_4 and test_ascend' -n 2 --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{ matrix.backend }}_${{ matrix.model }}.py -m 'gpu_num_8 and test_ascend' --device ascend --alluredir=${{ env.REPORT_DIR }} ${{ env.COV_PARAM }} || true - mv .coverage ${{ env.REPORT_DIR }}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir diff --git a/autotest/config-ascend.yaml b/autotest/config-ascend.yaml deleted file mode 100644 index d30c880463..0000000000 --- a/autotest/config-ascend.yaml +++ /dev/null @@ -1,35 +0,0 @@ -model_path: /mnt/deeplink/group01/deeplink-test/weight -resource_path: /nvme/qa_test_models/resource -dst_path: /nvme/qa_test_models/autotest_model -log_path: /test/log -benchmark_path: /nvme/qa_test_models/benchmark-reports -dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json -env_tag: a100 - -tp_config: - Qwen2.5-32B-Instruct: 2 - -pytorch_chat_model: - - /Qwen3-0.6B - -pytorch_vl_model: - - /Qwen3-0.6B - -pytorch_base_model: - - /Qwen3-0.6B - -pytorch_quatization: - awq: - - meta-llama/Meta-Llama-3-8B-Instruct - w8a8: - - meta-llama/Meta-Llama-3-8B-Instruct - no_kvint4: - - /Qwen3-0.6B - no_kvint8: - - /Qwen3-0.6B - -longtext_model: - - /Qwen3-0.6B - -benchmark_model: - - /Qwen3-0.6B
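
For local reproduction, the pytest invocations that the removed daily_ete_test_ascend.yml workflow ran can still be issued by hand. The following is a minimal sketch, assuming the test_ascend marker, the --device pytest option and autotest/config-ascend.yaml are still present in the checkout (i.e. prior to PATCH 32/32); the report directory below is a placeholder, not the CI path.

    # placeholder for the CI report dir ($REPORT_DIR under /test/test-reports)
    export REPORT_DIR=/tmp/ascend-test-reports

    # single-card pipeline cases, 8 xdist workers, as in the workflow's pipeline step
    pytest autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py \
      -m 'gpu_num_1 and test_ascend' -n 8 --device ascend \
      --alluredir="$REPORT_DIR"

    # two-card pipeline cases, 4 xdist workers
    pytest autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py \
      -m 'gpu_num_2 and test_ascend' -n 4 --device ascend \
      --alluredir="$REPORT_DIR"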