Commit 85f4150

[ci] add h800 function test workflow (#3985)
* add h800
* update
* update
* update
* update mark
* update test image for cu12.4
* update
* update
* update
* update
* update
* remove communicator native when tp=1
* fix lint
1 parent b5372e0 commit 85f4150

21 files changed: +619, -168 lines changed

.github/workflows/benchmark.yml

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@ on:
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
-  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
   REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
@@ -42,7 +42,7 @@ jobs:
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
       PLAT_NAME: manylinux2014_x86_64
-      DOCKER_TAG: cuda11.8
+      DOCKER_TAG: cuda12.4
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -108,7 +108,7 @@ jobs:
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Install lmdeploy - offline
         if: ${{inputs.offline_mode}}
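
The same install step is updated in the other workflows below: forcing an uninstall before installing the freshly built wheel guards against a copy of lmdeploy already baked into the cu12.4 image shadowing the artifact under test. A minimal sketch of the step after this change (step name and wheel glob are taken from the diff; the surrounding job is omitted):

      - name: Install lmdeploy
        run: |
          # drop any preinstalled copy, then install the wheel built by this run without pulling deps
          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
          python3 -m pip install -r requirements/test.txt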

.github/workflows/daily_ete_test.yml

Lines changed: 19 additions & 20 deletions
@@ -44,7 +44,7 @@ on:
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
-  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
   COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
@@ -64,7 +64,7 @@ jobs:
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
       PLAT_NAME: manylinux2014_x86_64
-      DOCKER_TAG: cuda11.8
+      DOCKER_TAG: cuda12.4
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -96,7 +96,7 @@ jobs:
     runs-on: [self-hosted, linux-a100]
     timeout-minutes: 50
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -136,7 +136,7 @@ jobs:
       MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -168,7 +168,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - quantization w4a16
         continue-on-error: true
@@ -219,7 +219,7 @@ jobs:
       MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -251,7 +251,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - chat
         continue-on-error: true
@@ -324,7 +324,7 @@ jobs:
           model: Intern-S1
     timeout-minutes: 60
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -352,7 +352,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Start restful api
         if: matrix.model != 'internlm2_5-20b'
@@ -408,7 +408,7 @@ jobs:
     needs: test_quantization
     timeout-minutes: 120
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
      volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -436,7 +436,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - interface pipeline case
         run: |
@@ -465,7 +465,7 @@ jobs:
     needs: test_quantization
     timeout-minutes: 120
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -493,7 +493,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test benchmark script
         run: |
@@ -520,7 +520,7 @@ jobs:
       matrix:
         evaluate_type: ['chat', 'base']
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -550,8 +550,7 @@ jobs:
         run: |
           git clone --depth=1 https://github.com/open-compass/opencompass.git
           cd opencompass
-          cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt
-          python3 -m pip install -e .
+          python3 -m pip install .
           echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV
       - name: Check env
         run: |
@@ -560,7 +559,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Setup paths for evaluation
         run: |
@@ -571,7 +570,7 @@ jobs:
         run: |
           export LMDEPLOY_DIR=$(pwd)
 
-          python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
+          python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
       - name: Evaluate base models
         if: matrix.evaluate_type == 'base'
         run: |
@@ -594,7 +593,7 @@ jobs:
     timeout-minutes: 5
     runs-on: [self-hosted, linux-a100]
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -619,7 +618,7 @@ jobs:
     needs: [test_tools, test_restful, test_pipeline, test_benchmark]
     timeout-minutes: 5
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:latest-cu12
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
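
A recurring change in this file is the `.pytest_cache` handling inside the "Check env" step: the cache lives under ${{env.REPORT_DIR}} and is symlinked into autotest/, presumably so that reruns driven by a --lf (last-failed) pytest option (as in benchmark.yml's FAIL_CONFIG) can see the previous run's results, and any stale autotest/.pytest_cache left behind is now force-removed before the symlink is recreated. A minimal sketch of that sequence, with paths as in the diff and comments added here:

      - name: Check env
        run: |
          rm -rf allure-results
          # remove tmp log in testcase
          rm -rf /nvme/qa_test_models/autotest_model/log/*
          # keep the pytest cache in the per-run report dir; drop any leftover link before re-linking
          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
          ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest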

.github/workflows/daily_ete_test_3090.yml

Lines changed: 7 additions & 7 deletions
@@ -153,7 +153,7 @@ jobs:
           python3 -m pip install -r requirements/lite.txt
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm
           python3 -m pip install -r requirements/test.txt
       - name: Check env
@@ -163,7 +163,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - quantization w4a16
         continue-on-error: true
@@ -226,7 +226,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -235,7 +235,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - chat
         continue-on-error: true
@@ -290,7 +290,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -299,7 +299,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Start restful api turbomind
         if: matrix.backend == 'turbomind'
@@ -370,7 +370,7 @@ jobs:
         run: cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Get coverage report
         run: |
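
If one wanted to confirm on the runner that the uninstall/reinstall actually swapped in the freshly built wheel, a quick check step could follow the install. This step is not part of the diff; it is a hypothetical sketch, and the __version__ attribute is assumed to be exposed by the package:

      - name: Verify lmdeploy install   # hypothetical step, not in this commit
        run: |
          # show which lmdeploy distribution is active (version and install location)
          python3 -m pip show lmdeploy
          # assumes lmdeploy exposes __version__ at the package top level
          python3 -c "import lmdeploy; print(lmdeploy.__version__)"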

.github/workflows/daily_ete_test_5080.yml

Lines changed: 12 additions & 12 deletions
@@ -92,7 +92,7 @@ jobs:
   download_pkgs:
     needs: linux-build
     if: ${{!cancelled()}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     timeout-minutes: 50
     container:
       image: openmmlab/lmdeploy:latest-cu12.8
@@ -129,7 +129,7 @@ jobs:
   test_quantization:
     needs: download_pkgs
     if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     timeout-minutes: 150
     env:
       PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
@@ -153,7 +153,7 @@ jobs:
           python3 -m pip install -r requirements/lite.txt
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install transformers==4.53.1 datasets==3.6.0 timm
           python3 -m pip install -r requirements/test.txt
       - name: Check env
@@ -163,7 +163,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - quantization w4a16
         continue-on-error: true
@@ -188,7 +188,7 @@ jobs:
           chmod -R 777 $workdir
   test_tools:
     if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     needs: test_quantization
     timeout-minutes: 300
     strategy:
@@ -225,7 +225,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -234,7 +234,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Test lmdeploy - chat
         continue-on-error: true
@@ -265,7 +265,7 @@ jobs:
           chmod -R 777 $workdir
   test_restful:
     if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     needs: test_quantization
     strategy:
       fail-fast: false
@@ -289,7 +289,7 @@ jobs:
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
@@ -298,7 +298,7 @@ jobs:
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
-          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p
+          mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f
           ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest
       - name: Start restful api turbomind
         if: matrix.backend == 'turbomind'
@@ -353,7 +353,7 @@ jobs:
           chmod -R 777 $workdir
   get_coverage_report:
     if: ${{!cancelled() && success()}}
-    runs-on: [self-hosted, 5090-r1]
+    runs-on: [self-hosted, 5080-r1]
     needs: [test_tools, test_restful]
     timeout-minutes: 5
     container:
@@ -368,7 +368,7 @@ jobs:
         run: cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy
         run: |
-          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Get coverage report
         run: |
