
Commit 7c4e75b

add modelscope and lora testcase (#1506)

* update
* update
* update
* update
* update

1 parent ba0e6b3 · commit 7c4e75b

24 files changed: +714 −415 lines

.github/workflows/benchmark.yml

Lines changed: 36 additions & 27 deletions
@@ -32,12 +32,22 @@ on:
         required: true
         description: 'Dependency packages, you can also set a specific version'
         type: string
-        default: 'packaging transformers_stream_generator transformers datasets matplotlib'
+        default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'
       default_tp:
         required: true
         description: 'Default tp value'
         type: string
         default: '--tp 1'
+      log_level:
+        required: true
+        description: 'Default ERROR, can also set INFO'
+        type: string
+        default: 'ERROR'
+      kvint_quantization:
+        required: true
+        description: 'Default kvint4, kvint8'
+        type: string
+        default: "['kvint4','kvint8']"
       models:
         required: true
         description: 'Set models run benchmark'
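Note: the two inputs added above are consumed later in this file; `log_level` is passed straight to `lmdeploy serve api_server`, and `kvint_quantization` gates the kvint steps. A sketch of a manual dispatch payload, with illustrative values (not the defaults):

    # Illustrative workflow_dispatch values:
    log_level: INFO                    # default 'ERROR'; 'INFO' enables verbose server logs
    kvint_quantization: "['kvint4']"   # stringified list; the default "['kvint4','kvint8']" runs both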
@@ -52,6 +62,7 @@ env:
   DATASET_FILE: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
   TP_INFO: --tp 1
   LOOP_NUM: 3
+  TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas


 jobs:
@@ -93,15 +104,15 @@ jobs:
   generation_benchmark:
     needs: linux-build
     if: ${{github.event_name == 'schedule' || (!cancelled() && contains(fromJSON(github.event.inputs.benchmark_type), 'generation'))}}
-    runs-on: [self-hosted, linux-a100-2]
+    runs-on: [self-hosted, linux-a100]
     strategy:
       fail-fast: false
       matrix:
         model: ${{fromJSON(github.event.inputs.models)}}
     timeout-minutes: 120
     env:
       MODEL_PATH: /nvme/qa_test_models/${{matrix.model}}
-      CUDA_VISIBLE_DEVICES: 4,5
+      CUDA_VISIBLE_DEVICES: 6,7
     container:
       image: nvcr.io/nvidia/tritonserver:22.12-py3
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
@@ -119,7 +130,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
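The recurring `cp` fix here swaps `lmdeploy/*` for `lmdeploy/.`: the shell glob `*` skips dotfiles, while `dir/.` copies the directory's entire contents. A minimal sketch of the two behaviors:

    # Sketch: both commands copy the offline package into the workspace, but only
    # the second form brings along hidden entries such as .github/ or .gitignore.
    - name: Copy repository - offline
      if: ${{inputs.offline_mode}}
      run: |
        # cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .   # misses dotfiles
        cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .     # copies dotfiles too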
@@ -133,7 +144,7 @@ jobs:
         run: |
           python3 -m pip install ${{inputs.dependency_pkgs}}
           # manually install flash attn
-          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
           python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -216,7 +227,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
@@ -230,7 +241,7 @@ jobs:
         run: |
           python3 -m pip install ${{inputs.dependency_pkgs}}
           # manually install flash attn
-          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
           python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -266,7 +277,7 @@ jobs:
           done
         done
       - name: Run throughput benchmark - kvint4
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         env:
           result_dir: benchmark-throughput-turbomind-kvint4
         run: |
@@ -281,7 +292,7 @@ jobs:
           done
         done
       - name: Run throughput benchmark - kvint8
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         env:
           result_dir: benchmark-throughput-turbomind-kvint8
         run: |
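Both throughput steps are now additionally gated on the new `kvint_quantization` input. As a sketch of how the expression evaluates: `fromJSON()` parses the stringified list into an array and `contains()` tests membership, so the default selection runs both steps while `"['kvint4']"` skips the kvint8 one:

    # Hypothetical step name; the if-expression is the gating pattern used above.
    - name: kvint4 gate (illustrative)
      if: contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
      run: echo "kvint4 selected"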
@@ -357,7 +368,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
@@ -371,7 +382,7 @@ jobs:
         run: |
           python3 -m pip install ${{inputs.dependency_pkgs}}
           # manually install flash attn
-          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
           python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -394,7 +405,7 @@ jobs:
       - name: Start restful api turbomind
         if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level INFO > turbomind_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 180s
       - name: Run restful benchmark
@@ -414,17 +425,17 @@ jobs:
           done
       - name: Kill restful api turbomind
         continue-on-error: true
-        if: always()
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
         run: |
           kill -15 "$restful_pid"
       - name: Start restful api turbomind - kvint4
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level INFO > turbomind_kvint4_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level ${{inputs.log_level}} > turbomind_kvint4_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 180s
       - name: Run restful benchmark -kvint4
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         env:
           result_dir: benchmark-restful-turbomind-kvint4
         run: |
@@ -439,18 +450,17 @@ jobs:
           done
         done
       - name: Kill restful api turbomind - kvint4
-        continue-on-error: true
-        if: always()
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         run: |
           kill -15 "$restful_pid"
       - name: Start restful api turbomind - kvint8
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level INFO > turbomind_kvint8_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level ${{inputs.log_level}} > turbomind_kvint8_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 180s
       - name: Run restful benchmark -kvint8
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         env:
           result_dir: benchmark-restful-turbomind-kvint8
         run: |
@@ -465,14 +475,13 @@ jobs:
           done
         done
       - name: Kill restful api turbomind - kvint8
-        continue-on-error: true
-        if: always()
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         run: |
           kill -15 "$restful_pid"
       - name: Start restful api pytorch
         if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level INFO > pytorch_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level ${{inputs.log_level}} > pytorch_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 120s
       - name: Run restful benchmark - pytorch
@@ -491,7 +500,7 @@ jobs:
           done
         done
       - name: Kill restful api pytorch
-        if: always()
+        if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
         run: |
           kill -15 "$restful_pid"
       - name: Save reports
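Across this job, the kill steps replace `if: always()` with the same condition as their matching start step, so a kill no longer fires against an unset `$restful_pid` when the server was never launched. The pairing, sketched with trimmed arguments:

    # Start and kill share one condition; if the start is skipped, so is the kill.
    - name: Start restful api pytorch
      if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
      run: |
        lmdeploy serve api_server $MODEL_PATH --backend pytorch > pytorch_run.log 2>&1 &
        echo "restful_pid=$!" >> "$GITHUB_ENV"
    - name: Kill restful api pytorch
      if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
      run: kill -15 "$restful_pid"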
@@ -521,7 +530,7 @@ jobs:
       WORKDIR: /nvme/qa_test_models/triton_workspace
       OFFLINE_PKGS: /nvme/qa_test_models/offline_pkg
       MODEL_PATH: /nvme/qa_test_models/autotest_model/workspace_${{matrix.model}}
-      DEVICE: device=7
+      DEVICE: device=4
       GRPC_PORT: 33337
     strategy:
       fail-fast: false
@@ -537,7 +546,7 @@ jobs:
       - name: Set params
         if: (contains( matrix.model, 'internlm2-chat-20b'))
         run: |
-          echo 'DEVICE="device=6,7"' >> "$GITHUB_ENV"
+          echo 'DEVICE="device=4,5"' >> "$GITHUB_ENV"
       - name: Create test container
         run: |
           export date_today="$(date +'%H%M%S')"

.github/workflows/daily_ete_test.yml

Lines changed: 20 additions & 13 deletions
@@ -32,7 +32,7 @@ on:
         required: true
         description: 'Dependency packages, you can also set a specific version'
         type: string
-        default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm'
+        default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath'
       tools_regression:
         required: true
         description: 'Whether start a tool regression'
@@ -49,12 +49,13 @@ on:
       type: boolean
       default: true
   schedule:
-    - cron: '00 21 * * *'
+    - cron: '00 20 * * 1-5'

 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
   OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas


 jobs:
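The new schedule runs an hour earlier and only on weekdays; the cron fields are minute, hour, day-of-month, month, day-of-week, evaluated in UTC on GitHub:

    schedule:
      - cron: '00 20 * * 1-5'   # 20:00 UTC, Monday (1) through Friday (5)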
@@ -101,12 +102,17 @@ jobs:
     env:
       REPORT_DIR: /nvme/qa_test_models/test-reports
       PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
+      MODELSCOPE_CACHE: /root/modelscope_hub
+      MODELSCOPE_MODULES_CACHE: /root/modelscope_modules
     container:
       image: nvcr.io/nvidia/tritonserver:22.12-py3
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
         - /nvme/github-actions/packages:/root/packages
+        - /nvme/github-actions/modelscope_hub:/root/modelscope_hub
+        - /nvme/github-actions/modelscope_modules:/root/modelscope_modules
+        - /nvme/github-actions/resources/lora:/root/lora
         - /nvme/qa_test_models:/nvme/qa_test_models
         - /nvme/qa_test_models/lmdeploy/autotest:/local_case
         - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
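The two `MODELSCOPE_*` variables point the modelscope SDK at the freshly mounted volumes, so models fetched by the new ModelScope testcases land on host storage and persist across containers. A hedged sketch of a step exercising that cache (step name and model id are illustrative):

    # Sketch only: with MODELSCOPE_CACHE=/root/modelscope_hub, the download is
    # written through the mount to /nvme/github-actions/modelscope_hub on the host.
    - name: Warm ModelScope cache (illustrative)
      run: |
        python3 -c "from modelscope import snapshot_download; snapshot_download('qwen/Qwen1.5-7B-Chat')"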
@@ -119,7 +125,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
@@ -128,13 +134,13 @@ jobs:
       - name: Install pytorch
         run: |
           python3 -m pip cache dir
-          python3 -m pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
+          python3 -m pip install torch==2.2.1 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cu118
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai'}}
+          python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath'}}
           # manually install flash attn
-          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
-          python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
+          python3 -m pip install /root/packages/flash_attn-2.5.7+cu118torch2.2cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: |
@@ -151,6 +157,7 @@ jobs:
         run: |
           python3 -m pip list
           lmdeploy check_env
+          cp -r /root/lora .
           rm -rf allure-results
           # remove tmp log in testcase
           rm -rf /nvme/qa_test_models/autotest_model/log/*
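`cp -r /root/lora .` stages the mounted LoRA adapters inside the checkout so the new lora testcases can reach them by relative path. A hedged sketch of the kind of command such a testcase drives (adapter name and directory are hypothetical; `--adapters` takes `name=path` pairs on the pytorch backend):

    # Sketch: serve a base model with a LoRA adapter from the staged ./lora dir.
    - name: Serve with LoRA (illustrative)
      run: |
        lmdeploy serve api_server /nvme/qa_test_models/internlm2-chat-7b \
          --backend pytorch --adapters mylora=./lora/my-adapter > lora_run.log 2>&1 &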
@@ -233,7 +240,7 @@ jobs:
         continue-on-error: true
         if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'interface-pipeline'))
         run: |
-          pytest autotest/interface/pipeline -m 'not pr_test' -s -vv --alluredir=allure-results
+          pytest autotest/interface/pipeline -m 'not pr_test' --alluredir=allure-results
       - name: Test lmdeploy - local testcase
         if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.model), 'local_case')
         run: |
@@ -439,7 +446,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
@@ -448,13 +455,13 @@ jobs:
       - name: Install pytorch
         run: |
           python3 -m pip cache dir
-          python3 -m pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
+          python3 -m pip install torch==2.2.1 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cu118
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai'}}
+          python3 -m pip install ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath'}}
           # manually install flash attn
-          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
-          python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
+          python3 -m pip install /root/packages/flash_attn-2.5.7+cu118torch2.2cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: |

autotest/config.yaml

Lines changed: 6 additions & 0 deletions
@@ -14,6 +14,7 @@ tp_config:

 turbomind_chat_model:
   - meta-llama/Llama-2-7b-chat-hf
+  - meta-llama/Meta-Llama-3-8B-Instruct
   - internlm/internlm2-chat-1_8b
   - internlm/internlm-chat-7b
   - internlm/internlm-chat-20b
@@ -39,6 +40,7 @@ turbomind_chat_model:

 pytorch_chat_model:
   - meta-llama/Llama-2-7b-chat-hf
+  - meta-llama/Meta-Llama-3-8B-Instruct
   - internlm/internlm-chat-7b
   - internlm/internlm-chat-20b
   - internlm/internlm2-chat-7b
@@ -81,8 +83,10 @@ quatization_case_config:
     - internlm/internlm2-chat-20b
     - baichuan-inc/Baichuan2-7B-Chat
     - internlm/internlm2-20b
+    - Qwen/Qwen1.5-7B-Chat
   kvint:
     - meta-llama/Llama-2-7b-chat-hf
+    - meta-llama/Meta-Llama-3-8B-Instruct
    - internlm/internlm2-chat-1_8b
     - internlm/internlm-chat-7b
     - internlm/internlm-chat-20b
@@ -101,6 +105,8 @@ quatization_case_config:
     - codellama/CodeLlama-7b-Instruct-hf
   w8a8:
     - meta-llama/Llama-2-7b-chat-hf
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - internlm/internlm-chat-7b
     - internlm/internlm-chat-20b
     - internlm/internlm2-chat-20b
     - internlm/internlm2-chat-7b
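Taken together, these hunks register Meta-Llama-3-8B-Instruct with every suite it should run under. The pattern, condensed (key names as spelled in config.yaml, including `quatization_case_config`):

    turbomind_chat_model:
      - meta-llama/Meta-Llama-3-8B-Instruct
    pytorch_chat_model:
      - meta-llama/Meta-Llama-3-8B-Instruct
    quatization_case_config:    # key spelling follows the file
      kvint:
        - meta-llama/Meta-Llama-3-8B-Instruct
      w8a8:
        - meta-llama/Meta-Llama-3-8B-Instruct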

autotest/interface/pipeline/test_pipeline_turbomind_func.py

Lines changed: 2 additions & 3 deletions
@@ -192,7 +192,7 @@ def test_pipeline_stream_infer(self, config, model):
                 assert outputs.generate_token_len == i, str(outputs)
             else:
                 with assume:
-                    assert outputs.generate_token_len == i - 1, str(outputs)
+                    assert outputs.generate_token_len >= i - 1, str(outputs)
                 with assume:
                     assert outputs.input_token_len > 50, str(outputs)
                 with assume:
@@ -250,7 +250,7 @@ def test_pipeline_stream_infer2(self, config, model):
                 assert outputs.generate_token_len == i, str(outputs)
             else:
                 with assume:
-                    assert outputs.generate_token_len == i - 1, str(outputs)
+                    assert outputs.generate_token_len >= i - 1, str(outputs)
                 with assume:
                     assert outputs.input_token_len > 50, str(outputs)
                 with assume:
@@ -261,7 +261,6 @@ def test_pipeline_stream_infer2(self, config, model):
                 outputs_list.append(outputs)
                 continue

-        print(final_response)
         for output in outputs_list[0:-1]:
             with assume:
                 assert output.finish_reason is None, str(output)
