         required: true
         description: 'Dependency packages, you can also set a specific version'
         type: string
-        default: 'packaging transformers_stream_generator transformers datasets matplotlib'
+        default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'
       default_tp:
         required: true
         description: 'Default tp value'
         type: string
         default: '--tp 1'
+      log_level:
+        required: true
+        description: 'Default ERROR, can also set INFO'
+        type: string
+        default: 'ERROR'
+      kvint_quantization:
+        required: true
+        description: 'Default kvint4, kvint8'
+        type: string
+        default: "['kvint4','kvint8']"
       models:
         required: true
         description: 'Set models run benchmark'
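Note on the inputs above: the kvint_quantization default is a JSON-style list stored as a string, which is why the updated step conditions later in this diff read it with fromJSON() and test membership with contains(). A minimal sketch of that pattern, assuming a hypothetical demo job (the step name and command are illustrative):

on:
  workflow_dispatch:
    inputs:
      kvint_quantization:
        type: string
        default: "['kvint4','kvint8']"
jobs:
  demo:
    runs-on: [self-hosted, linux-a100]
    steps:
      # runs only when the kvint_quantization input includes 'kvint4'
      - name: kvint4 only
        if: contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
        run: echo "kvint4 selected"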
   DATASET_FILE: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
   TP_INFO: --tp 1
   LOOP_NUM: 3
+  TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas


 jobs:
@@ -93,15 +104,15 @@ jobs:
   generation_benchmark:
     needs: linux-build
     if: ${{github.event_name == 'schedule' || (!cancelled() && contains(fromJSON(github.event.inputs.benchmark_type), 'generation'))}}
-    runs-on: [self-hosted, linux-a100-2]
+    runs-on: [self-hosted, linux-a100]
     strategy:
       fail-fast: false
       matrix:
         model: ${{fromJSON(github.event.inputs.models)}}
     timeout-minutes: 120
     env:
       MODEL_PATH: /nvme/qa_test_models/${{matrix.model}}
-      CUDA_VISIBLE_DEVICES: 4,5
+      CUDA_VISIBLE_DEVICES: 6,7
     container:
       image: nvcr.io/nvidia/tritonserver:22.12-py3
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
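For reference, the strategy/env block above follows the usual matrix pattern: the models input is expanded into one job per model, MODEL_PATH is derived from the matrix value, and CUDA_VISIBLE_DEVICES pins the job to specific GPUs on the self-hosted runner. A condensed sketch of just that mechanism (job name and step are illustrative):

jobs:
  generation_benchmark_sketch:
    runs-on: [self-hosted, linux-a100]
    strategy:
      fail-fast: false
      matrix:
        # one job instance per entry in the 'models' workflow input
        model: ${{fromJSON(github.event.inputs.models)}}
    env:
      # per-model path derived from the matrix value
      MODEL_PATH: /nvme/qa_test_models/${{matrix.model}}
      # expose only GPUs 6 and 7 to this job
      CUDA_VISIBLE_DEVICES: 6,7
    steps:
      - run: echo "benchmarking ${{matrix.model}} from $MODEL_PATH"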
@@ -119,7 +130,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
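A note on the copy command above: with default shell globbing, lmdeploy/* does not match hidden files, while lmdeploy/. copies the directory's entire contents, dotfiles included. A small steps-level sketch under that assumption (the path is a stand-in, not the real offline package location):

      - name: Copy repository - offline (sketch)
        run: |
          # /tmp/offline_pkg stands in for the offline package directory
          # 'cp -r /tmp/offline_pkg/* .' would skip dotfiles such as .gitignore
          # 'cp -r /tmp/offline_pkg/. .' copies everything, hidden files included
          cp -r /tmp/offline_pkg/. .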
@@ -133,7 +144,7 @@ jobs:
         run: |
           python3 -m pip install ${{inputs.dependency_pkgs}}
           # manually install flash attn
-          # the install package from https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install package from https://github.com/Dao-AILab/flash-attention/releases
           python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -216,7 +227,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
@@ -230,7 +241,7 @@ jobs:
         run: |
           python3 -m pip install ${{inputs.dependency_pkgs}}
           # manually install flash attn
-          # the install package from https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install package from https://github.com/Dao-AILab/flash-attention/releases
           python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -266,7 +277,7 @@ jobs:
             done
           done
       - name: Run throughput benchmark - kvint4
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         env:
           result_dir: benchmark-throughput-turbomind-kvint4
         run: |
@@ -281,7 +292,7 @@ jobs:
             done
           done
       - name: Run throughput benchmark - kvint8
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         env:
           result_dir: benchmark-throughput-turbomind-kvint8
         run: |
@@ -357,7 +368,7 @@ jobs:
           ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: Copy repository - offline
         if: ${{inputs.offline_mode}}
-        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
+        run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
       - name: Download Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
@@ -371,7 +382,7 @@ jobs:
         run: |
           python3 -m pip install ${{inputs.dependency_pkgs}}
           # manually install flash attn
-          # the install package from https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
+          # the install package from https://github.com/Dao-AILab/flash-attention/releases
           python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -394,7 +405,7 @@ jobs:
       - name: Start restful api turbomind
         if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level INFO > turbomind_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 180s
       - name: Run restful benchmark
@@ -414,17 +425,17 @@ jobs:
           done
       - name: Kill restful api turbomind
         continue-on-error: true
-        if: always()
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
         run: |
           kill -15 "$restful_pid"
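The start/kill steps in this job rely on a common GitHub Actions pattern: the server is backgrounded, its PID ($!) is appended to $GITHUB_ENV, and later steps read it back as the restful_pid variable, as the kill step above does. A minimal steps-level sketch with a placeholder command in place of lmdeploy serve api_server:

      - name: Start a background server (sketch)
        run: |
          # placeholder command; the real workflow launches lmdeploy serve api_server
          sleep 600 > server.log 2>&1 &
          # $! is the PID of the last backgrounded process; writing it to
          # $GITHUB_ENV makes it available to later steps as restful_pid
          echo "restful_pid=$!" >> "$GITHUB_ENV"
      - name: Stop the background server (sketch)
        if: always()
        run: kill -15 "$restful_pid"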
       - name: Start restful api turbomind - kvint4
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level INFO > turbomind_kvint4_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level ${{inputs.log_level}} > turbomind_kvint4_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 180s
       - name: Run restful benchmark -kvint4
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         env:
           result_dir: benchmark-restful-turbomind-kvint4
         run: |
@@ -439,18 +450,17 @@ jobs:
             done
           done
       - name: Kill restful api turbomind - kvint4
-        continue-on-error: true
-        if: always()
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
         run: |
           kill -15 "$restful_pid"
       - name: Start restful api turbomind - kvint8
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level INFO > turbomind_kvint8_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level ${{inputs.log_level}} > turbomind_kvint8_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 180s
       - name: Run restful benchmark -kvint8
-        if: contains(fromJSON(github.event.inputs.backend), 'turbomind')
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         env:
           result_dir: benchmark-restful-turbomind-kvint8
         run: |
@@ -465,14 +475,13 @@ jobs:
             done
           done
       - name: Kill restful api turbomind - kvint8
-        continue-on-error: true
-        if: always()
+        if: contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
         run: |
           kill -15 "$restful_pid"
       - name: Start restful api pytorch
         if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
         run: |
-          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level INFO > pytorch_run.log 2>&1 &
+          lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level ${{inputs.log_level}} > pytorch_run.log 2>&1 &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 120s
       - name: Run restful benchmark - pytorch
@@ -491,7 +500,7 @@ jobs:
             done
           done
       - name: Kill restful api pytorch
-        if: always()
+        if: (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
         run: |
           kill -15 "$restful_pid"
       - name: Save reports
@@ -521,7 +530,7 @@ jobs:
       WORKDIR: /nvme/qa_test_models/triton_workspace
       OFFLINE_PKGS: /nvme/qa_test_models/offline_pkg
       MODEL_PATH: /nvme/qa_test_models/autotest_model/workspace_${{matrix.model}}
-      DEVICE: device=7
+      DEVICE: device=4
       GRPC_PORT: 33337
     strategy:
       fail-fast: false
@@ -537,7 +546,7 @@ jobs:
       - name: Set params
         if: (contains(matrix.model, 'internlm2-chat-20b'))
         run: |
-          echo 'DEVICE="device=6,7"' >> "$GITHUB_ENV"
+          echo 'DEVICE="device=4,5"' >> "$GITHUB_ENV"
       - name: Create test container
         run: |
           export date_today="$(date +'%H%M%S')"