
Commit 6ad960b

Merge branch 'main' into batch_invariant_b200

2 parents df7d63a + 938a816

276 files changed, +10851 −3015 lines changed

.buildkite/performance-benchmarks/README.md

Lines changed: 2 additions & 1 deletion

```diff
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -34,6 +34,7 @@ Runtime environment variables:
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
 ### Latency test
 
```
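The note above pairs one set of test-definition files with each hardware family. As a rough sketch (the file-name pattern is taken from the README; the loop itself is only illustrative, not code from this commit), the per-architecture suffix resolves the config paths like so:

```bash
# Illustrative only: '' selects the default GPU files, '-cpu' the Xeon files,
# '-hpu' the Gaudi 3 files named in the note above.
arch_suffix='-hpu'
for kind in latency throughput serving; do
  echo "tests/${kind}-tests${arch_suffix}.json"
done
```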

.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
@@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
@@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
```
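The serving description states that, for finite QPS values, request arrival times follow a Poisson process with a fixed seed. A minimal sketch of what that schedule looks like (illustrative only, not code from this commit): inter-arrival gaps are drawn from an exponential distribution with rate equal to the target QPS.

```bash
# Sample 10 Poisson-process arrival times for qps=4 (seeded so the schedule is reproducible).
qps=4
awk -v qps="$qps" -v n=10 'BEGIN {
  srand(0); t = 0
  for (i = 0; i < n; i++) {
    t += -log(1 - rand()) / qps   # exponential inter-arrival gap, rate = qps
    printf "request %d arrives at t=%.3fs\n", i, t
  }
}'
```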

.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 13 additions & 0 deletions

```diff
@@ -15,6 +15,8 @@ check_gpus() {
     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   elif command -v amd-smi; then
     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
   fi
 
   if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
+
+  declare -g arch_suffix=''
+
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
   fi
   echo "GPU type is $gpu_type"
 }
@@ -138,6 +146,10 @@ kill_gpu_processes() {
     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
       sleep 1
     done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
   fi
 
   # remove vllm config file
@@ -451,6 +463,7 @@ main() {
     ARCH='-cpu'
   else
     check_gpus
+    ARCH="$arch_suffix"
   fi
   check_hf_token
 
```
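The hl-smi probes added above can be sanity-checked by hand on a Gaudi host (assuming `hl-smi` is on PATH); each pipeline below mirrors one of the added lines:

```bash
hl-smi --list | grep -i "Module ID" | wc -l               # accelerator count (gpu_count)
hl-smi -q | grep "Product Name" | head -n 1 \
  | awk -F ':' '{print $2}' | sed 's/^ *//'               # device name (gpu_type)
hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}'    # memory in use on the first device
```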

.buildkite/performance-benchmarks/tests/latency-tests-hpu.json

Lines changed: 55 additions & 0 deletions

```json
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  }
]
```
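Each entry above corresponds to one offline latency run. As a sketch, assuming the harness exports the `environment_variables` block and maps each `parameters` key to a `vllm bench latency` flag (as it does for the existing GPU/CPU test files), the first entry is roughly:

```bash
# Approximate command for "latency_llama8B_tp1" (illustrative mapping only).
PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=1 VLLM_DEFRAG=1 \
vllm bench latency \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 \
  --load-format dummy \
  --num-iters-warmup 5 \
  --num-iters 15 \
  --max-model-len 256 \
  --async-scheduling
```
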
.buildkite/performance-benchmarks/tests/serving-tests-hpu.json

Lines changed: 82 additions & 0 deletions

```json
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
```
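Each serving entry splits into a server launch and a client sweep over `qps_list`. A sketch of the first entry at QPS 4, assuming `server_parameters` feed `vllm serve` and `client_parameters` feed `vllm bench serve`, with `--request-rate` carrying the QPS value:

```bash
# Server side: HPU environment variables exported, then vllm serve in the background.
PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=1 VLLM_DEFRAG=1 \
vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 --swap-space 16 --disable-log-stats \
  --load-format dummy --max-model-len 2048 --max-num-seqs 256 \
  --async-scheduling &

# Client side: 200 ShareGPT prompts at 4 requests/second ("inf" would send them all at once).
vllm bench serve \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --backend vllm \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --request-rate 4
```
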
.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json

Lines changed: 61 additions & 0 deletions

```json
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "throughput_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "throughput_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  }
]
```
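As with the latency file, one throughput entry maps to a single offline run; a sketch of the first entry under the same assumed key-to-flag translation onto `vllm bench throughput`:

```bash
# Approximate command for "throughput_llama8B_tp1" (illustrative mapping only).
PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=1 VLLM_DEFRAG=1 \
vllm bench throughput \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 \
  --load-format dummy \
  --backend vllm \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 1000 \
  --max-model-len 2048 \
  --max-num-seqs 512 \
  --async-scheduling
```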

.buildkite/release-pipeline.yaml

Lines changed: 0 additions & 18 deletions

```diff
@@ -116,24 +116,6 @@ steps:
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"
 
-  - label: "Build and publish TPU release image"
-    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: tpu_queue_postmerge
-    commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
-      - "docker push vllm/vllm-tpu:nightly"
-      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-
   - input: "Provide Release version here"
     id: input-release-version
     fields:
```
Lines changed: 62 additions & 0 deletions

```bash
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
```
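A possible invocation of the script above (the script name below is a placeholder, since the file path is not shown in this view; the three positional arguments follow the `# args:` comment at the top of the script):

```bash
# Accuracy threshold 0.25, all 1319 GSM8K questions, first server on port 8010.
OUT_DIR=/tmp/vllm-scheduled bash dp_ep_gsm8k_accuracy_check.sh 0.25 1319 8010
```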
