Skip to content

Commit 3884c6f

Browse files
authored
Merge branch 'main' into multi_image_enbeddings
2 parents 0306c96 + 28097d5 commit 3884c6f

File tree

119 files changed

+4683
-1318
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

119 files changed

+4683
-1318
lines changed

.buildkite/generate_index.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

.buildkite/scripts/generate-nightly-index.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77

88
import argparse
99
import json
10-
import re
1110
import sys
1211
from dataclasses import asdict, dataclass
1312
from pathlib import Path
1413
from typing import Any
1514
from urllib.parse import quote
1615

16+
import regex as re
17+
1718
if not sys.version_info >= (3, 12):
1819
raise RuntimeError("This script requires Python 3.12 or higher.")
1920

.buildkite/scripts/hardware_ci/run-npu-test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
7474
7575
# Define environments
7676
ENV DEBIAN_FRONTEND=noninteractive
77+
ENV SOC_VERSION="ascend910b1"
7778
7879
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
7980
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/usr/bin/env bash
# Scheduled integration test: DeepSeek-V2-Lite with expert parallelism and
# async EPLB, evaluated on GSM8K against an accuracy threshold per backend.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Block until the vLLM server on the given port reports healthy (10 min cap).
wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

# Gracefully stop the background server; escalate to SIGKILL after ~10s.
cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  # Sanitize the model name into a filesystem-safe tag.
  # FIX: the previous tr '/: \\n' '_____' over-escaped the backslash, so tr's
  # SET1 contained literal '\' and 'n' and mangled model names containing 'n'.
  # Translate only '/', ':', space, and newline.
  TAG=$(echo "$MODEL" | tr '/: \n' '____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done

.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
5050
--data-parallel-size 2 \
5151
--enable-expert-parallel \
5252
--enable-eplb \
53+
--eplb-config '{"window_size":200,"step_interval":600}' \
5354
--trust-remote-code \
5455
--max-model-len 2048 \
5556
--port $PORT &
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/bin/env bash
# Scheduled integration test: Qwen3-Next-80B-A3B-Instruct with MTP speculative
# decoding and async EPLB, evaluated on GSM8K against an accuracy threshold.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Block until the vLLM server on the given port reports healthy (10 min cap).
wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

# Gracefully stop the background server; escalate to SIGKILL after ~10s.
cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  # Sanitize the model name into a filesystem-safe tag.
  # FIX: the previous tr '/: \\n' '_____' over-escaped the backslash, so tr's
  # SET1 contained literal '\' and 'n' and turned this model's tag into
  # "Qwe__Qwe_3-Next-80B-A3B-I_struct". Translate only '/', ':', space,
  # and newline.
  TAG=$(echo "$MODEL" | tr '/: \n' '____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done

.buildkite/test-pipeline.yaml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@ steps:
387387
working_dir: "/vllm-workspace/examples"
388388
source_file_dependencies:
389389
- vllm/entrypoints
390+
- vllm/multimodal
390391
- examples/
391392
commands:
392393
- pip install tensorizer # for tensorizer test
@@ -1373,4 +1374,22 @@ steps:
13731374
num_gpus: 2
13741375
working_dir: "/vllm-workspace"
13751376
commands:
1376-
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
1377+
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
1378+
1379+
- label: DeepSeek V2-Lite Async EPLB Accuracy
1380+
timeout_in_minutes: 60
1381+
gpu: h100
1382+
optional: true
1383+
num_gpus: 4
1384+
working_dir: "/vllm-workspace"
1385+
commands:
1386+
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
1387+
1388+
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
1389+
timeout_in_minutes: 60
1390+
gpu: h100
1391+
optional: true
1392+
num_gpus: 4
1393+
working_dir: "/vllm-workspace"
1394+
commands:
1395+
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ Compute Resources:
137137
- Alibaba Cloud
138138
- AMD
139139
- Anyscale
140+
- Arm
140141
- AWS
141142
- Crusoe Cloud
142143
- Databricks

benchmarks/benchmark_hash.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
"""
4+
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
5+
6+
This focuses on a single test payload shaped like the prefix-cache hash input:
7+
(32-byte bytes object, 32-int tuple)
8+
9+
Usage:
10+
    python benchmarks/benchmark_hash.py --iterations 20000
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import argparse
16+
import random
17+
import statistics
18+
import time
19+
from collections.abc import Callable, Iterable
20+
21+
from vllm.utils.hashing import sha256, xxhash
22+
23+
24+
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
25+
"""Generate a deterministic test payload."""
26+
random.seed(seed)
27+
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
28+
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
29+
return (bytes_data, int_tuple)
30+
31+
32+
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
33+
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
34+
times: list[float] = []
35+
36+
# Warm-up to avoid first-run noise.
37+
for _ in range(200):
38+
func(data)
39+
40+
for _ in range(iterations):
41+
start = time.perf_counter()
42+
func(data)
43+
end = time.perf_counter()
44+
times.append(end - start)
45+
46+
avg = statistics.mean(times)
47+
std = statistics.stdev(times) if len(times) > 1 else 0.0
48+
return avg, std
49+
50+
51+
def _run_benchmarks(
    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
    data: tuple,
    iterations: int,
):
    """Yield ``(name, avg, std)`` for each benchmark.

    A benchmark whose hash backend is not installed (raising
    ``ModuleNotFoundError``) is reported and skipped instead of aborting
    the whole run.
    """
    for label, fn in benchmarks:
        try:
            stats = _benchmark_func(fn, data, iterations)
        except ModuleNotFoundError as exc:
            # e.g. optional xxhash backend missing — report and move on.
            print(f"Skipping {label}: {exc}")
        else:
            yield (label, *stats)
64+
65+
66+
def builtin_hash(data: tuple) -> int:
    """Baseline: hash ``data`` with Python's built-in hash()."""
    return hash(data)
69+
70+
71+
def main() -> None:
    """Parse CLI args, time each hash function, and print a comparison.

    Reports per-function mean/stdev timings and, when the built-in hash()
    baseline is available, each alternative's slowdown relative to it.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()

    data = _generate_test_data(args.seed)
    benchmarks = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        ("built-in hash()", builtin_hash),
    )

    banner = "=" * 60
    print(banner)
    print("HASH FUNCTION MICRO BENCHMARK")
    print(banner)
    print("Test data: (32-byte bytes object, 32-int tuple)")
    print(f"Iterations: {args.iterations:,}")
    print(banner)

    results = list(_run_benchmarks(benchmarks, data, args.iterations))

    print("\nResults:")
    for name, avg, std in results:
        print(f"  {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")

    # Use the built-in hash() timing as the baseline for relative numbers.
    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
    if builtin_entry:
        _, builtin_avg, _ = builtin_entry
        print("\n" + banner)
        print("SUMMARY (relative to built-in hash())")
        print(banner)
        for name, avg, _ in results:
            if name == "built-in hash()":
                continue
            speed_ratio = avg / builtin_avg
            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
    else:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)