
Commit abdd895

Merge branch 'main' into vadim/adj-kv-block-sizes

Signed-off-by: Vadim Gimpelson <[email protected]>

2 parents 06c5eac + faedbb4

195 files changed: +7270 −2206 lines changed


.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 10 additions & 1 deletion

```diff
@@ -173,6 +173,14 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
   # assign job count as the number of shards used
@@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
     --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
     --network=host \
     --shm-size=16gb \
+    --group-add "$render_gid" \
     --rm \
     -e HIP_VISIBLE_DEVICES="${GPU}" \
     -e HF_TOKEN \
@@ -217,8 +226,8 @@ else
     --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
     --network=host \
     --shm-size=16gb \
+    --group-add "$render_gid" \
     --rm \
-    -e HIP_VISIBLE_DEVICES=0 \
     -e HF_TOKEN \
     -e AWS_ACCESS_KEY_ID \
     -e AWS_SECRET_ACCESS_KEY \
```
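The new check resolves the numeric GID of the host's `render` group and forwards it to `docker run`, so the unprivileged container user can open the GPU device nodes. A minimal standalone sketch of the same pattern — the image name and final command are placeholders, not taken from this commit:

```bash
#!/usr/bin/env bash
set -euo pipefail

# getent prints "render:x:<gid>:<members>"; field 3 is the numeric GID.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# Pass the ROCm device nodes into the container and add the render group
# as a supplementary group, so the container user may open /dev/kfd and
# /dev/dri/renderD*. Image and command here are placeholders.
docker run --rm \
  --device /dev/kfd \
  --device /dev/dri \
  --group-add "$render_gid" \
  rocm/dev-ubuntu-22.04 rocminfo
```

Resolving the GID on the host matters because `--group-add render` by name would be looked up in the container's own /etc/group, which may map `render` to a different number than the host does.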

.buildkite/test-amd.yaml

Lines changed: 15 additions & 14 deletions

```diff
@@ -48,8 +48,8 @@ steps:
   commands:
   - bash standalone_tests/pytorch_nightly_dependency.sh
 
-- label: Async Engine, Inputs, Utils, Worker Test # 36min
-  timeout_in_minutes: 50
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -344,7 +344,7 @@ steps:
   - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
   - pytest -v -s v1/spec_decode
-  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
@@ -616,9 +616,9 @@ steps:
   - uv pip install --system torchao==0.13.0
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
-- label: LM Eval Small Models # 53min
-  timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+- label: LM Eval Small Models # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -627,17 +627,18 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
-- label: OpenAI API correctness # 22min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+- label: OpenAI API correctness # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
   - csrc/
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
-  commands: # LMEval+Transcription WER check
-  - pytest -s entrypoints/openai/correctness/
+  commands: # LMEval
+  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
+  - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
 
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
@@ -858,10 +859,10 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  mirror_hardwares: [amdexperimental]
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  timeout_in_minutes: 70
+  timeout_in_minutes: 15
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - vllm/multimodal/
```
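Several of the commands above combine pytest marker filtering with path exclusion. For reference, `-m` selects tests by marker expression while `--ignore` removes a path from collection entirely; a generic sketch of the combination (paths illustrative):

```bash
# Collect tests under v1/kv_connector/unit, drop anything marked cpu_test,
# and skip one file outright (excluded before collection even starts).
pytest -v -s -m 'not cpu_test' v1/kv_connector/unit \
  --ignore=v1/kv_connector/unit/test_lmcache_integration.py
```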

.buildkite/test-pipeline.yaml

Lines changed: 4 additions & 2 deletions

```diff
@@ -232,15 +232,16 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py
 
-- label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Execution Test # 10min
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
   - vllm/distributed/eplb
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
 
 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -315,6 +316,7 @@ steps:
   - vllm/
   - tests/v1
   commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   # split the test to avoid interference
   - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
```

.gitignore

Lines changed: 3 additions & 0 deletions

```diff
@@ -221,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
```
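A `!` pattern re-includes paths that an earlier rule would otherwise ignore. One way to confirm which rule wins, using a hypothetical file path:

```bash
# git check-ignore -v names the .gitignore rule that decides a path;
# it exits non-zero when the path is not ignored.
git check-ignore -v vllm/benchmarks/lib/__init__.py \
  || echo "not ignored: vllm/benchmarks/lib/__init__.py"
```

Note that Git cannot re-include a file if one of its parent directories is itself excluded, so negations like this must pair with directory-level patterns accordingly.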

README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
```
