
Commit 847c6f5

Merge remote-tracking branch 'upstream/main' into dev

Signed-off-by: Yi Pan <[email protected]>

2 parents: 21ec47b + a0e0efd

491 files changed: +17,285 / -6,911 lines


.buildkite/release-pipeline.yaml

Lines changed: 31 additions & 9 deletions
@@ -7,7 +7,7 @@ steps:
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
@@ -62,23 +62,45 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build release image"
+  - label: "Build release image (x86)"
     depends_on: ~
-    key: block-release-image-build
-
-  - label: "Build release image"
-    depends_on: block-release-image-build
-    id: build-release-image
+    id: build-release-image-x86
     agents:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
+  - label: "Build release image (arm64)"
+    depends_on: ~
+    id: build-release-image-arm64
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+  # Add job to create multi-arch manifest
+  - label: "Create multi-arch manifest"
+    depends_on:
+      - build-release-image-x86
+      - build-release-image-arm64
+    id: create-multi-arch-manifest
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
   - label: "Annotate release workflow"
     depends_on:
-      - build-release-image
+      - create-multi-arch-manifest
       - build-wheel-cuda-12-8
      - build-wheel-cuda-12-6
      - build-wheel-cuda-11-8
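
The release flow above builds one image per architecture, tags each with the machine's `uname -m` suffix, and then stitches the per-arch tags into a single multi-arch tag. A condensed sketch of that pattern, using a hypothetical REPO/TAG pair in place of the real ECR repository and $BUILDKITE_COMMIT:

    # Hypothetical names for illustration; the pipeline uses
    # public.ecr.aws/q9t5s3a7/vllm-release-repo and $BUILDKITE_COMMIT
    REPO=public.ecr.aws/example/my-image
    TAG=some-commit-sha

    # Each per-arch job builds and pushes an arch-suffixed tag (x86_64 or aarch64)
    docker build --tag "$REPO:$TAG-$(uname -m)" -f docker/Dockerfile .
    docker push "$REPO:$TAG-$(uname -m)"

    # A follow-up job combines the arch-specific images into one manifest list under the plain tag
    docker manifest create "$REPO:$TAG" "$REPO:$TAG-x86_64" "$REPO:$TAG-aarch64" --amend
    docker manifest push "$REPO:$TAG"

Re-tagging the x86 image to the plain tag before the manifest job runs keeps a usable default tag even if the arm64 build fails.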

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 0 additions & 1 deletion
@@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
        --ignore=entrypoints/llm/test_chat.py \
        --ignore=entrypoints/llm/test_accuracy.py \
        --ignore=entrypoints/llm/test_init.py \
-       --ignore=entrypoints/llm/test_generate_multiple_loras.py \
        --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 30 additions & 14 deletions
@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -49,57 +49,73 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_onednn.py"
+    pytest -x -v -s tests/kernels/test_onednn.py"
 
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
 
     # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
 
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
       --ignore=tests/models/multimodal/generation/test_pixtral.py \
       -m cpu_model"
 
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
   # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
+  #   VLLM_USE_V1=0 pytest -x -s -v \
   #   tests/quantization/test_ipex_quant.py"
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
     tests/lora/test_qwen2vl.py"
 
-  # online serving
+  # online serving: tp+pp
   docker exec cpu-test-"$NUMA_NODE" bash -c '
     set -e
     VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    server_pid=$!
     timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
     vllm bench serve \
       --backend vllm \
       --dataset-name random \
       --model meta-llama/Llama-3.2-3B-Instruct \
       --num-prompts 20 \
-      --endpoint /v1/completions'
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+
+  # online serving: tp+dp
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
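
Both serving blocks above follow the same start/poll/benchmark/stop lifecycle. A minimal standalone sketch of that lifecycle, assuming vllm is installed and the model is already cached locally:

    # Start the server in the background and remember its PID
    vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
    server_pid=$!

    # Wait (up to 10 minutes) for the OpenAI-compatible endpoint to come up
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1

    # Run the benchmark, then stop the server
    vllm bench serve --backend vllm --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct --num-prompts 20 --endpoint /v1/completions
    kill -s SIGTERM "$server_pid"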

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ docker run \
   set -e
   echo $ZE_AFFINITY_MASK
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
   cd tests

.buildkite/test-pipeline.yaml

Lines changed: 35 additions & 15 deletions
@@ -109,10 +109,9 @@ steps:
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Entrypoints Test (API Server) # 40min
@@ -234,16 +233,33 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
-- label: V1 Test
+- label: V1 Test e2e + engine
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/v1
   commands:
-  # split the test to avoid interference
-  - pytest -v -s v1/core
+  # TODO: accuracy does not match, whether setting
+  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+  - pytest -v -s v1/e2e
   - pytest -v -s v1/engine
+
+- label: V1 Test entrypoints
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
   - pytest -v -s v1/entrypoints
+
+- label: V1 Test others
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  # split the test to avoid interference
+  - pytest -v -s v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/sample
   - pytest -v -s v1/logits_processors
@@ -256,9 +272,6 @@ steps:
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_metrics_reader.py
-  # TODO: accuracy does not match, whether setting
-  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-  - pytest -v -s v1/e2e
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -312,7 +325,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests
@@ -449,8 +462,8 @@ steps:
   - tests/quantization
   commands:
   # temporary install here since we need nightly, will move to requirements/test.in
-  # after torchao 0.12 release
-  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
@@ -553,8 +566,7 @@ steps:
   - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-  - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+  - pytest -v -s models/multimodal/processing
 
 - label: Multi-Modal Models Test (Standard)
   mirror_hardwares: [amdexperimental]
@@ -654,6 +666,7 @@ steps:
   # Quantization
   - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
   - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+  - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -663,6 +676,7 @@ steps:
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
   - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 
 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -755,6 +769,11 @@ steps:
   - pytest -v -s plugins_tests/test_platform_plugins.py
   - pip uninstall vllm_add_dummy_platform -y
   # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # end io_processor plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model
@@ -791,13 +810,14 @@ steps:
   # requires multi-GPU testing for validation.
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
-  - pytest -v -s -x lora/test_multi_loras_with_tp.py
+  - pytest -v -s -x lora/test_llm_with_multi_loras.py
 
 
 - label: Weight Loading Multiple GPU Test # 33min
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_gpus: 2
+  optional: true
   source_file_dependencies:
   - vllm/
   - tests/weight_loading
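
The plugin tests above follow an install/run/uninstall cycle so each plugin is only active for its own block of tests. A generic sketch of that cycle, with a hypothetical plugin directory and test module (the real pairs in this pipeline are prithvi_io_processor_plugin with test_io_processor_plugins.py and vllm_add_dummy_platform with test_platform_plugins.py):

    # Hypothetical names, for illustration only
    pip install -e ./plugins/my_dummy_plugin
    pytest -v -s plugins_tests/test_my_dummy_plugin.py
    pip uninstall my_dummy_plugin -y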

.github/scale-config.yml

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+# scale-config.yml:
+# Powers what instance types are available for GHA auto-scaled
+# runners. Runners listed here will be available as self hosted
+# runners, configuration is directly pulled from the main branch.
+# runner_types:
+#   runner_label:
+#     instance_type: m4.large
+#     os: linux
+#     # min_available defaults to the global cfg in the ALI Terraform
+#     min_available: undefined
+#     # when max_available value is not defined, no max runners is enforced
+#     max_available: undefined
+#     disk_size: 50
+#     is_ephemeral: true
+
+runner_types:
+  linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: true
+    os: linux

.github/workflows/issue_autolabel.yml

Lines changed: 4 additions & 0 deletions
@@ -49,6 +49,10 @@ jobs:
               term: "VLLM_ROCM_",
               searchIn: "both"
             },
+            {
+              term: "aiter",
+              searchIn: "title"
+            },
             {
               term: "rocm",
               searchIn: "title"

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ repos:
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.34.0
+  rev: v1.35.5
   hooks:
   - id: typos
 - repo: https://github.com/PyCQA/isort
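
To reproduce the bumped typos check locally, running just that hook through pre-commit should work (assuming pre-commit is installed):

    pip install pre-commit
    pre-commit run typos --all-files   # runs only the typos hook across the repository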

CMakeLists.txt

Lines changed: 5 additions & 3 deletions
@@ -30,7 +30,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12", "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
 
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -541,6 +541,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
       "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -559,6 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
       "csrc/quantization/fp4/nvfp4_experts_quant.cu"
       "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
       "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
