Commit 6fa270b

Merge remote-tracking branch 'upstream/main'
2 parents: d81d420 + 7be5d11

138 files changed: +5024, -6017 lines changed

.buildkite/generate_index.py

Lines changed: 16 additions & 2 deletions
@@ -8,7 +8,8 @@
 <html>
 <body>
 <h1>Links for vLLM</h1/>
-<a href="../{wheel_html_escaped}">{wheel}</a><br/>
+<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
 </body>
 </html>
 """
@@ -21,7 +22,20 @@
 
 with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
+    if "x86_64" in filename:
+        x86_wheel = filename
+        arm_wheel = filename.replace("x86_64", "aarch64")
+    elif "aarch64" in filename:
+        x86_wheel = filename.replace("aarch64", "x86_64")
+        arm_wheel = filename
+    else:
+        raise ValueError(f"Unsupported wheel: {filename}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+        template.format(
+            x86_wheel=x86_wheel,
+            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+            arm_wheel=arm_wheel,
+            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+        )
     )
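
A minimal standalone sketch of the naming logic added above; the wheel filename here is hypothetical (the script takes it from args.wheel):

filename = "vllm-0.10.0+cu128-cp38-abi3-manylinux1_x86_64.whl"  # hypothetical example

if "x86_64" in filename:
    x86_wheel = filename
    arm_wheel = filename.replace("x86_64", "aarch64")
elif "aarch64" in filename:
    x86_wheel = filename.replace("aarch64", "x86_64")
    arm_wheel = filename
else:
    raise ValueError(f"Unsupported wheel: {filename}")

# CloudFront needs '+' escaped in the href; the visible link text keeps it.
print(x86_wheel.replace("+", "%2B"))  # ...%2Bcu128...x86_64.whl
print(arm_wheel)                      # ...+cu128...aarch64.whl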

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml

Lines changed: 0 additions & 12 deletions
This file was deleted.

.buildkite/lm-eval-harness/configs/models-large.txt

Lines changed: 0 additions & 1 deletion
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml

.buildkite/nightly-benchmarks/scripts/compare-json-results.py

Lines changed: 119 additions & 27 deletions
@@ -3,44 +3,129 @@
 import argparse
 import json
 import os
+from importlib import util
 
 import pandas as pd
 
+plotly_found = util.find_spec("plotly.express") is not None
+
 
 def compare_data_columns(
     files, name_column, data_column, info_cols, drop_column, debug=False
 ):
-    print("\ncompare_data_column: " + data_column)
+    """
+    Align concatenation by keys derived from info_cols instead of row order.
+    - Pick one canonical key list: subset of info_cols present in ALL files.
+    - For each file: set index to those keys, aggregate duplicates
+      (mean for metric, first for names).
+    - Concat along axis=1 (indexes align), then reset_index so callers can
+      group by columns.
+    - If --debug, add a <file_label>_name column per file.
+    """
+    print("\ncompare_data_column:", data_column)
+
     frames = []
     raw_data_cols = []
    compare_frames = []
+
+    # 1) choose a canonical key list from info_cols that exists in ALL files
+    cols_per_file = []
+    for f in files:
+        try:
+            df_tmp = pd.read_json(f, orient="records")
+        except Exception as err:
+            raise ValueError(f"Failed to read {f}") from err
+        cols_per_file.append(set(df_tmp.columns))
+
+    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
+    if not key_cols:
+        # soft fallback: use any info_cols present in the first file
+        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
+    if not key_cols:
+        raise ValueError(
+            "No common key columns found from info_cols across the input files."
+        )
+
+    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
+    meta_added = False
+
     for file in files:
-        data_df = pd.read_json(file)
-        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
-        # Show all info columns in the first couple columns
-        if not frames:
-            for col in info_cols:
-                if col not in serving_df.columns:
-                    print(f"Skipping missing column: {col}")
-                    continue
-                frames.append(serving_df[col])
-        # only show test name under debug mode
-        if debug is True:
-            serving_df = serving_df.rename(columns={name_column: file + "_name"})
-            frames.append(serving_df[file + "_name"])
-
-        file = "/".join(file.split("/")[:-1])
-        serving_df = serving_df.rename(columns={data_column: file})
-        frames.append(serving_df[file])
-        raw_data_cols.append(file)
-        compare_frames.append(serving_df[file])
+        df = pd.read_json(file, orient="records")
+
+        # Keep rows that actually have the compared metric (same as original behavior)
+        if drop_column in df.columns:
+            df = df.dropna(subset=[drop_column], ignore_index=True)
+
+        # Stabilize numeric key columns (harmless if missing)
+        for c in (
+            "Input Len",
+            "Output Len",
+            "TP Size",
+            "PP Size",
+            "# of max concurrency.",
+            "qps",
+        ):
+            if c in df.columns:
+                df[c] = pd.to_numeric(df[c], errors="coerce")
+
+        # Ensure all key columns exist
+        for c in key_cols:
+            if c not in df.columns:
+                df[c] = pd.NA
+
+        # Set index = key_cols and aggregate duplicates → unique MultiIndex
+        df_idx = df.set_index(key_cols, drop=False)
+
+        # meta (key columns), unique per key
+        meta = df_idx[key_cols]
+        if not meta.index.is_unique:
+            meta = meta.groupby(level=key_cols, dropna=False).first()
+
+        # metric series for this file, aggregated to one row per key
+        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
+        s = df_idx[data_column]
+        if not s.index.is_unique:
+            s = s.groupby(level=key_cols, dropna=False).mean()
+        s.name = file_label  # column label like original
+
+        # add meta once (from first file) so keys are the leftmost columns
+        if not meta_added:
+            frames.append(meta)
+            meta_added = True
+
+        # (NEW) debug: aligned test-name column per file
+        if debug and name_column in df_idx.columns:
+            name_s = df_idx[name_column]
+            if not name_s.index.is_unique:
+                name_s = name_s.groupby(level=key_cols, dropna=False).first()
+            name_s.name = f"{file_label}_name"
+            frames.append(name_s)
+
+        frames.append(s)
+        raw_data_cols.append(file_label)
+        compare_frames.append(s)
+
+    # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
     if len(compare_frames) >= 2:
-        # Compare numbers among two files
-        ratio_df = compare_frames[1] / compare_frames[0]
-        frames.append(ratio_df)
-        compare_frames.pop(1)
+        base = compare_frames[0]
+        current = compare_frames[-1]
+        ratio = current / base
+        ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
+        ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+        frames.append(ratio)
 
+    # 4) concat on columns with aligned MultiIndex;
+    #    then reset_index to return keys as columns
     concat_df = pd.concat(frames, axis=1)
+    concat_df = concat_df.reset_index(drop=True).reset_index()
+    if "index" in concat_df.columns:
+        concat_df = concat_df.drop(columns=["index"])
+
+    # Ensure key/info columns appear first (in your info_cols order)
+    front = [c for c in info_cols if c in concat_df.columns]
+    rest = [c for c in concat_df.columns if c not in front]
+    concat_df = concat_df[front + rest]
+
     print(raw_data_cols)
     return concat_df, raw_data_cols
 
@@ -67,6 +152,15 @@ def split_json_by_tp_pp(
 
     df = pd.DataFrame(data)
 
+    # Keep only "serving" tests
+    name_col = next(
+        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
+    )
+    if name_col:
+        df = df[
+            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
+        ].copy()
+
     # Handle alias column names
     rename_map = {
         "tp_size": "TP Size",
@@ -181,16 +275,14 @@ def split_json_by_tp_pp(
                 f"Expected subset: {filtered_info_cols}, "
                 f"but DataFrame has: {list(output_df.columns)}"
             )
-
        output_df_sorted = output_df.sort_values(by=existing_group_cols)
        output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
        for name, group in output_groups:
            html = group.to_html()
            text_file.write(html_msgs_for_data_cols[i])
            text_file.write(html)
 
-            if plot is True:
-                import pandas as pd
+            if plot and plotly_found:
                 import plotly.express as px
 
                 df = group[raw_data_cols]
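
To make the key-aligned concatenation concrete, here is a small self-contained pandas sketch (invented columns and values, not the benchmark schema): index each result set by shared key columns, concatenate on those keys, and mask the ratio where the baseline is zero.

import pandas as pd

key_cols = ["TP Size", "qps"]
run_a = pd.DataFrame({"TP Size": [1, 2], "qps": [4, 4], "Tput": [10.0, 18.0]})
run_b = pd.DataFrame({"TP Size": [2, 1], "qps": [4, 4], "Tput": [20.0, 11.0]})  # same keys, different row order

# Rows align on the key index, not on position.
a = run_a.set_index(key_cols)["Tput"].rename("run_a")
b = run_b.set_index(key_cols)["Tput"].rename("run_b")

merged = pd.concat([a, b], axis=1)
merged["ratio"] = (merged["run_b"] / merged["run_a"]).mask(merged["run_a"] == 0)
print(merged.reset_index())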

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
   - label: "Annotate release workflow"

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 6 additions & 1 deletion
@@ -46,6 +46,11 @@ function cpu_tests() {
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -v -s tests/kernels/test_onednn.py"
+
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
@@ -99,4 +104,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 0 deletions
@@ -328,6 +328,7 @@ steps:
   - pytest -v -s compile/test_sequence_parallelism.py
   - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
+  - pytest -v -s compile/test_decorator.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
   mirror_hardwares: [amdexperimental]
@@ -341,6 +342,7 @@ steps:
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
   - pytest -v -s compile/piecewise/test_full_cudagraph.py
+  - pytest -v -s compile/piecewise/test_multiple_graphs.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental]

CMakeLists.txt

Lines changed: 0 additions & 2 deletions
@@ -357,9 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 
   set(MARLIN_SRCS
-    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")

benchmarks/benchmark_throughput.py

Lines changed: 2 additions & 2 deletions
@@ -597,8 +597,8 @@ def validate_args(args):
     # https://github.com/vllm-project/vllm/issues/16222
     if args.data_parallel_size > 1:
         raise ValueError(
-            "Data parallel is not supported in offline benchmark, \
-            please use benchmark serving instead"
+            "Data parallel is not supported in offline benchmark, "
+            "please use benchmark serving instead"
         )
 
 
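
The change replaces a backslash continuation inside a string literal with adjacent-literal concatenation; a toy snippet (not from the file) shows why the old form leaks indentation into the message:

# The backslash escapes the newline, so the next line's leading spaces
# become part of the string.
old = "Data parallel is not supported in offline benchmark, \
            please use benchmark serving instead"

# Adjacent string literals concatenate to exactly the characters written.
new = (
    "Data parallel is not supported in offline benchmark, "
    "please use benchmark serving instead"
)

print(repr(old))  # note the run of spaces before "please"
print(repr(new))  # single space before "please"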

benchmarks/kernels/benchmark_grouped_gemm_cutlass.py

Lines changed: 34 additions & 1 deletion
@@ -80,6 +80,11 @@ def bench_run(
         a, score, topk, renormalize=False
     )
 
+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
     def run_triton_moe(
         a: torch.Tensor,
         w1: torch.Tensor,
@@ -111,6 +116,10 @@ def run_cutlass_moe(
         w2: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         per_act_token: bool,
@@ -125,6 +134,10 @@ def run_cutlass_moe(
             topk_ids,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             per_act_token,
             a1_scale=None,
         )
@@ -136,6 +149,10 @@ def run_cutlass_from_graph(
         w2_q: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
@@ -150,6 +167,10 @@ def run_cutlass_from_graph(
             topk_ids,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             per_act_token,
             a1_scale=None,
         )
@@ -194,6 +215,10 @@ def replay_graph(graph, num_repeats):
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
         )
@@ -231,6 +256,10 @@ def replay_graph(graph, num_repeats):
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
         "per_act_token": per_act_token,
+        "ab_strides1": ab_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides1": c_strides1,
+        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -289,6 +318,10 @@ def replay_graph(graph, num_repeats):
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
             per_act_token,
@@ -297,7 +330,7 @@ def replay_graph(graph, num_repeats):
 
     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
             globals=globals,
             label=label,
            sub_label=sub_label,
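
The new tensors carry one leading-stride value per expert for the two grouped GEMMs. A minimal sketch of how they are built, assuming w1 has shape (num_experts, 2 * n, k) and w2 has shape (num_experts, k, n); the sizes here are made up, and the benchmark allocates the tensors on CUDA:

import torch

num_experts, n, k = 8, 1024, 2048  # hypothetical MoE sizes

ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64)     # A/B leading stride, first GEMM
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64)     # A/B leading stride, second GEMM
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64)  # C leading stride, first GEMM
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64)      # C leading stride, second GEMM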
