Commit 6fa270b

Merge remote-tracking branch 'upstream/main'
2 parents: d81d420 + 7be5d11

138 files changed: +5024, -6017 lines changed

.buildkite/generate_index.py

Lines changed: 16 additions & 2 deletions
@@ -8,7 +8,8 @@
 <html>
 <body>
 <h1>Links for vLLM</h1/>
-<a href="../{wheel_html_escaped}">{wheel}</a><br/>
+<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
 </body>
 </html>
 """
@@ -21,7 +22,20 @@
 
 with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
+    if "x86_64" in filename:
+        x86_wheel = filename
+        arm_wheel = filename.replace("x86_64", "aarch64")
+    elif "aarch64" in filename:
+        x86_wheel = filename.replace("aarch64", "x86_64")
+        arm_wheel = filename
+    else:
+        raise ValueError(f"Unsupported wheel: {filename}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+        template.format(
+            x86_wheel=x86_wheel,
+            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+            arm_wheel=arm_wheel,
+            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+        )
     )
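
A minimal standalone sketch of the naming logic added above; the wheel filename here is hypothetical (the script takes it from args.wheel):

filename = "vllm-0.10.0+cu128-cp38-abi3-manylinux1_x86_64.whl"  # hypothetical example

if "x86_64" in filename:
    x86_wheel = filename
    arm_wheel = filename.replace("x86_64", "aarch64")
elif "aarch64" in filename:
    x86_wheel = filename.replace("aarch64", "x86_64")
    arm_wheel = filename
else:
    raise ValueError(f"Unsupported wheel: {filename}")

# CloudFront needs '+' escaped in the href; the visible link text keeps it.
print(x86_wheel.replace("+", "%2B"))  # ...%2Bcu128...x86_64.whl
print(arm_wheel)                      # ...+cu128...aarch64.whl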

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml

Lines changed: 0 additions & 12 deletions
This file was deleted.

.buildkite/lm-eval-harness/configs/models-large.txt

Lines changed: 0 additions & 1 deletion
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml

.buildkite/nightly-benchmarks/scripts/compare-json-results.py

Lines changed: 119 additions & 27 deletions
@@ -3,44 +3,129 @@
 import argparse
 import json
 import os
+from importlib import util
 
 import pandas as pd
 
+plotly_found = util.find_spec("plotly.express") is not None
+
 
 def compare_data_columns(
     files, name_column, data_column, info_cols, drop_column, debug=False
 ):
-    print("\ncompare_data_column: " + data_column)
+    """
+    Align concatenation by keys derived from info_cols instead of row order.
+    - Pick one canonical key list: subset of info_cols present in ALL files.
+    - For each file: set index to those keys, aggregate duplicates
+      (mean for metric, first for names).
+    - Concat along axis=1 (indexes align), then reset_index so callers can
+      group by columns.
+    - If --debug, add a <file_label>_name column per file.
+    """
+    print("\ncompare_data_column:", data_column)
+
     frames = []
     raw_data_cols = []
    compare_frames = []
+
+    # 1) choose a canonical key list from info_cols that exists in ALL files
+    cols_per_file = []
+    for f in files:
+        try:
+            df_tmp = pd.read_json(f, orient="records")
+        except Exception as err:
+            raise ValueError(f"Failed to read {f}") from err
+        cols_per_file.append(set(df_tmp.columns))
+
+    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
+    if not key_cols:
+        # soft fallback: use any info_cols present in the first file
+        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
+    if not key_cols:
+        raise ValueError(
+            "No common key columns found from info_cols across the input files."
+        )
+
+    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
+    meta_added = False
+
     for file in files:
-        data_df = pd.read_json(file)
-        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
-        # Show all info columns in the first couple columns
-        if not frames:
-            for col in info_cols:
-                if col not in serving_df.columns:
-                    print(f"Skipping missing column: {col}")
-                    continue
-                frames.append(serving_df[col])
-        # only show test name under debug mode
-        if debug is True:
-            serving_df = serving_df.rename(columns={name_column: file + "_name"})
-            frames.append(serving_df[file + "_name"])
-
-        file = "/".join(file.split("/")[:-1])
-        serving_df = serving_df.rename(columns={data_column: file})
-        frames.append(serving_df[file])
-        raw_data_cols.append(file)
-        compare_frames.append(serving_df[file])
+        df = pd.read_json(file, orient="records")
+
+        # Keep rows that actually have the compared metric (same as original behavior)
+        if drop_column in df.columns:
+            df = df.dropna(subset=[drop_column], ignore_index=True)
+
+        # Stabilize numeric key columns (harmless if missing)
+        for c in (
+            "Input Len",
+            "Output Len",
+            "TP Size",
+            "PP Size",
+            "# of max concurrency.",
+            "qps",
+        ):
+            if c in df.columns:
+                df[c] = pd.to_numeric(df[c], errors="coerce")
+
+        # Ensure all key columns exist
+        for c in key_cols:
+            if c not in df.columns:
+                df[c] = pd.NA
+
+        # Set index = key_cols and aggregate duplicates → unique MultiIndex
+        df_idx = df.set_index(key_cols, drop=False)
+
+        # meta (key columns), unique per key
+        meta = df_idx[key_cols]
+        if not meta.index.is_unique:
+            meta = meta.groupby(level=key_cols, dropna=False).first()
+
+        # metric series for this file, aggregated to one row per key
+        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
+        s = df_idx[data_column]
+        if not s.index.is_unique:
+            s = s.groupby(level=key_cols, dropna=False).mean()
+        s.name = file_label  # column label like original
+
+        # add meta once (from first file) so keys are the leftmost columns
+        if not meta_added:
+            frames.append(meta)
+            meta_added = True
+
+        # (NEW) debug: aligned test-name column per file
+        if debug and name_column in df_idx.columns:
+            name_s = df_idx[name_column]
+            if not name_s.index.is_unique:
+                name_s = name_s.groupby(level=key_cols, dropna=False).first()
+            name_s.name = f"{file_label}_name"
+            frames.append(name_s)
+
+        frames.append(s)
+        raw_data_cols.append(file_label)
+        compare_frames.append(s)
+
+    # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
     if len(compare_frames) >= 2:
-        # Compare numbers among two files
-        ratio_df = compare_frames[1] / compare_frames[0]
-        frames.append(ratio_df)
-        compare_frames.pop(1)
+        base = compare_frames[0]
+        current = compare_frames[-1]
+        ratio = current / base
+        ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
+        ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+        frames.append(ratio)
 
+    # 4) concat on columns with aligned MultiIndex;
+    #    then reset_index to return keys as columns
     concat_df = pd.concat(frames, axis=1)
+    concat_df = concat_df.reset_index(drop=True).reset_index()
+    if "index" in concat_df.columns:
+        concat_df = concat_df.drop(columns=["index"])
+
+    # Ensure key/info columns appear first (in your info_cols order)
+    front = [c for c in info_cols if c in concat_df.columns]
+    rest = [c for c in concat_df.columns if c not in front]
+    concat_df = concat_df[front + rest]
+
     print(raw_data_cols)
     return concat_df, raw_data_cols
 
@@ -67,6 +152,15 @@ def split_json_by_tp_pp(
 
     df = pd.DataFrame(data)
 
+    # Keep only "serving" tests
+    name_col = next(
+        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
+    )
+    if name_col:
+        df = df[
+            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
+        ].copy()
+
     # Handle alias column names
     rename_map = {
         "tp_size": "TP Size",
@@ -181,16 +275,14 @@ def split_json_by_tp_pp(
                 f"Expected subset: {filtered_info_cols}, "
                 f"but DataFrame has: {list(output_df.columns)}"
             )
-
        output_df_sorted = output_df.sort_values(by=existing_group_cols)
        output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
        for name, group in output_groups:
            html = group.to_html()
            text_file.write(html_msgs_for_data_cols[i])
            text_file.write(html)
 
-            if plot is True:
-                import pandas as pd
+            if plot and plotly_found:
                 import plotly.express as px
 
                 df = group[raw_data_cols]
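
To make the key-aligned concatenation concrete, here is a small self-contained pandas sketch (invented columns and values, not the benchmark schema): index each result set by shared key columns, concatenate on those keys, and mask the ratio where the baseline is zero.

import pandas as pd

key_cols = ["TP Size", "qps"]
run_a = pd.DataFrame({"TP Size": [1, 2], "qps": [4, 4], "Tput": [10.0, 18.0]})
run_b = pd.DataFrame({"TP Size": [2, 1], "qps": [4, 4], "Tput": [20.0, 11.0]})  # same keys, different row order

# Rows align on the key index, not on position.
a = run_a.set_index(key_cols)["Tput"].rename("run_a")
b = run_b.set_index(key_cols)["Tput"].rename("run_b")

merged = pd.concat([a, b], axis=1)
merged["ratio"] = (merged["run_b"] / merged["run_a"]).mask(merged["run_a"] == 0)
print(merged.reset_index())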

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
   - label: "Annotate release workflow"

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 6 additions & 1 deletion
@@ -46,6 +46,11 @@ function cpu_tests() {
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -v -s tests/kernels/test_onednn.py"
+
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
@@ -99,4 +104,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 0 deletions
@@ -328,6 +328,7 @@ steps:
   - pytest -v -s compile/test_sequence_parallelism.py
   - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
+  - pytest -v -s compile/test_decorator.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
   mirror_hardwares: [amdexperimental]
@@ -341,6 +342,7 @@ steps:
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
   - pytest -v -s compile/piecewise/test_full_cudagraph.py
+  - pytest -v -s compile/piecewise/test_multiple_graphs.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental]

CMakeLists.txt

Lines changed: 0 additions & 2 deletions
@@ -357,9 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 
   set(MARLIN_SRCS
-    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")

benchmarks/benchmark_throughput.py

Lines changed: 2 additions & 2 deletions
@@ -597,8 +597,8 @@ def validate_args(args):
     # https://github.com/vllm-project/vllm/issues/16222
     if args.data_parallel_size > 1:
         raise ValueError(
-            "Data parallel is not supported in offline benchmark, \
-            please use benchmark serving instead"
+            "Data parallel is not supported in offline benchmark, "
+            "please use benchmark serving instead"
         )
 
 
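
The change replaces a backslash continuation inside a string literal with adjacent-literal concatenation; a toy snippet (not from the file) shows why the old form leaks indentation into the message:

# The backslash escapes the newline, so the next line's leading spaces
# become part of the string.
old = "Data parallel is not supported in offline benchmark, \
            please use benchmark serving instead"

# Adjacent string literals concatenate to exactly the characters written.
new = (
    "Data parallel is not supported in offline benchmark, "
    "please use benchmark serving instead"
)

print(repr(old))  # note the run of spaces before "please"
print(repr(new))  # single space before "please"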

benchmarks/kernels/benchmark_grouped_gemm_cutlass.py

Lines changed: 34 additions & 1 deletion
@@ -80,6 +80,11 @@ def bench_run(
         a, score, topk, renormalize=False
     )
 
+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
     def run_triton_moe(
         a: torch.Tensor,
         w1: torch.Tensor,
@@ -111,6 +116,10 @@ def run_cutlass_moe(
         w2: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         per_act_token: bool,
@@ -125,6 +134,10 @@ def run_cutlass_moe(
             topk_ids,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             per_act_token,
             a1_scale=None,
         )
@@ -136,6 +149,10 @@ def run_cutlass_from_graph(
         w2_q: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
@@ -150,6 +167,10 @@ def run_cutlass_from_graph(
             topk_ids,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             per_act_token,
             a1_scale=None,
         )
@@ -194,6 +215,10 @@ def replay_graph(graph, num_repeats):
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
         )
@@ -231,6 +256,10 @@ def replay_graph(graph, num_repeats):
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
         "per_act_token": per_act_token,
+        "ab_strides1": ab_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides1": c_strides1,
+        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -289,6 +318,10 @@ def replay_graph(graph, num_repeats):
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
             per_act_token,
@@ -297,7 +330,7 @@ def replay_graph(graph, num_repeats):
 
     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
             globals=globals,
             label=label,
            sub_label=sub_label,
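
The new tensors carry one leading-stride value per expert for the two grouped GEMMs. A minimal sketch of how they are built, assuming w1 has shape (num_experts, 2 * n, k) and w2 has shape (num_experts, k, n); the sizes here are made up, and the benchmark allocates the tensors on CUDA:

import torch

num_experts, n, k = 8, 1024, 2048  # hypothetical MoE sizes

ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64)     # A/B leading stride, first GEMM
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64)     # A/B leading stride, second GEMM
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64)  # C leading stride, first GEMM
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64)      # C leading stride, second GEMM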
