
import torch

+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import (  # noqa: E501
+    triton_scaled_mm,
+)
+from vllm.model_executor.layers.quantization.utils import replace_parameter
from vllm.platforms import current_platform

-from .cutlass import CutlassScaledMMLinearKernel
-from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
+from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig


-class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+class TritonScaledMMLinearKernel(ScaledMMLinearKernel):
    @classmethod
    def get_min_capability(cls) -> int:
        return 75

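+    # Triton kernels run on both NVIDIA and AMD GPUs, so availability is
+    # keyed on "CUDA-alike" platforms (CUDA or ROCm) rather than CUDA alone.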
+    @classmethod
+    def is_supported(
+        cls, compute_capability: int | None = None
+    ) -> tuple[bool, str | None]:
+        if current_platform.is_cuda_alike():
+            return True, None
+        return False, "Requires ROCm or CUDA."
+
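+    # Platform gating lives in is_supported() above; this check only
+    # validates the quantization config itself.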
    @classmethod
    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
-        if current_platform.is_cpu():
-            return (
-                False,
-                "TritonScaledMMLinearKernel requires Triton which is not "
-                + "currently supported on CPU.",
-            )
        if not c.input_symmetric:
-            return (
-                False,
-                "TritonScaledMMLinearKernel only supports symmetric " + "quantization.",
-            )
+            return False, "Only symmetric input is supported."
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        super().process_weights_after_loading(layer)
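+        # WEIGHT
+        # Loaded as (out_features, in_features); transpose so the GEMM in
+        # apply_weights can compute x_q @ w_q directly.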
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(
+            layer,
+            self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False),
+        )
+
+        # INPUT SCALE
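+        # Static scheme: fused modules (e.g. QKV) may load one scale per
+        # shard, so collapse them to a single per-tensor scale with max().
+        # Dynamic scheme: scales are computed from activations at runtime,
+        # so nothing is kept on the layer.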
+        if self.config.is_static_input_scheme:
+            input_scale = getattr(layer, self.i_s_name)
+            replace_parameter(
+                layer,
+                self.i_s_name,
+                torch.nn.Parameter(input_scale.max(), requires_grad=False),
+            )
+            setattr(layer, self.i_zp_name, None)
+        else:
+            setattr(layer, self.i_s_name, None)
+            setattr(layer, self.i_zp_name, None)
+
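+        # Symmetric-only path: no asymmetric zero-point adjustment is kept.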
+        setattr(layer, self.azp_adj_name, None)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
-        return super().apply_weights(layer, x, bias)
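+        # Quantize activations to int8 on the fly: per-tensor with the static
+        # scale when i_s is set, dynamic per-token scales when i_s is None.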
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+
+        x_q, x_s, x_zp = ops.scaled_int8_quant(
+            x.contiguous(), i_s, i_zp, symmetric=True
+        )
+
+        assert x_zp is None, "Triton kernel only supports symmetric quantization"
+
+        return triton_scaled_mm(
+            x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=x.dtype, bias=bias
+        )