
Commit 298dc89

committed
Add RMS Norm Quant fp8 Helion Kernel
Signed-off-by: Yanan Cao <[email protected]>
1 parent be15f51 commit 298dc89

File tree: 1 file changed, 291 additions and 0 deletions
@@ -0,0 +1,291 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Helion custom op for RMSNorm with FP8 quantization.
"""

import helion
import helion.language as hl
import torch

from vllm.compilation.helion.benchmark import KernelBenchmark
from vllm.compilation.helion.custom_op import HelionCustomOp
from vllm.model_executor.custom_op import CustomOp

@torch.library.custom_op(
    "my_helion_lib::rms_norm_fp8",
    mutates_args=(),
    device_types="cuda",
)
@helion.kernel(
    config=helion.Config(
        block_sizes=[1],
        indexing=[
            "tensor_descriptor",
            "pointer",
            "pointer",
            "pointer",
            "pointer",
            "tensor_descriptor",
            "pointer",
            "pointer",
        ],
        load_eviction_policies=["", "first", "", "", "first", "last"],
        num_stages=7,
        num_warps=8,
        pid_type="flat",
        range_flattens=[None],
        range_multi_buffers=[None],
        range_num_stages=[0],
        range_unroll_factors=[0],
        range_warp_specializes=[],
        reduction_loops=[None],
    ),
    static_shapes=False,
)
def _rms_norm_fp8_helion_kernel(
    input: torch.Tensor,
    weight: torch.Tensor,
    scale: torch.Tensor,
    epsilon: float,
) -> torch.Tensor:
    """
    Helion kernel for RMSNorm with FP8 quantization.

    Operation: quantize_fp8(RMSNorm(input, weight, epsilon))

    Algorithm (matching the CUDA reference exactly):
        1. variance = sum(x^2) / hidden_size (per token/row)
        2. norm_factor = rsqrt(variance + epsilon)
        3. normalized = (input * norm_factor).to(input.dtype) * weight
        4. quantized = normalized * (1 / scale)

    Args:
        input (Tensor): Input tensor with shape [batch, hidden_size]
        weight (Tensor): Weight tensor with shape [hidden_size]
        scale (Tensor): Scalar scale factor for FP8 quantization
        epsilon (float): Epsilon value for numerical stability

    Returns:
        Tensor: Output tensor with the same shape as input and dtype
        float8_e4m3fn
    """
    m, n = input.size()
    assert weight.size(0) == n, f"weight size mismatch {weight.size(0)} != {n}"
    assert scale.numel() == 1, "Scale must be a scalar Tensor"

    out = torch.empty_like(input, dtype=torch.float8_e4m3fn)

    # Tile over the batch dimension only (following the Helion rms_norm example)
    for tile_m in hl.tile(m):
        scale_val = hl.load(scale, [0])
        inv_scale = 1.0 / scale_val

        input_row = input[tile_m, :].to(torch.float32)

        # variance = sum(x^2) / hidden_size, computed in fp32
        x_squared = input_row * input_row
        variance = torch.mean(x_squared, dim=-1)

        # normalization factor
        inv_rms = torch.rsqrt(variance + epsilon)

        # out_norm = ((scalar_t)(x * s_variance)) * src2.val[j];
        normalized = (input_row * inv_rms[:, None]).to(input.dtype)  # fp32 → bf16
        weighted = (normalized * weight[:]).to(torch.float32)  # bf16*bf16 → fp32

        # Quantize to FP8
        result_scaled = weighted * inv_scale
        out[tile_m, :] = result_scaled.to(out.dtype)

    return out
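
# For illustration only: a minimal eager-mode reference for the same four
# steps above, handy for sanity-checking the kernel's numerics. The helper
# name is hypothetical and not part of the op registration; it assumes a
# 2-D input.
def _rms_norm_fp8_reference(
    input: torch.Tensor,
    weight: torch.Tensor,
    scale: torch.Tensor,
    epsilon: float,
) -> torch.Tensor:
    # Steps 1-2: per-row variance and normalization factor, in fp32
    x = input.to(torch.float32)
    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + epsilon)
    # Step 3: cast back to the input dtype before applying the weight
    weighted = ((x * inv_rms).to(input.dtype) * weight).to(torch.float32)
    # Step 4: static-scale quantization to FP8
    return (weighted / scale).to(torch.float8_e4m3fn)
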
@_rms_norm_fp8_helion_kernel.register_fake
def _rms_norm_fp8_helion_kernel_fake(
    input: torch.Tensor,
    weight: torch.Tensor,
    scale: torch.Tensor,
    epsilon: float,
) -> torch.Tensor:
    """
    Fake/meta implementation for the rms_norm_fp8 Helion kernel.
    Defines the input/output shape relationship without actual computation.

    Shape contract:
        - input: [..., hidden_size]
        - weight: [hidden_size]
        - scale: scalar (numel == 1)
        - epsilon: float
        - returns: [..., hidden_size] with dtype float8_e4m3fn
    """
    return torch.empty_like(input, dtype=torch.float8_e4m3fn)

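# What the fake impl buys us, sketched as a hypothetical tracing snippet
# (illustrative only): shape and dtype propagation works without launching
# the CUDA kernel.
#
#     with torch._subclasses.fake_tensor.FakeTensorMode():
#         x = torch.empty(16, 4096, dtype=torch.bfloat16, device="cuda")
#         w = torch.empty(4096, dtype=torch.bfloat16, device="cuda")
#         s = torch.empty(1, dtype=torch.float32, device="cuda")
#         y = torch.ops.my_helion_lib.rms_norm_fp8(x, w, s, 1e-5)
#         assert y.shape == x.shape and y.dtype == torch.float8_e4m3fn
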
# Now define the vLLM CustomOp wrapper
@CustomOp.register("rms_norm_fp8_helion")
class RMSNormFp8Helion(HelionCustomOp):
    """
    RMSNorm with FP8 quantization using Helion.

    This operation computes:
        quantize_fp8(RMSNorm(input, weight, epsilon))

    The operation combines:
    1. Compute the RMS (root mean square): rsqrt(mean(x^2) + epsilon)
    2. Normalize the input by the RMS
    3. Apply elementwise multiplication with weight
    4. Quantize the result to FP8 format

    Shapes:
        input: (num_tokens, hidden_size)
        weight: (hidden_size,)
        scale: (1,) - scalar scale factor for FP8 quantization
        output: (num_tokens, hidden_size) with dtype float8_e4m3fn
    """

    def forward_helion(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        scale: torch.Tensor,
        epsilon: float = 1e-5,
    ) -> torch.Tensor:
        """
        Helion kernel implementation.

        Args:
            input: Input tensor with shape (num_tokens, hidden_size)
            weight: Weight tensor with shape (hidden_size,)
            scale: Scale tensor (scalar) for FP8 quantization
            epsilon: Epsilon for numerical stability

        Returns:
            Output tensor with shape (num_tokens, hidden_size) and dtype
            float8_e4m3fn
        """
        return torch.ops.my_helion_lib.rms_norm_fp8(input, weight, scale, epsilon)

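# Hypothetical usage sketch for the wrapper above (shapes are examples only):
#
#     op = RMSNormFp8Helion()
#     x = torch.randn(256, 4096, dtype=torch.bfloat16, device="cuda")
#     w = torch.randn(4096, dtype=torch.bfloat16, device="cuda")
#     s = torch.tensor([0.5], dtype=torch.float32, device="cuda")
#     y = op.forward_helion(x, w, s, epsilon=1e-5)  # -> float8_e4m3fn
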
class RMSNormFp8Benchmark(KernelBenchmark):
    """
    Benchmark harness for the RMSNorm-FP8 kernel.

    This class provides test configurations and benchmark utilities
    for the RMSNormFp8Helion custom op.
    """

    benchmark_name = "rms_norm_fp8"

    def __init__(self):
        """Initialize the benchmark."""
        self.op = RMSNormFp8Helion()
        self.epsilon = 1e-5

    def get_quick_test_shapes(self) -> list[tuple[list[tuple], torch.dtype]]:
        """
        Get test configurations for quick smoke testing.

        Returns:
            List of (shapes, dtype) tuples.
            Input shapes are (num_tokens, hidden_size).
        """
        return [
            (
                [
                    (1, 4096),
                    (256, 4096),
                    (1024, 4096),
                    (1, 8192),
                    (256, 8192),
                    (1024, 8192),
                ],
                torch.bfloat16,
            ),
        ]

    def get_full_test_shapes(self) -> list[tuple[list[tuple], torch.dtype]]:
        """
        Get test configurations for comprehensive benchmarking.

        Returns:
            List of (shapes, dtype) tuples.
            Input shapes are (num_tokens, hidden_size).
        """
        num_tokens_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
        hidden_sizes = [512, 1024, 2048, 4096, 5504, 6912, 7168, 8192, 14336, 16384]

        shapes_bf16 = []
        shapes_fp16 = []

        for num_tokens in num_tokens_list:
            for hidden_size in hidden_sizes:
                shape = (num_tokens, hidden_size)
                shapes_bf16.append(shape)
                shapes_fp16.append(shape)

        return [
            (shapes_bf16, torch.bfloat16),
            (shapes_fp16, torch.float16),
        ]

    def create_inputs(
        self, dtype: torch.dtype, **shape_params
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Create input tensors for the rms_norm_fp8 kernel.

        Args:
            dtype: Data type for inputs
            **shape_params: Must contain 'shape' - a tuple specifying input shape

        Returns:
            Tuple of (input_tensor, weight, scale)
            - input_tensor has shape (num_tokens, hidden_size)
            - weight has shape (hidden_size,)
            - scale is a scalar tensor
        """
        shape = shape_params["shape"]
        hidden_size = shape[-1]

        input_tensor = torch.randn(*shape, dtype=dtype, device="cuda")
        weight = torch.randn(hidden_size, dtype=dtype, device="cuda")
        scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")
        return input_tensor, weight, scale

    def run_baseline(
        self, input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor
    ) -> torch.Tensor:
        """
        Run the baseline reference kernel.

        This is the existing vLLM CUDA kernel that Helion is meant to
        replace or accelerate. Used for performance comparison in benchmarks.

        Args:
            input: Input tensor with shape (num_tokens, hidden_size)
            weight: Weight tensor with shape (hidden_size,)
            scale: Scale tensor (scalar)

        Returns:
            Output tensor from the baseline kernel
        """
        out = torch.empty_like(input, dtype=torch.float8_e4m3fn)
        torch.ops._C.rms_norm_static_fp8_quant(out, input, weight, scale, self.epsilon)
        return out

    def run_helion(
        self, input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor
    ) -> torch.Tensor:
        """
        Run the Helion kernel.

        Args:
            input: Input tensor with shape (num_tokens, hidden_size)
            weight: Weight tensor with shape (hidden_size,)
            scale: Scale tensor (scalar)

        Returns:
            Output tensor from the Helion kernel
        """
        return self.op.forward_helion(input, weight, scale, self.epsilon)
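A quick correctness check one might run with this harness (a sketch; it assumes a CUDA device and that the baseline op torch.ops._C.rms_norm_static_fp8_quant is available, and the tolerances are illustrative, chosen loosely for FP8):

    bench = RMSNormFp8Benchmark()
    x, w, s = bench.create_inputs(torch.bfloat16, shape=(256, 4096))
    ref = bench.run_baseline(x, w, s)
    out = bench.run_helion(x, w, s)
    # FP8 tensors are compared after upcasting to fp32
    torch.testing.assert_close(
        out.to(torch.float32), ref.to(torch.float32), atol=0.125, rtol=0.125
    )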
