 
 from vllm.compilation.helion.benchmark import DistributedKernelBenchmark
 from vllm.compilation.helion.custom_op import HelionCustomOp
+from vllm.compilation.helion.register import register_kernel
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 
@@ -188,8 +189,10 @@ def copy_engine_all_reduce_w_progress(
 
 
 # Create a custom op wrapper for fake tensor support
+# TODO(gmagogsfm): remove this custom op registration when torch.compile
+# and make_fx support it
 @torch.library.custom_op(
-    "my_helion_lib::copy_engine_all_reduce_w_progress",
+    "vllm_helion::copy_engine_all_reduce_w_progress",
     mutates_args=("output", "progress"),  # output and progress tensors are mutated
     device_types="cuda",
 )
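The hunk above moves the op into a `vllm_helion` namespace, and the new TODO records that the registration only exists because `torch.compile`/`make_fx` cannot yet trace the raw call. For reference, a minimal sketch of the same `torch.library.custom_op` + `register_fake` pattern, assuming PyTorch >= 2.4; the `demo_lib::scale_into` op name and its body are illustrative, not the vLLM kernel:

import torch

# A mutating custom op: declare the in-place writes via mutates_args so the
# compiler can order the mutation correctly.
@torch.library.custom_op(
    "demo_lib::scale_into",
    mutates_args=("out",),
    device_types="cuda",
)
def scale_into(out: torch.Tensor, x: torch.Tensor, s: float) -> None:
    out.copy_(x * s)  # stand-in for the real CUDA work

@scale_into.register_fake
def _(out: torch.Tensor, x: torch.Tensor, s: float) -> None:
    # Fake impl: no computation. A mutating op with no outputs returns None,
    # which is enough for make_fx/torch.compile to trace through the call.
    return None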
@@ -231,7 +234,36 @@ def copy_engine_all_reduce_w_progress_fake(
 
 # Only define the Helion kernel if Helion is available
 if HELION_AVAILABLE:
-    # Pure Helion kernel for autotuning - this has the autotune method
+
+    def _allreduce_add_rmsnorm_fake(
+        allreduce_buf: torch.Tensor,
+        residual: torch.Tensor,
+        rms_gamma: torch.Tensor,
+        progress: torch.Tensor,
+        rms_eps: float,
+        SPLITS_PER_RANK: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Custom fake implementation for allreduce_add_rmsnorm.
+
+        Shape contract:
+        - allreduce_buf: [M, K]
+        - residual: [M, K]
+        - rms_gamma: [K]
+        - progress: [SPLITS_PER_RANK]
+        - returns: tuple of (normalized_output, updated_residual) both [M, K]
+        """
+        M, K = allreduce_buf.size()
+        out = torch.empty(
+            [M, K], dtype=allreduce_buf.dtype, device=allreduce_buf.device
+        )
+        residual_out = torch.empty(
+            [M, K], dtype=allreduce_buf.dtype, device=allreduce_buf.device
+        )
+        return out, residual_out
+
+    # Apply @register_kernel to the actual Helion kernel
+    @register_kernel("allreduce_add_rmsnorm", fake_impl=_allreduce_add_rmsnorm_fake)
     @helion.kernel(
         autotune_baseline_atol=0.0,
         autotune_baseline_rtol=0.0,
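The fake implementation is now a standalone function handed to `register_kernel` via `fake_impl=`, instead of living inside a `register_fake` block. What it must guarantee is only the shape/dtype contract, with no kernel launch. A rough check of that contract under fake tracing, assuming a CUDA-enabled PyTorch build; the sizes here are invented for illustration:

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Run the fake impl under FakeTensorMode: no memory is allocated and no
# kernel runs, but shapes and dtypes must come out right.
with FakeTensorMode():
    buf = torch.empty(128, 4096, dtype=torch.bfloat16, device="cuda")
    res = torch.empty(128, 4096, dtype=torch.bfloat16, device="cuda")
    gamma = torch.empty(4096, dtype=torch.bfloat16, device="cuda")
    prog = torch.zeros(4, dtype=torch.int32, device="cuda")
    out, res_out = _allreduce_add_rmsnorm_fake(buf, res, gamma, prog, 1e-6, 4)
    assert out.shape == res_out.shape == (128, 4096)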
@@ -273,7 +305,7 @@ def copy_engine_all_reduce_w_progress_fake(
         ),
         static_shapes=True,
     )
-    def _allreduce_add_rmsnorm_pure_helion_kernel(
+    def allreduce_add_rmsnorm(
         allreduce_buf: torch.Tensor,
         residual: torch.Tensor,
         rms_gamma: torch.Tensor,
@@ -343,70 +375,6 @@ def _allreduce_add_rmsnorm_pure_helion_kernel(
 
         return out, residual_out
 
-    # PyTorch custom op wrapper - calls the pure Helion kernel
-    @torch.library.custom_op(
-        "my_helion_lib::allreduce_add_rmsnorm",
-        mutates_args=(),
-        device_types="cuda",
-    )
-    def _allreduce_add_rmsnorm_helion_kernel(
-        allreduce_buf: torch.Tensor,
-        residual: torch.Tensor,
-        rms_gamma: torch.Tensor,
-        progress: torch.Tensor,
-        rms_eps: float,
-        SPLITS_PER_RANK: int,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        PyTorch custom op wrapper for Helion AllReduce+Add+RMSNorm kernel.
-
-        Operation: RMSNorm(AllReduce(input) + residual), returns both normalized
-        and residual
-
-        Args:
-            allreduce_buf: Buffer being filled by AllReduce [M, K]
-            residual: Residual tensor to add [M, K]
-            rms_gamma: RMSNorm gamma weights [K]
-            progress: Progress tracking tensor [SPLITS_PER_RANK]
-            rms_eps: Epsilon for numerical stability
-            SPLITS_PER_RANK: Number of splits per rank
-
-        Returns:
-            Tuple of (normalized_output, updated_residual) both [M, K]
-        """
-        return _allreduce_add_rmsnorm_pure_helion_kernel(
-            allreduce_buf, residual, rms_gamma, progress, rms_eps, SPLITS_PER_RANK
-        )
-
-    @_allreduce_add_rmsnorm_helion_kernel.register_fake
-    def _allreduce_add_rmsnorm_helion_kernel_fake(
-        allreduce_buf: torch.Tensor,
-        residual: torch.Tensor,
-        rms_gamma: torch.Tensor,
-        progress: torch.Tensor,
-        rms_eps: float,
-        SPLITS_PER_RANK: int,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        Fake/meta implementation for allreduce_add_rmsnorm Helion kernel.
-        Defines the input/output shape relationship without actual computation.
-
-        Shape contract:
-        - allreduce_buf: [M, K]
-        - residual: [M, K]
-        - rms_gamma: [K]
-        - progress: [SPLITS_PER_RANK]
-        - returns: tuple of (normalized_output, updated_residual) both [M, K]
-        """
-        M, K = allreduce_buf.size()
-        out = torch.empty(
-            [M, K], dtype=allreduce_buf.dtype, device=allreduce_buf.device
-        )
-        residual_out = torch.empty(
-            [M, K], dtype=allreduce_buf.dtype, device=allreduce_buf.device
-        )
-        return out, residual_out
-
 
 def helion_allreduce_add_rmsnorm(
     input_shared: torch.Tensor,
@@ -462,12 +430,12 @@ def helion_allreduce_add_rmsnorm(
     )
 
     # Perform AllReduce with progress tracking (custom op handles fake mode and symmetric memory conversion)
-    torch.ops.my_helion_lib.copy_engine_all_reduce_w_progress(
+    torch.ops.vllm_helion.copy_engine_all_reduce_w_progress(
         allreduce_out, input_shared, progress, splits_per_rank
     )
 
     # Call the Helion kernel for Add + RMSNorm
-    norm_out, residual_out = torch.ops.my_helion_lib.allreduce_add_rmsnorm(
+    norm_out, residual_out = allreduce_add_rmsnorm(
         allreduce_out,
         residual,
         rms_gamma,
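The call site no longer goes through `torch.ops`: the `@register_kernel` decorator is expected to handle the custom-op plumbing and fake tracing itself, so callers invoke `allreduce_add_rmsnorm` directly. A hedged sketch of what a decorator along these lines could do; the real `vllm.compilation.helion.register` implementation may differ. Only the `_helion_kernel` attribute is grounded in this diff (see the `helion_kernel` property below):

def register_kernel(name: str, fake_impl=None):
    # Hypothetical shape of the decorator, for orientation only.
    def deco(kernel):
        def wrapper(*args, **kwargs):
            return kernel(*args, **kwargs)  # eager path: run the Helion kernel
        wrapper._helion_kernel = kernel  # raw kernel, kept for autotuning
        wrapper._fake_impl = fake_impl   # shape-only impl for fake tracing
        wrapper.__name__ = name
        return wrapper
    return deco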
@@ -662,9 +630,9 @@ def get_best_config(
             splits_match = key_splits == splits
 
             if distance < best_distance or (
-                distance == best_distance and splits_match and (
-                    best_match is None or not best_match[2]
-                )
+                distance == best_distance
+                and splits_match
+                and (best_match is None or not best_match[2])
             ):
                 best_match = (size, key, splits_match)
                 best_distance = distance
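This hunk only reflows the tie-breaking condition; the rule is unchanged: a candidate wins when it is strictly closer, or when it ties on distance, its split count matches, and the incumbent's does not. A self-contained illustration with invented values (simplified to a 2-tuple, so `best_match[1]` plays the role of `best_match[2]`):

best_match, best_distance = None, float("inf")
for size, splits_match in [(1024, False), (2048, True)]:  # invented candidates
    distance = abs(size - 1536)  # both are 512 away from the target
    if distance < best_distance or (
        distance == best_distance
        and splits_match
        and (best_match is None or not best_match[1])
    ):
        best_match, best_distance = (size, splits_match), distance
print(best_match)  # (2048, True): the splits-matching candidate wins the tie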
@@ -688,7 +656,7 @@ def get_best_config(
     def helion_kernel(self):
         """The Helion kernel function for autotuning."""
         if HELION_AVAILABLE:
-            return _allreduce_add_rmsnorm_pure_helion_kernel
+            return allreduce_add_rmsnorm._helion_kernel
         return None
 
 
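The property now unwraps the decorated function via `._helion_kernel`, so the autotuner still sees the raw `@helion.kernel` object (which, per the comment removed earlier in this diff, is the one carrying the autotune method) rather than the `register_kernel` wrapper. A hypothetical usage sketch:

# 'args' is a placeholder for real example inputs, not an API from this diff.
raw = allreduce_add_rmsnorm._helion_kernel
best = raw.autotune(args)  # autotune against representative shapes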