
Commit 99587ae

[helion] backward support for swiglu
1 parent 8338452 commit 99587ae

File tree

examples/swiglu.py
test/test_examples.expected
test/test_examples.py

3 files changed: +143, -1 lines changed

examples/swiglu.py

Lines changed: 61 additions & 1 deletion
@@ -31,13 +31,14 @@
 
 if TYPE_CHECKING:
     from collections.abc import Callable
+    from typing import Any
 
 
 # %%
 # SwiGLU Kernel
 # -------------
 @helion.kernel()
-def swiglu(a: Tensor, b: Tensor) -> Tensor:
+def swiglu_fwd(a: Tensor, b: Tensor) -> Tensor:
     """
     Performs SwiGLU operation: SiLU(a) * b where SiLU is the Swish activation.
 
@@ -86,6 +87,65 @@ def swiglu(a: Tensor, b: Tensor) -> Tensor:
     return out
 
 
+@helion.kernel()
+def swiglu_bwd(gout: Tensor, x1: Tensor, x2: Tensor) -> tuple[Tensor, Tensor]:
+    """
+    Implement the backward formula for swiglu.
+    """
+    dx1 = torch.empty_like(x1)
+    dx2 = torch.empty_like(x2)
+
+    gout_flat = gout.view(-1)
+    x1_flat = x1.view(-1)
+    x2_flat = x2.view(-1)
+    dx1_flat = dx1.view(-1)
+    dx2_flat = dx2.view(-1)
+
+    for tile in hl.tile(x1.numel()):
+        x1_vals = x1_flat[tile].to(torch.float32)
+        gout_vals = gout_flat[tile].to(torch.float32)
+
+        # compute dx2
+        dx2_vals = x1_vals * torch.sigmoid(x1_vals) * gout_vals
+        dx2_flat[tile] = dx2_vals.to(x2.dtype)
+
+        # compute dx1
+        x2_vals = x2_flat[tile].to(torch.float32)
+        x1_exp = torch.exp(x1_vals)
+        x1_exp_plus1 = x1_exp + 1
+        dextra = x1_exp / x1_exp_plus1 + x1_vals * x1_exp / x1_exp_plus1 / x1_exp_plus1
+        dx1_vals = gout_vals * x2_vals * dextra
+        dx1_flat[tile] = dx1_vals.to(x1.dtype)
+
+    return dx1, dx2
+
+
+class SwigluFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx: Any,  # noqa: ANN401
+        x1: Tensor,
+        x2: Tensor,
+    ) -> Tensor:
+        out = swiglu_fwd(x1, x2)
+        ctx.save_for_backward(x1, x2)
+        return out
+
+    @staticmethod
+    def backward(  # type: ignore[override]
+        ctx: Any,  # noqa: ANN401
+        grad_out: Tensor,
+    ) -> tuple[Tensor, Tensor]:
+        x1, x2 = ctx.saved_tensors
+        dx1, dx2 = swiglu_bwd(grad_out, x1, x2)
+        return dx1, dx2
+
+
+def swiglu(a: Tensor, b: Tensor) -> Tensor:
+    """swiglu with forward + backward support."""
+    return SwigluFunction.apply(a, b)  # type: ignore[no-any-return]
+
+
 # %%
 # SwiGLU MLP Module (matches liger_kernel structure)
 # --------------------------------------------------
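
A minimal usage sketch of the new autograd-enabled wrapper (not part of this commit). The import path, device, and tolerances are assumptions: Helion kernels need a GPU, the example module is assumed to be importable as examples.swiglu from a repo checkout, and the loose tolerances only account for bfloat16 rounding:

import torch
import torch.nn.functional as F

from examples.swiglu import swiglu  # hypothetical import path

x1 = torch.randn(1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
x2 = torch.randn(1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
grad_out = torch.randn(1024, device="cuda", dtype=torch.bfloat16)

# forward runs swiglu_fwd; backward dispatches to swiglu_bwd via SwigluFunction
out = swiglu(x1, x2)
out.backward(grad_out)

# eager reference for comparison
x1_ref = x1.detach().clone().requires_grad_(True)
x2_ref = x2.detach().clone().requires_grad_(True)
ref = F.silu(x1_ref) * x2_ref
ref.backward(grad_out)

torch.testing.assert_close(out, ref, rtol=1e-2, atol=1e-2)
torch.testing.assert_close(x1.grad, x1_ref.grad, rtol=1e-2, atol=1e-2)
torch.testing.assert_close(x2.grad, x2_ref.grad, rtol=1e-2, atol=1e-2)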

test/test_examples.expected

Lines changed: 54 additions & 0 deletions
@@ -3745,6 +3745,60 @@ def swiglu(a: Tensor, b: Tensor, *, _launcher=_default_launcher):
     _launcher(_helion_swiglu, (triton.cdiv(total_elements, _BLOCK_SIZE_0),), a_flat, b_flat, out_flat, a_flat.stride(0), b_flat.stride(0), out_flat.stride(0), total_elements, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
     return out
 
+--- assertExpectedJournal(TestExamples.test_swiglu_bwd)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_compat import libdevice
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_swiglu_bwd(x1_flat, gout_flat, dx2_flat, x2_flat, dx1_flat, x1_size_0, dx1_flat_stride_0, dx2_flat_stride_0, gout_flat_stride_0, x1_flat_stride_0, x2_flat_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x1_size_0
+    load = tl.load(x1_flat + indices_0 * x1_flat_stride_0, mask_0, other=0)
+    v_0 = tl.cast(load, tl.float32)
+    load_1 = tl.load(gout_flat + indices_0 * gout_flat_stride_0, mask_0, other=0)
+    v_1 = tl.cast(load_1, tl.float32)
+    v_2 = tl.sigmoid(tl.cast(v_0, tl.float32))
+    v_3 = v_0 * v_2
+    v_4 = v_3 * v_1
+    v_5 = tl.cast(v_4, tl.bfloat16)
+    tl.store(dx2_flat + indices_0 * dx2_flat_stride_0, v_5, mask_0)
+    load_2 = tl.load(x2_flat + indices_0 * x2_flat_stride_0, mask_0, other=0)
+    v_6 = tl.cast(load_2, tl.float32)
+    v_7 = libdevice.exp(v_0)
+    v_8 = 1.0
+    v_9 = v_7 + v_8
+    v_10 = v_7 / v_9
+    v_11 = v_0 * v_7
+    v_12 = v_11 / v_9
+    v_13 = v_12 / v_9
+    v_14 = v_10 + v_13
+    v_15 = v_1 * v_6
+    v_16 = v_15 * v_14
+    v_17 = tl.cast(v_16, tl.bfloat16)
+    tl.store(dx1_flat + indices_0 * dx1_flat_stride_0, v_17, mask_0)
+
+def swiglu_bwd(gout: Tensor, x1: Tensor, x2: Tensor, *, _launcher=_default_launcher):
+    """
+    Implement the backward formula for swiglu.
+    """
+    dx1 = torch.empty_like(x1)
+    dx2 = torch.empty_like(x2)
+    gout_flat = gout.view(-1)
+    x1_flat = x1.view(-1)
+    x2_flat = x2.view(-1)
+    dx1_flat = dx1.view(-1)
+    dx2_flat = dx2.view(-1)
+    _BLOCK_SIZE_0 = 1024
+    _launcher(_helion_swiglu_bwd, (triton.cdiv(x1.size(0), _BLOCK_SIZE_0),), x1_flat, gout_flat, dx2_flat, x2_flat, dx1_flat, x1.size(0), dx1_flat.stride(0), dx2_flat.stride(0), gout_flat.stride(0), x1_flat.stride(0), x2_flat.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return (dx1, dx2)
+
 --- assertExpectedJournal(TestExamples.test_template_via_closure0)
 from __future__ import annotations
 
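
In the generated kernel, v_10 + v_13 is the dextra term from swiglu_bwd lowered to Triton: exp(x) / (1 + exp(x)) + x * exp(x) / (1 + exp(x))^2, i.e. d SiLU(x) / dx, which also equals sigmoid(x) * (1 + x * (1 - sigmoid(x))). A small sanity sketch of that identity against autograd (plain PyTorch on CPU, not part of this commit; float64 chosen here only for tight tolerances):

import torch
import torch.nn.functional as F

x = torch.randn(256, dtype=torch.float64, requires_grad=True)
xd = x.detach()

# exp form, as written in swiglu_bwd and emitted as v_7 ... v_14
ex = torch.exp(xd)
dextra_exp = ex / (ex + 1) + xd * ex / (ex + 1) / (ex + 1)

# sigmoid form of the same derivative
sig = torch.sigmoid(xd)
dextra_sig = sig * (1 + xd * (1 - sig))

# autograd on SiLU(x) yields the same gradient
F.silu(x).sum().backward()

torch.testing.assert_close(dextra_exp, dextra_sig)
torch.testing.assert_close(dextra_exp, x.grad)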

test/test_examples.py

Lines changed: 28 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 from packaging import version
 import torch
+import torch.nn.functional as F
 
 import helion
 from helion._testing import DEVICE
@@ -329,6 +330,33 @@ def test_rms_norm_fwd(self):
             )
         )
 
+    def test_swiglu_bwd(self):
+        """Test backward pass for swiglu."""
+        x1, x2 = [
+            torch.randn(1024, device=DEVICE, dtype=torch.bfloat16, requires_grad=True)
+            for _ in range(2)
+        ]
+
+        out = F.silu(x1) * x2
+
+        grad_out = torch.randn_like(out)
+        out.backward(grad_out)
+
+        args = (
+            grad_out,
+            x1,
+            x2,
+        )
+
+        self.assertExpectedJournal(
+            check_example(
+                "swiglu",
+                args,
+                (x1.grad, x2.grad),
+                fn_name="swiglu_bwd",
+            )
+        )
+
     def test_rms_norm_bwd(self):
         """Test backward pass for rms norm weight gradient."""
         batch_size, dim = 32, 64
