65 changes: 65 additions & 0 deletions .github/workflows/test.yml
@@ -134,3 +134,68 @@ jobs:
          # -rf: print failed tests
          # --timeout: max allowed time for each test
          pytest -rf --timeout=60
**Contributor:** You need to limit this to the test folder now.

**Contributor Author:** Do you mean I should move this to the test folder?

**Contributor:** `pytest -rf --timeout=60 test` (i.e., point the existing pytest invocation at the `test` directory rather than moving the workflow).


  test-notebooks:
    name: test-notebooks-cu128-py3.12-pytorch-2.9-a10g

    container:
      image: nvidia/cuda:12.8.1-devel-ubuntu24.04
      options: --gpus all

    runs-on: linux.g5.4xlarge.nvidia.gpu

    defaults:
      run:
        shell: bash -l {0}

    steps:
      - name: Run NVIDIA command
        run: |
          echo "Detected NVIDIA image"
          nvidia-smi || echo "nvidia-smi not found"

      - name: Check out code
        uses: actions/checkout@v5

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          python-version: "3.12"
          enable-cache: true

      - name: Create virtual environment
        run: |
          uv venv --python 3.12

      - name: Install pip in venv
        run: |
          source .venv/bin/activate
          uv pip install pip

      - name: Get current month
        id: date
        run: echo "month=$(date +'%Y-%m')" >> $GITHUB_OUTPUT

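      # The month suffix rotates this cache monthly; the workflow-file hash
      # invalidates it whenever this file changes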
      - name: Cache dependencies
        id: cache
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            ~/.venv
          key: notebooks-3.12-cu128-${{ hashFiles('.github/workflows/test.yml') }}-${{ steps.date.outputs.month }}

      - name: Install notebook execution tools
        run: |
          source .venv/bin/activate
          # Install jupyter for executing notebooks
          uv pip install jupyter nbconvert pytest numpy

      - name: Run Notebook Tests
        run: |
          source .venv/bin/activate
          # Execute the notebook in place using jupyter nbconvert
          # The notebook's own %pip cells install torch and helion
          jupyter nbconvert --to notebook --execute --inplace \
            --ExecutePreprocessor.timeout=600 \
            notebooks/softmax.ipynb
3 changes: 1 addition & 2 deletions README.md
@@ -11,7 +11,7 @@

# About

-📚 **[View Documentation](https://helionlang.com)** 📚 | 🎥 **[Watch Talk](https://youtu.be/MBOPzfl1JBo?si=DwAhgL-bpH1kFSt3)** 🎥
+📚 **[View Documentation](https://helionlang.com)** 📚 | 🎥 **[Watch Talk](https://youtu.be/MBOPzfl1JBo?si=DwAhgL-bpH1kFSt3)** 🎥 | 🚀 **[Try In Colab](https://colab.research.google.com/github/pytorch/helion/blob/main/notebooks/softmax.ipynb)** 🚀

**Helion** is a Python-embedded domain-specific language (DSL) for
authoring machine learning kernels, designed to compile down to [Triton],
@@ -66,7 +66,6 @@ portable between different hardware. Helion automates and autotunes over:
* Persistent kernel strategies.
* Warp specialization choices, unrolling, and more.


## Example

A minimal matrix multiplication kernel in Helion looks like this:
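The matmul example referenced here is collapsed in this diff view. As a rough, hedged sketch only (not the README's verbatim code; names and exact API usage are assumed from the softmax kernels added in this PR), such a kernel might look like:

```python
from __future__ import annotations

import torch
import helion
import helion.language as hl


# A sketch, not the README's actual example: tile the output, accumulate in fp32
@helion.kernel()
def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    m, k = x.size()
    k2, n = y.size()
    assert k == k2, "inner dimensions must match"
    out = torch.empty([m, n], dtype=x.dtype, device=x.device)
    for tile_m, tile_n in hl.tile([m, n]):  # Helion autotunes the tile sizes
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        for tile_k in hl.tile(k):  # reduce over the shared dimension in tiles
            acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
        out[tile_m, tile_n] = acc
    return out
```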
6 changes: 6 additions & 0 deletions docs/index.md
@@ -66,6 +66,12 @@ portable between different hardware. Helion automates and autotunes over:
* Persistent kernel strategies.
* Warp specialization choices, unrolling, and more.

## Try Helion Now

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/helion/blob/main/notebooks/softmax.ipynb)

Try our [interactive demo notebook](https://github.com/pytorch/helion/blob/main/notebooks/softmax.ipynb) to see Helion in action! The notebook demonstrates softmax kernel implementations and runs directly in Google Colab on a GPU.

## Example

A minimal matrix multiplication kernel in Helion looks like this:
174 changes: 174 additions & 0 deletions notebooks/softmax.ipynb
@@ -0,0 +1,174 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install \"torch==2.9.*\" --index-url https://download.pytorch.org/whl/cu126\n",
"%pip install helion\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Helion Softmax Kernel Examples\n",
"==============================\n",
"This example demonstrates multiple Helion kernel implementations of the softmax function,\n",
"including a simple wrapper around PyTorch's softmax, and a numerically optimized two-pass version.\n",
"The example also includes a check function to compare these kernels against PyTorch's\n",
"built-in softmax for correctness.\n",
"\"\"\"\n",
"\n",
"# %%\n",
"from __future__ import annotations\n",
"import torch\n",
"import helion\n",
"from helion._testing import run_example\n",
"import helion.language as hl\n",
"\n",
"\n",
"# %%\n",
"@helion.kernel(autotune_effort=\"quick\")\n",
"def softmax(x: torch.Tensor) -> torch.Tensor:\n",
" \"\"\"\n",
" Simple Helion kernel wrapping PyTorch's softmax function.\n",
" Args:\n",
" x (torch.Tensor): Input tensor of shape [n, m].\n",
" Returns:\n",
" torch.Tensor: Softmax output tensor of the same shape.\n",
" \"\"\"\n",
" n, _m = x.size()\n",
" out = torch.empty_like(x)\n",
" for tile_n in hl.tile(n):\n",
" out[tile_n, :] = torch.nn.functional.softmax(x[tile_n, :], dim=1)\n",
" return out\n",
"\n",
"\n",
"# %%\n",
"def check(m: int, n: int) -> None:\n",
" \"\"\"\n",
" Runs correctness checks comparing Helion softmax kernels against PyTorch's softmax.\n",
" Args:\n",
" m (int): Number of rows in input tensor.\n",
" n (int): Number of columns in input tensor.\n",
" \"\"\"\n",
" x = torch.randn([m, n], device=\"cuda\", dtype=torch.float16)\n",
" run_example(softmax, lambda x: torch.nn.functional.softmax(x, dim=1), (x,))\n",
"\n",
"\n",
"# %%\n",
"def main() -> None:\n",
" \"\"\"\n",
" Main function to run the softmax kernel correctness check with example input size.\n",
" \"\"\"\n",
" check(4096, 2560)\n",
"\n",
"\n",
"# %%\n",
"if __name__ == \"__main__\":\n",
" main()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Helion Softmax Kernel Examples\n",
"==============================\n",
"This example demonstrates multiple Helion kernel implementations of the softmax function,\n",
"including a simple wrapper around PyTorch's softmax, and a numerically optimized two-pass version.\n",
"The example also includes a check function to compare these kernels against PyTorch's\n",
"built-in softmax for correctness.\n",
"\"\"\"\n",
"\n",
"# %%\n",
"from __future__ import annotations\n",
"import torch\n",
"import helion\n",
"from helion._testing import run_example\n",
"import helion.language as hl\n",
"\n",
"\n",
"# %%\n",
"@helion.kernel(autotune_effort=\"quick\")\n",
"def softmax_two_pass(x: torch.Tensor) -> torch.Tensor:\n",
" \"\"\"\n",
" Numerically optimized Helion kernel performing softmax in two passes.\n",
" Args:\n",
" x (torch.Tensor): Input tensor of shape [m, n].\n",
" Returns:\n",
" torch.Tensor: Softmax output tensor of the same shape.\n",
" \"\"\"\n",
" m, n = x.size()\n",
" out = torch.empty_like(x)\n",
" block_size_m = hl.register_block_size(m)\n",
" block_size_n = hl.register_block_size(n)\n",
" for tile_m in hl.tile(m, block_size=block_size_m):\n",
" mi = hl.full([tile_m], float(\"-inf\"), dtype=torch.float32)\n",
" di = hl.zeros([tile_m], dtype=torch.float32)\n",
" for tile_n in hl.tile(n, block_size=block_size_n):\n",
" values = x[tile_m, tile_n]\n",
" local_amax = torch.amax(values, dim=1)\n",
" mi_next = torch.maximum(mi, local_amax)\n",
" di = di * torch.exp(mi - mi_next) + torch.exp(\n",
" values - mi_next[:, None]\n",
" ).sum(dim=1)\n",
" mi = mi_next\n",
" for tile_n in hl.tile(n, block_size=block_size_n):\n",
" values = x[tile_m, tile_n]\n",
" out[tile_m, tile_n] = torch.exp(values - mi[:, None]) / di[:, None]\n",
" return out\n",
"\n",
"\n",
"# %%\n",
"def check(m: int, n: int) -> None:\n",
" \"\"\"\n",
" Runs correctness checks comparing Helion softmax kernels against PyTorch's softmax.\n",
" Args:\n",
" m (int): Number of rows in input tensor.\n",
" n (int): Number of columns in input tensor.\n",
" \"\"\"\n",
" x = torch.randn([m, n], device=\"cuda\", dtype=torch.float16)\n",
" run_example(softmax_two_pass, lambda x: torch.nn.functional.softmax(x, dim=1), (x,))\n",
"\n",
"\n",
"# %%\n",
"def main() -> None:\n",
" \"\"\"\n",
" Main function to run the softmax kernel correctness check with example input size.\n",
" \"\"\"\n",
" check(4096, 2560)\n",
"\n",
"\n",
"# %%\n",
"if __name__ == \"__main__\":\n",
" main()\n"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
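For reference, a minimal sketch of exercising the notebook's kernels outside the `run_example` harness; it assumes a CUDA device, that the notebook cells above have been run, and that Helion kernels autotune on first call:

```python
import torch

# softmax and softmax_two_pass are the kernels defined in the cells above
x = torch.randn(4096, 2560, device="cuda", dtype=torch.float16)
ref = torch.nn.functional.softmax(x, dim=1)

for kernel in (softmax, softmax_two_pass):
    y = kernel(x)  # first call autotunes; later calls reuse the chosen config
    torch.testing.assert_close(y, ref, rtol=1e-2, atol=1e-2)
```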
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -43,7 +43,7 @@ src = ["helion"]
docstring-code-format = true
quote-style = "double"
line-ending = "lf"
-exclude = [".github/*"]
+exclude = [".github/*", "notebooks/**/*.ipynb"]

[tool.ruff.lint]
select = [
@@ -64,7 +64,7 @@ ignore = [
]
extend-safe-fixes = ["TC", "UP045", "RUF013", "RSE102"]
preview = true
-exclude = ["test/data/*", ".github/*"]
+exclude = ["test/data/*", ".github/*", "notebooks/**/*.ipynb"]

[tool.ruff.lint.per-file-ignores]
"test/*" = ["ANN"]