Why would type annotation affect the performance of matmul kernel? #8457

Fr4nk1inCs · 2025-10-16T13:40:39Z

Fr4nk1inCs
Oct 16, 2025

Following the official tutorial on Matrix Multiplication, I've written a similar matmul operator:

import triton
import triton.language as tl
import torch

AUTOTUNE_CONFIG = [ ... ]

@triton.jit
def _inner_kernel(
    # Base pointers of the two inputs and of the output.
    a_ptr, b_ptr, c_ptr,
    # Problem sizes: A is indexed as (M, K), B as (K, N), C as (M, N).
    M, N, K,
    # Element strides for each of the two indices of A, B and C.
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    # Compile-time tile sizes and swizzle group size.
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE: tl.constexpr,
):
    """Compute one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of C = A @ B.

    Each program instance of the 1-D launch grid accumulates its tile in
    float32 over K in steps of BLOCK_SIZE_K, casts the result to the element
    type of ``c_ptr`` and stores it. Loads and the final store are masked, so
    M, N and K need not be multiples of the block sizes.
    """
    pid = tl.program_id(0)

    # Number of tiles along M and along N.
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    # Split the flat program id into a (row, column) tile index, then let
    # tl.swizzle2d reorder it in groups of GROUP_SIZE.
    # NOTE(review): presumably for better cache reuse, as with the grouped
    # ordering in the official tutorial - confirm.
    pid_m = pid // num_pid_n
    pid_n = pid % num_pid_n
    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE)

    # Compiler hints: tile indices are non-negative, strides are positive.
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)

    # Row / column offsets covered by this tile, and their bounds masks.
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    mask_m = offs_m < M
    mask_n = offs_n < N

    # Pointers to the first (BLOCK_SIZE_M, BLOCK_SIZE_K) block of A and the
    # first (BLOCK_SIZE_K, BLOCK_SIZE_N) block of B.
    a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in tl.range(0, K, BLOCK_SIZE_K):
        # Only the first (K - k) entries along K are in bounds; this masks
        # the tail when K is not a multiple of BLOCK_SIZE_K.
        mask_k = offs_k < (K - k)
        mask_a = mask_m[:, None] & mask_k[None, :]
        mask_b = mask_k[:, None] & mask_n[None, :]

        # Masked-out elements load as 0.0 and so add nothing to the product.
        a = tl.load(a_ptrs, mask=mask_a, other=0.0)
        b = tl.load(b_ptrs, mask=mask_b, other=0.0)
        accumulator = tl.dot(a, b, accumulator, input_precision="ieee")

        # Advance both block pointers by one step along K.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Cast to the output element type and write back the in-bounds part.
    c = accumulator.to(c_ptr.dtype.element_ty)
    c_ptrs = c_ptr + (offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn)
    mask_c = mask_m[:, None] & mask_n[None, :]
    tl.store(c_ptrs, c, mask=mask_c)


@triton.autotune(
    configs=AUTOTUNE_CONFIG,
    key=["M", "N", "K"],
)
@triton.jit
def matmul_kernel_without_annotation(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE: tl.constexpr,
):
    """Autotuned entry point that forwards every argument to ``_inner_kernel``.

    This is the "unannotated" variant of the comparison: only the constexpr
    meta-parameters carry an annotation, the runtime arguments carry none.
    Configurations come from ``AUTOTUNE_CONFIG`` and are keyed on M, N and K.
    """
    _inner_kernel(
        a_ptr, b_ptr, c_ptr,
        M, N, K,
        stride_am, stride_ak,
        stride_bk, stride_bn,
        stride_cm, stride_cn,
        BLOCK_SIZE_M=BLOCK_SIZE_M,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        BLOCK_SIZE_K=BLOCK_SIZE_K,
        GROUP_SIZE=GROUP_SIZE,
    )


@triton.autotune(
    configs=AUTOTUNE_CONFIG,
    key=["M", "N", "K"],
)
@triton.jit
def matmul_kernel_with_annotation(
    a_ptr: tl.pointer_type,
    b_ptr: tl.pointer_type,
    c_ptr: tl.pointer_type,
    M: int,
    N: int,
    K: int,
    stride_am: int,
    stride_ak: int,
    stride_bk: int,
    stride_bn: int,
    stride_cm: int,
    stride_cn: int,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE: tl.constexpr,
):
    """Autotuned entry point that forwards every argument to ``_inner_kernel``.

    This is the "annotated" variant of the comparison: the body is the same
    single call as in the unannotated wrapper, but the pointer arguments are
    annotated with ``tl.pointer_type`` and the sizes and strides with ``int``.
    Configurations come from ``AUTOTUNE_CONFIG`` and are keyed on M, N and K.
    """
    _inner_kernel(
        a_ptr, b_ptr, c_ptr,
        M, N, K,
        stride_am, stride_ak,
        stride_bk, stride_bn,
        stride_cm, stride_cn,
        BLOCK_SIZE_M=BLOCK_SIZE_M,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        BLOCK_SIZE_K=BLOCK_SIZE_K,
        GROUP_SIZE=GROUP_SIZE,
    )


def matmul(a: torch.Tensor, b: torch.Tensor, with_annotation: bool = True):
    """Multiply two 2-D tensors with one of the autotuned Triton kernels.

    Args:
        a: Left operand, indexed as (M, K).
        b: Right operand, indexed as (K, N).
        with_annotation: Launch ``matmul_kernel_with_annotation`` when true,
            ``matmul_kernel_without_annotation`` otherwise.

    Returns:
        A newly allocated (M, N) tensor on ``a``'s device with ``a``'s dtype.

    Raises:
        ValueError: If either operand is not 2-D, or if the inner dimensions
            of ``a`` and ``b`` differ.
    """
    # Validate with real exceptions rather than ``assert``: asserts are
    # removed under ``python -O``, and the kernel masks its loads of ``b``
    # using the K taken from ``a``, so a mismatch must never reach the launch.
    if a.ndim != 2 or b.ndim != 2:
        raise ValueError(
            f"matmul expects 2-D operands, got a.ndim={a.ndim}, b.ndim={b.ndim}"
        )
    if a.shape[1] != b.shape[0]:
        raise ValueError(
            f"incompatible inner dimensions: a is {tuple(a.shape)}, "
            f"b is {tuple(b.shape)}"
        )

    M, K, N = a.shape[0], a.shape[1], b.shape[1]
    c = torch.empty((M, N), device=a.device, dtype=a.dtype)

    kernel = (
        matmul_kernel_with_annotation
        if with_annotation
        else matmul_kernel_without_annotation
    )

    # 1-D launch grid: one program per output tile. The tile sizes are only
    # known once the autotuner has picked a config, hence the callable.
    grid = lambda META: (
        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
    )
    kernel[grid](
        a_ptr=a,
        b_ptr=b,
        c_ptr=c,
        M=M,
        N=N,
        K=K,
        stride_am=a.stride(0),
        stride_ak=a.stride(1),
        stride_bk=b.stride(0),
        stride_bn=b.stride(1),
        stride_cm=c.stride(0),
        stride_cn=c.stride(1),
    )
    return c

AUTOTUNE_CONFIG

# Autotuning search space shared by both wrapper kernels. Every candidate uses
# GROUP_SIZE = 8; the table below lists the values that vary, one row per
# candidate, in the order the autotuner receives them.
AUTOTUNE_CONFIG = [
    triton.Config(
        {
            "BLOCK_SIZE_M": block_m,
            "BLOCK_SIZE_N": block_n,
            "BLOCK_SIZE_K": block_k,
            "GROUP_SIZE": 8,
        },
        num_stages=stages,
        num_warps=warps,
    )
    for block_m, block_n, block_k, stages, warps in (
        # (BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, num_stages, num_warps)
        (128, 256, 64, 3, 8),
        (64, 256, 32, 4, 4),
        (128, 128, 32, 4, 4),
        (128, 64, 32, 4, 4),
        (64, 128, 32, 4, 4),
        (128, 32, 32, 4, 4),
        (64, 32, 32, 5, 2),
        (32, 64, 32, 5, 2),
        # Good config for fp8 inputs.
        (128, 256, 128, 3, 8),
        (256, 128, 128, 3, 8),
        (256, 64, 128, 4, 4),
        (64, 256, 128, 4, 4),
        (128, 128, 128, 4, 4),
        (128, 64, 64, 4, 4),
        (64, 128, 64, 4, 4),
        (128, 32, 64, 4, 4),
    )
]

The _inner_kernel is the actual implementation of matmul, and matmul_kernel_with_annotation/matmul_kernel_without_annotation are wrappers of _inner_kernel. The only difference is that matmul_kernel_with_annotation has annotations for non-constexpr arguments.

However, when I benchmarked both versions of the matmul kernel with the following code, the annotated version turned out to perform much worse.

from typing import Literal

import triton
import triton.language as tl
import torch

# Quantiles requested from do_bench: the median first, then the 20th and 80th
# percentiles.
QUANTILES = [0.5, 0.2, 0.8]

# A single sweep over square problems (M = N = K = 256, 384, ..., 4096) with
# one plotted line per implementation.
BENCHMARK_CONFIGS = [
    triton.testing.Benchmark(
        plot_name="matmul-annotation-impact",
        ylabel="TFLOPS",
        x_names=["M", "N", "K"],
        x_vals=list(range(256, 4096 + 1, 128)),
        line_arg="impl",
        line_vals=["torch", "triton_annotated", "triton_unannotated"],
        line_names=["PyTorch", "Triton (annotated)", "Triton (unannotated)"],
        styles=[("blue", "-"), ("green", "-"), ("red", "-")],
        args={},
    )
]


@triton.testing.perf_report(BENCHMARK_CONFIGS)
def benchmark(
    M: int,
    N: int,
    K: int,
    impl: Literal["torch", "triton_annotated", "triton_unannotated"],
):
    """Measure the throughput of one matmul implementation.

    Creates random float16 operands of shape (M, K) and (K, N) on the CUDA
    device and times the selected implementation with
    ``triton.testing.do_bench``.

    Args:
        M: Rows of the left operand.
        N: Columns of the right operand.
        K: Shared inner dimension.
        impl: Which implementation to time: ``torch.matmul``, or ``matmul``
            with ``with_annotation`` set to True or False.

    Returns:
        A ``(median, lower, upper)`` tuple of throughputs in TFLOPS.

    Raises:
        ValueError: If ``impl`` is not one of the three supported names.
    """
    a = torch.randn((M, K), device="cuda", dtype=torch.float16)
    b = torch.randn((K, N), device="cuda", dtype=torch.float16)

    if impl == "torch":
        fn = lambda: torch.matmul(a, b)
    elif impl == "triton_annotated":
        fn = lambda: matmul(a, b, with_annotation=True)
    elif impl == "triton_unannotated":
        fn = lambda: matmul(a, b, with_annotation=False)
    else:
        # Without this branch an unknown name would leave `fn` unbound and
        # fail later with an unrelated UnboundLocalError.
        raise ValueError(f"unknown impl: {impl!r}")

    # Timings come back in the order of QUANTILES: median, fast and slow
    # quantile, in milliseconds.
    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=QUANTILES)

    def tflops(ms: float):
        # 2*M*N*K floating-point operations; dividing by milliseconds and
        # then by 1e9 yields 1e12 operations per second.
        return 2 * M * N * K / ms / 1e9

    # Throughput is inversely proportional to time, so the slow quantile
    # gives the lower bound and the fast quantile the upper bound.
    return tflops(ms), tflops(max_ms), tflops(min_ms)


if __name__ == "__main__":
    # Run the full sweep: print the result table, show the plot and save the
    # outputs under /tmp/matmul_benchmark.
    benchmark.run(
        save_path="/tmp/matmul_benchmark",
        show_plots=True,
        print_data=True,
    )

Benchmark result on A40:

matmul-annotation-impact:
         M       N       K     PyTorch  Triton (annotated)  Triton (unannotated)
0    256.0   256.0   256.0    4.681143            1.215036              4.369067
1    384.0   384.0   384.0   12.288000            2.835692             13.824000
2    512.0   512.0   512.0   26.214401            3.855059             23.831273
3    640.0   640.0   640.0   30.145354            4.830189             36.571428
4    768.0   768.0   768.0   55.296000            7.192975             55.296000
5    896.0   896.0   896.0   58.538665            6.787092             52.034370
6   1024.0  1024.0  1024.0   77.672296            9.118052             69.905068
7   1152.0  1152.0  1152.0   96.322066           11.484554             87.823057
8   1280.0  1280.0  1280.0   95.255814            8.274748             74.472727
9   1408.0  1408.0  1408.0   97.353142           11.103413             81.369790
10  1536.0  1536.0  1536.0  105.640118           13.155925             98.303997
11  1664.0  1664.0  1664.0   84.101981            9.467556             78.937823
12  1792.0  1792.0  1792.0   89.915388            9.824672             90.640517
13  1920.0  1920.0  1920.0   94.040817           10.187178            103.939854
14  2048.0  2048.0  2048.0   95.325090           12.409183             92.182504
15  2176.0  2176.0  2176.0  112.422618           13.478666            102.671677
16  2304.0  2304.0  2304.0  124.415996           15.562131            113.751772
17  2432.0  2432.0  2432.0  121.621057           12.177756            103.288469
18  2560.0  2560.0  2560.0  109.959732           11.904814            111.836181
19  2688.0  2688.0  2688.0  113.913080           14.434192            105.077714
20  2816.0  2816.0  2816.0  104.091191           15.935042            113.875215
21  2944.0  2944.0  2944.0  102.123018           13.575601            105.361591
22  3072.0  3072.0  3072.0  106.836041           14.296958            113.473151
23  3200.0  3200.0  3200.0  105.090315           14.988291            107.744111
24  3328.0  3328.0  3328.0  103.883547           13.601227            103.139390
25  3456.0  3456.0  3456.0   96.552774           14.769913            108.948067
26  3584.0  3584.0  3584.0  116.320040           15.881903            108.205503
27  3712.0  3712.0  3712.0  105.711478           14.558051            104.059736
28  3840.0  3840.0  3840.0  111.709094           14.790959            107.266726
29  3968.0  3968.0  3968.0  116.324062           16.215805            106.944733
30  4096.0  4096.0  4096.0  108.766395           15.224334            103.964154

Why would type annotations affect the resulting kernel performance?

Fr4nk1inCs · 2025-10-16T14:14:49Z

Fr4nk1inCs
Oct 16, 2025
Author

Config selected for each input shape:

Autotune config

annotated	input_shape	block_size_m	block_size_n	block_size_k	group_size	num_warps	num_ctas	num_stages
True	256	64	32	32	8	2	1	5
False	256	32	64	32	8	2	1	5
True	384	64	32	32	8	2	1	5
False	384	32	64	32	8	2	1	5
True	512	128	32	64	8	4	1	4
False	512	128	32	32	8	4	1	4
True	640	128	64	64	8	4	1	4
False	640	128	64	32	8	4	1	4
True	768	128	64	64	8	4	1	4
False	768	128	64	32	8	4	1	4
True	896	128	128	32	8	4	1	4
False	896	128	64	32	8	4	1	4
True	1024	128	128	32	8	4	1	4
False	1024	128	64	32	8	4	1	4
True	1152	128	128	32	8	4	1	4
False	1152	128	128	32	8	4	1	4
True	1280	128	128	32	8	4	1	4
False	1280	128	64	32	8	4	1	4
True	1408	128	256	64	8	8	1	3
False	1408	128	256	64	8	8	1	3
True	1536	128	256	64	8	8	1	3
False	1536	128	256	64	8	8	1	3
True	1664	128	128	32	8	4	1	4
False	1664	128	128	32	8	4	1	4
True	1792	128	64	32	8	4	1	4
False	1792	128	128	32	8	4	1	4
True	1920	128	256	64	8	8	1	3
False	1920	128	128	32	8	4	1	4
True	2048	128	256	64	8	8	1	3
False	2048	128	128	32	8	4	1	4
True	2176	128	256	64	8	8	1	3
False	2176	128	128	32	8	4	1	4
True	2304	128	256	64	8	8	1	3
False	2304	128	256	64	8	8	1	3
True	2432	128	128	32	8	4	1	4
False	2432	128	128	32	8	4	1	4
True	2560	128	128	32	8	4	1	4
False	2560	128	128	32	8	4	1	4
True	2688	128	256	64	8	8	1	3
False	2688	128	256	64	8	8	1	3
True	2816	128	256	64	8	8	1	3
False	2816	128	256	64	8	8	1	3
True	2944	128	128	32	8	4	1	4
False	2944	128	128	32	8	4	1	4
True	3072	128	256	64	8	8	1	3
False	3072	128	128	32	8	4	1	4
True	3200	128	256	64	8	8	1	3
False	3200	128	256	64	8	8	1	3
True	3328	128	128	32	8	4	1	4
False	3328	128	128	32	8	4	1	4
True	3456	128	256	64	8	8	1	3
False	3456	128	128	32	8	4	1	4
True	3584	128	256	64	8	8	1	3
False	3584	128	256	64	8	8	1	3
True	3712	128	256	64	8	8	1	3
False	3712	128	128	32	8	4	1	4
True	3840	128	128	32	8	4	1	4
False	3840	128	128	32	8	4	1	4
True	3968	128	256	64	8	8	1	3
False	3968	128	256	64	8	8	1	3
True	4096	128	256	64	8	8	1	3
False	4096	128	128	32	8	4	1	4

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Why would type annotation affect the performance of matmul kernel? #8457

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Why would type annotation affect the performance of matmul kernel? #8457

Uh oh!

Uh oh!

Fr4nk1inCs Oct 16, 2025

Replies: 1 comment

Uh oh!

Fr4nk1inCs Oct 16, 2025 Author

Fr4nk1inCs
Oct 16, 2025

Fr4nk1inCs
Oct 16, 2025
Author