
Commit 727bfa6

[Wave] Add support for direct global load to lds

Signed-off-by: nithinsubbiah <[email protected]>

Parent: 89d4060

File tree: 9 files changed, +140 −23 lines

iree/turbine/kernel/ops/wave_ops.py

Lines changed: 26 additions & 0 deletions

@@ -291,6 +291,16 @@ def select(cond: "Register", if_true: "Register", if_false: "Register") -> "Register":
     ...


+def gather_to_lds(
+    src: "Memory",
+    src_idx: dict[IndexSymbol, IndexSequence],
+    dst: "Memory",
+    dst_idx: dict[IndexSymbol, IndexSequence],
+    dtype: DataType,
+):
+    ...
+
+
 def define_op(op_name: str) -> Callable[[T], T]:
     def decorator(cls: T) -> T:
         cls.tkw_op_name = op_name

@@ -2332,3 +2342,19 @@ def indexing_dims(self) -> list[IndexExpr]:

     def infer_type(self):
         self.type = get_custom(_to_sequence(self.args)[0]).type
+
+
+@define_op("gather_to_lds")
+@dataclass
+class GatherToLDS(CustomOp):
+    """
+    Represents an instruction that performs a direct load from global
+    memory to LDS. The source points to the global memory to load from
+    and the destination points to shared memory.
+    """
+
+    src: Memory
+    src_idx: dict[IndexSymbol, IndexSequence]
+    dst: Memory
+    dst_idx: dict[IndexSymbol, IndexSequence]
+    dtype: DataType

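Note that the `...` body makes `gather_to_lds` a declaration only; the op itself is modeled by the `GatherToLDS` class, and in this commit nothing calls the stub from kernel code. Instead, the node is materialized by the gather_to_shared compiler pass introduced below. A minimal sketch of that construction, with all variable names illustrative:

    # Build the custom op and insert it into the traced fx graph, as the
    # gather_to_shared pass in this commit does.
    new_node = GatherToLDS(src, src_idx, dst_mem, dst_idx, dtype).add_to_graph(graph)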
iree/turbine/kernel/wave/codegen/handlers.py

Lines changed: 13 additions & 0 deletions

@@ -61,6 +61,7 @@
     exp2,
     extract,
     extract_slice,
+    gather_to_lds,
     ge,
     get_custom,
     get_result,

@@ -1612,3 +1613,15 @@ def handle_reshape(emitter: WaveEmitter, node: fx.Node):
         [1],
     )
     emitter.bind_node_proxy(node, IRProxyValue(slice))
+
+
+@handle_op(gather_to_lds)
+def handle_gather_to_lds(emitter: WaveEmitter, node: fx.Node):
+    try:
+        src, src_idx, dst, dst_idx, dtype = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+
+    return amdgpu_d.gather_to_lds(
+        transfer_type=dtype, src=src, src_indices=src_idx, dst=dst, dst_indices=dst_idx
+    )
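Two details worth flagging in this lowering. First, the positional unpacking of `node.args` relies on the `GatherToLDS` fields arriving in declaration order (`src, src_idx, dst, dst_idx, dtype`), so the two definitions must be kept in sync; a sketch of the invariant, names illustrative:

    # The handler's unpack must mirror the dataclass field order:
    #   class GatherToLDS: src, src_idx, dst, dst_idx, dtype
    src, src_idx, dst, dst_idx, dtype = node.args

Second, `amdgpu_d.gather_to_lds` is presumably the Python binding for the AMDGPU dialect op of the same name, which expresses the global-to-LDS transfer in the generated IR.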
iree/turbine/kernel/wave/gather_to_shared.py (new file)

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from .._support.tracing import CapturedTrace
+from ..lang.global_symbols import *
+from ..ops.wave_ops import GatherToLDS, Write, get_custom
+from ..wave.constraints import (
+    Constraint,
+)
+from ..wave.utils.run_utils import get_default_arch
+from .utils.general_utils import is_valid_global_read
+from .utils.graph_utils import DCE
+from .utils.symbol_utils import (
+    subs_idxc,
+)
+
+
+gather_to_shared_supported_arch = ["gfx950"]
+
+
+def get_write_node_info(read_custom):
+    write_node, write_memory, write_idx = [], [], []
+
+    for user in read_custom.users:
+        if (
+            isinstance(user, Write)
+            and subs_idxc(user.memory_type.address_space) == SHARED_ADDRESS_SPACE
+        ):
+            write_node.append(user)
+            write_memory.append(user.memory)
+            write_idx.append(user.get_derived_indices[0])
+
+    return write_node, write_memory, write_idx
+
+
+def gather_to_shared(trace: CapturedTrace, constraints: list[Constraint]):
+    """
+    This pass enables direct memory loads from global to LDS without
+    passing through registers, reducing data movement. The instruction
+    is supported only on specific architectures (gfx950).
+    """
+
+    if get_default_arch() not in gather_to_shared_supported_arch:
+        return
+
+    global_read_nodes = trace.walk(is_valid_global_read)
+    for read_node in global_read_nodes:
+        read_custom = get_custom(read_node)
+        src = read_custom.memory
+        src_idx = read_custom.get_derived_indices[0]
+        element_type = read_custom.type.dtype
+        write_node, write_memory, write_idx = get_write_node_info(read_custom)
+        if not write_node:
+            continue
+        for dst_node, dst_memory, dst_idx in zip(write_node, write_memory, write_idx):
+            with dst_node.graph.inserting_before(dst_node.fx_node):
+                dst_node.replace_all_uses_with(
+                    GatherToLDS(
+                        src, src_idx, dst_memory, dst_idx, element_type
+                    ).add_to_graph(dst_node.graph)
+                )
+
+    DCE(trace)

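Schematically, the pass rewrites each global-read/shared-write pair into a single direct transfer; a sketch with hypothetical node names:

    # Before (one operand promoted to shared memory):
    #   r = read(global_buf)       # global memory -> registers
    #   w = write(r, shared_buf)   # registers -> shared memory (LDS)
    #
    # After gather_to_shared:
    #   g = gather_to_lds(global_buf, src_idx, shared_buf, dst_idx, dtype)
    #
    # Uses of w are redirected to g via replace_all_uses_with, and the dead
    # read/write pair is cleaned up by the final DCE(trace).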
iree/turbine/kernel/wave/global_to_shared_gathers.py

Lines changed: 3 additions & 1 deletion

@@ -22,13 +22,15 @@
 from .utils.symbol_utils import subs_idxc
 from .utils.general_utils import is_gather
 from .minimize_global_loads import (
-    has_write_shared_user,
     construct_min_global_access_pattern,
     materialize_shape,
     identify_optimizable_loads,
     update_write_dependencies,
     SharedReadMetadata,
 )
+from .utils.general_utils import (
+    has_write_shared_user,
+)

 """
 We are given N global gathers that are promoted to shared memory. This function

iree/turbine/kernel/wave/minimize_global_loads.py

Lines changed: 4 additions & 18 deletions

@@ -11,7 +11,7 @@
     TilingConstraint,
 )
 from .._support.tracing import CapturedTrace
-from .._support.indexing import IndexingContext, IndexSequence, IndexSymbol, IndexExpr
+from .._support.indexing import IndexSequence, IndexSymbol, IndexExpr
 from ..ops.wave_ops import Read, Write, get_custom
 from ..lang.global_symbols import *
 from .utils.general_utils import (

@@ -20,6 +20,9 @@
     is_shared_read,
     get_fastest_index,
 )
+from .utils.general_utils import (
+    is_valid_global_read,
+)
 from .utils.graph_utils import (
     DCE,
 )

@@ -41,23 +44,6 @@ class SharedReadMetadata:
     memory_shape: tuple[int | IndexExpr]


-def has_write_shared_user(node: Read) -> bool:
-    return any(
-        isinstance(user, Write)
-        and subs_idxc(user.memory_type.address_space) == SHARED_ADDRESS_SPACE
-        for user in node.users
-    )
-
-
-def is_valid_global_read(node: fx.Node) -> bool:
-    custom = get_custom(node)
-    return (
-        isinstance(custom, Read)
-        and subs_idxc(custom.memory_type.address_space) == GLOBAL_ADDRESS_SPACE
-        and has_write_shared_user(custom)
-    )
-
-
 def is_transposed_read(custom: Read) -> bool:
     """
     Checks whether or not we are doing a transposed read.

iree/turbine/kernel/wave/promotion.py

Lines changed: 2 additions & 2 deletions

@@ -67,8 +67,8 @@ def apply_promotion_pattern(
     ```
     read_from_global lhs
     write_to_shared lhs
-    read_from_global lhs
-    write_to_shared lhs
+    read_from_global rhs
+    write_to_shared rhs
     shared_barrier
     read_from_shared lhs
     read_from_shared rhs

iree/turbine/kernel/wave/utils/general_utils.py

Lines changed: 22 additions & 1 deletion

@@ -9,12 +9,13 @@
 import os
 import sympy
 import torch
+import torch.fx as fx
 from typing import Any, Callable, Optional


 from ..._support.indexing import IndexExpr, IndexSequence, IndexSymbol
 from ...lang.global_symbols import *
-from ...ops.wave_ops import CustomOp, Read, Iterate, Write
+from ...ops.wave_ops import CustomOp, Read, Iterate, Write, get_custom
 from ..assumptions import Assumption
 from ..constraints import (
     Constraint,

@@ -375,6 +376,26 @@ def is_shared_read(node: CustomOp) -> bool:
     )


+def has_write_shared_user(node: Read) -> bool:
+    return any(
+        isinstance(user, Write)
+        and subs_idxc(user.memory_type.address_space) == SHARED_ADDRESS_SPACE
+        for user in node.users
+    )
+
+
+def is_valid_global_read(node: fx.Node) -> bool:
+    """
+    Check whether a node reads from global memory and has a user that
+    writes to shared memory.
+    """
+    custom = get_custom(node)
+    return (
+        isinstance(custom, Read)
+        and subs_idxc(custom.memory_type.address_space) == GLOBAL_ADDRESS_SPACE
+        and has_write_shared_user(custom)
+    )
+
+
 def is_gather(custom: CustomOp) -> bool:
     if not isinstance(custom, Read):
         return False

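Both helpers are simple predicates over fx nodes; a minimal usage sketch, mirroring the gather_to_shared pass in this commit:

    # Collect reads from global memory whose results feed a shared-memory write.
    global_read_nodes = trace.walk(is_valid_global_read)
    for node in global_read_nodes:
        custom = get_custom(node)  # a Read from GLOBAL_ADDRESS_SPACE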
iree/turbine/kernel/wave/utils/graph_utils.py

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@
 import iree.turbine.kernel.lang as tkl
 from ...ops.wave_ops import (
     get_custom,
+    Read,
     Write,
     NestedRegionOp,
     Output,

iree/turbine/kernel/wave/wave.py

Lines changed: 3 additions & 1 deletion

@@ -12,7 +12,7 @@
 from ..lang import Grid, IndexMapping
 from ..lang.global_symbols import *
 from ..ops import wave_ops
-from ..ops.wave_ops import Iterate, CustomOp, get_custom, IterArg
+from ..ops.wave_ops import Iterate, CustomOp, get_custom
 from .._support.indexing import IndexingContext, IndexExpr
 from .symbolic_constraints import SymbolicAlias
 from .._support.tracing import (

@@ -51,6 +51,7 @@
 from .decompose_scan_ops import decompose_scan_ops
 from .decompose_dot_mma import decompose_dot_mma
 from .expansion.expansion import expand_graph, add_get_results
+from .gather_to_shared import gather_to_shared
 from .global_to_shared_gathers import global_to_shared_gathers
 from .hoisting import hoist_loop_invariant_ops
 from .minimize_global_loads import minimize_global_loads

@@ -541,6 +542,7 @@ def _trace_and_get_kernel_signature(
             partial(hoist_loop_invariant_ops, trace, self.constraints),
             partial(global_to_shared_gathers, trace, self.constraints),
             partial(minimize_global_loads, trace, self.constraints),
+            partial(gather_to_shared, trace, self.constraints),
             partial(apply_shared_memory_indexing_corrections, trace, self.constraints),
         ]

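The new pass is registered directly after minimize_global_loads, presumably so it sees the already-minimized global read/write pairs, and before apply_shared_memory_indexing_corrections. Since it takes the same (trace, constraints) arguments as its neighbors, it can also be invoked standalone; a hedged sketch:

    # No-op on architectures other than gfx950 (see the arch check in the pass).
    gather_to_shared(trace, constraints)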