Test SDPA fusion via MHA (#2366)

gramalingam · Copilot · github-advanced-security[bot] · web-flow · commit c7d578631573 · 2025-06-05T10:01:52.000-07:00
Implements SDPA (introduced by our fusions) via MHA (in a subset of
cases), so that the fused model can be run and tested using ORT.

Not yet addressed: use of KV cache, 3D vs 4D Q/K/V formats. (Will
address them as I clean up the MHA fusion rules next.)

Also fix some copy-paste errors in the SDPA test cases, and make the
test-case naming scheme more uniform, which helps with pytest's `-k`
test-selection filter.

---------

Signed-off-by: Ganesan Ramalingam <grama@microsoft.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
diff --git a/onnxscript/rewriter/ort_fusions/sdpa_test.py b/onnxscript/rewriter/ort_fusions/sdpa_test.py
@@ -16,7 +16,9 @@
 from onnxscript import script
 from onnxscript.onnx_opset import opset18 as op
 from onnxscript.onnx_types import FLOAT
+from onnxscript.rewriter.ort_fusions._test_utils import assert_allclose, ort_run
 from onnxscript.rewriter.ort_fusions.sdpa import fuse_sdpa
+from onnxscript.rewriter.ort_fusions.sdpa_via_mha import replace_sdpa_by_mha
 
 B = 2  # batch size
 N = 4  # number of heads
@@ -190,7 +192,7 @@ def _masked_post_mul_sdpa_script(query, key, value, mask):
 
 
 @script()
-def _custom_scale_pre_div_sdpa_script(query, key, value, mask):
+def _masked_custom_scale_pre_div_sdpa_script(query, key, value, mask):
     key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
     divisor = op.Constant(value_float=SQRT_CUSTOM_DIV_SCALE_FACTOR)
     scaled_query = op.Div(query, divisor)
@@ -203,7 +205,7 @@ def _custom_scale_pre_div_sdpa_script(query, key, value, mask):
 
 
 @script()
-def _custom_scale_pre_mul_sdpa_script(query, key, value, mask):
+def _masked_custom_scale_pre_mul_sdpa_script(query, key, value, mask):
     key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
     multiplier = op.Constant(value_float=SQRT_CUSTOM_MUL_SCALE_FACTOR)
     scaled_query = op.Mul(query, multiplier)
@@ -216,7 +218,7 @@ def _custom_scale_pre_mul_sdpa_script(query, key, value, mask):
 
 
 @script()
-def _custom_scale_post_div_sdpa_script(query, key, value, mask):
+def _masked_custom_scale_post_div_sdpa_script(query, key, value, mask):
     key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
     divisor = op.Constant(value_float=CUSTOM_DIV_SCALE_FACTOR)
     attn_score = op.MatMul(query, key_transposed)
@@ -228,7 +230,7 @@ def _custom_scale_post_div_sdpa_script(query, key, value, mask):
 
 
 @script()
-def _custom_scale_post_mul_sdpa_script(query, key, value, mask):
+def _masked_custom_scale_post_mul_sdpa_script(query, key, value, mask):
     key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
     multiplier = op.Constant(value_float=CUSTOM_MUL_SCALE_FACTOR)
     attn_score = op.MatMul(query, key_transposed)
@@ -240,15 +242,19 @@ def _custom_scale_post_mul_sdpa_script(query, key, value, mask):
 
 
 class SDPATestCase:
-    def __init__(self, script_func):
+    def __init__(self, script_func, *, with_mask):
         self.script_func = script_func
+        self.with_mask = with_mask
 
     def get_onnx_model(self):
         if not hasattr(self, "_onnx_model"):
             qkv_type = FLOAT[B, N, S, H]
             mask_type = FLOAT[B, N, S, S]
+            input_types = [qkv_type, qkv_type, qkv_type]
+            if self.with_mask:
+                input_types.append(mask_type)
             model_proto = self.script_func.to_model_proto(
-                input_types=[qkv_type, qkv_type, qkv_type, mask_type], output_types=[qkv_type]
+                input_types=input_types, output_types=[qkv_type]
             )
             self._onnx_model = ir.serde.deserialize_model(model_proto)
         return self._onnx_model
@@ -259,8 +265,9 @@ def get_ort_inputs(self):
                 "query": numpy.random.rand(B, N, S, H).astype(numpy.float32),
                 "key": numpy.random.rand(B, N, S, H).astype(numpy.float32),
                 "value": numpy.random.rand(B, N, S, H).astype(numpy.float32),
-                "mask": numpy.random.rand(B, N, S, S).astype(numpy.float32),
             }
+            if self.with_mask:
+                inputs["mask"] = numpy.random.rand(B, N, S, S).astype(numpy.float32)
             self._ort_inputs = inputs
         return self._ort_inputs
 
@@ -296,35 +303,35 @@ def get_ort_inputs(self):
 class TestSDPAFusion(unittest.TestCase):
     @parameterized.parameterized.expand(
         [
-            ("unmasked_pre_div", _unmasked_pre_div_sdpa_script),
-            ("unmasked_pre_mul", _unmasked_pre_mul_sdpa_script),
-            ("unmasked_post_div", _unmasked_post_div_sdpa_script),
-            ("unmasked_post_mul", _unmasked_post_mul_sdpa_script),
-            ("pre_div", _masked_pre_div_sdpa_script),
-            ("pre_mul", _masked_pre_mul_sdpa_script),
-            ("post_div", _masked_post_div_sdpa_script),
-            ("post_mul", _masked_post_mul_sdpa_script),
+            ("pre_div", _unmasked_pre_div_sdpa_script),
+            ("pre_mul", _unmasked_pre_mul_sdpa_script),
+            ("post_div", _unmasked_post_div_sdpa_script),
+            ("post_mul", _unmasked_post_mul_sdpa_script),
+            ("masked_pre_div", _masked_pre_div_sdpa_script),
+            ("masked_pre_mul", _masked_pre_mul_sdpa_script),
+            ("masked_post_div", _masked_post_div_sdpa_script),
+            ("masked_post_mul", _masked_post_mul_sdpa_script),
             ("custom_scale_post_mul", _custom_scale_post_mul_sdpa_script),
             ("custom_scale_post_div", _custom_scale_post_div_sdpa_script),
             ("custom_scale_pre_mul", _custom_scale_pre_mul_sdpa_script),
             ("custom_scale_pre_div", _custom_scale_pre_div_sdpa_script),
-            ("custom_scale_post_mul_masked", _custom_scale_post_mul_sdpa_script),
-            ("custom_scale_post_div_masked", _custom_scale_post_div_sdpa_script),
-            ("custom_scale_pre_mul_masked", _custom_scale_pre_mul_sdpa_script),
-            ("custom_scale_pre_div_masked", _custom_scale_pre_div_sdpa_script),
+            ("masked_custom_scale_post_mul", _masked_custom_scale_post_mul_sdpa_script),
+            ("masked_custom_scale_post_div", _masked_custom_scale_post_div_sdpa_script),
+            ("masked_custom_scale_pre_mul", _masked_custom_scale_pre_mul_sdpa_script),
+            ("masked_custom_scale_pre_div", _masked_custom_scale_pre_div_sdpa_script),
             (
                 "_custom_multi_scale_pre_mul_sdpa_script",
                 _custom_multi_scale_pre_mul_sdpa_script,
             ),
         ]
     )
     def test_sdpa_fusion(self, name, script_func):
-        test_case = SDPATestCase(script_func)
+        test_case = SDPATestCase(script_func, with_mask="masked" in name)
         model = test_case.get_onnx_model()
         onnxscript.optimizer.optimize(model)
 
-        # inputs = test_case.get_ort_inputs()
-        # original_outputs = ort_run("original", model, inputs)
+        inputs = test_case.get_ort_inputs()
+        original_outputs = ort_run("original", model, inputs)
 
         count = fuse_sdpa(model, debug=True)
         self.assertGreater(count, 0)
@@ -347,8 +354,12 @@ def test_sdpa_fusion(self, name, script_func):
             # of scale_factor (is =default_scaling_factor)
             self.assertIsNone(sdpa_node.attributes.get("scale"))
 
-        # new_outputs = ort_run("optimized", model, inputs)
-        # assert_allclose(new_outputs, original_outputs)
+        replace_sdpa_by_mha(model, debug=True)
+
+        self.assertNotIn("SDPA", [n.op_type for n in model.graph])
+
+        new_outputs = ort_run("optimized", model, inputs)
+        assert_allclose(new_outputs, original_outputs)
 
     def test_invalid_sdpa_fusion_value_batch_dim(self):
         test_case = InvalidSDPATestCase(_masked_pre_mul_sdpa_script)
diff --git a/onnxscript/rewriter/ort_fusions/sdpa_via_mha.py b/onnxscript/rewriter/ort_fusions/sdpa_via_mha.py
@@ -0,0 +1,100 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from __future__ import annotations
+
+from typing import Union
+
+import onnxscript.ir as ir
+from onnxscript.rewriter import _fusion_utils, pattern
+
+Dim = Union[int, ir.SymbolicDim]
+
+
+class SDPAImplementation(pattern.RewriteRuleClassBase):
+    """Rewrite an ``ai.onnxruntime.fusion`` SDPA node into MultiHeadAttention.
+
+    The replacement uses the ``com.microsoft`` MultiHeadAttention op, wrapped in
+    Transpose/Reshape nodes that convert between the 4D SDPA inputs/outputs and
+    the 3D tensors passed to MultiHeadAttention.
+    """
+
+    def pattern(self, op, query, key_transposed, value):
+        """Match an SDPA node with at least query, key and value inputs."""
+        return op.SDPA(
+            query,
+            key_transposed,
+            value,
+            _allow_other_inputs=True,  # Mask is optional
+            _outputs=["sdpa_output"],
+            _domain="ai.onnxruntime.fusion",
+        )
+
+    def check(self, context, query, key_transposed, value, sdpa_output):
+        """Accept the match only when the number of heads is a static integer.
+
+        The input shapes are bound against the symbolic layouts listed below and
+        the dimension bound to ``"H"`` is stored as the number of heads.
+        """
+        bindings: dict[str, Dim] = {}
+        _fusion_utils.check_shape(bindings, query, ["B", "H", "S", "Dh"])
+        _fusion_utils.check_shape(bindings, key_transposed, ["B", "H", "Dh", "Skv"])
+        _fusion_utils.check_shape(bindings, value, ["B", "H", "Skv", "Dv"])
+
+        # NOTE(review): the results of check_shape are not inspected; presumably it
+        # reports a mismatch on its own -- confirm. Using get() keeps a missing "H"
+        # binding from raising KeyError: None fails the isinstance test below.
+        self._num_heads = bindings.get("H")
+        if not isinstance(self._num_heads, int):
+            return False
+        self._use_mask_broadcast = True  # TODO: optimize to avoid broadcast if not needed
+        return True
+
+    def rewrite(self, op, query, key_transposed, value, sdpa_output):
+        """Build the MultiHeadAttention-based replacement for the matched node.
+
+        Returns the 4D value that replaces ``sdpa_output``.
+        """
+        sdpa_node = sdpa_output.producer()
+        # Forward the SDPA node's ``scale`` attribute (None when it is absent).
+        scale = sdpa_node.attributes.get("scale", None)
+        # In a Reshape target, 0 keeps the input dimension and -1 is inferred.
+        to_3d_shape = op.Constant(value_ints=[0, 0, -1])
+        to_4d_shape = op.Constant(value_ints=[0, 0, self._num_heads, -1])
+        # With the layouts bound in check(): (B, H, S, D) -> (B, S, H, D) -> (B, S, H*D).
+        # The key arrives as (B, H, Dh, Skv), hence its different permutation.
+        query_3d = op.Reshape(op.Transpose(query, perm=[0, 2, 1, 3]), to_3d_shape)
+        key_3d = op.Reshape(op.Transpose(key_transposed, perm=[0, 3, 1, 2]), to_3d_shape)
+        value_3d = op.Reshape(op.Transpose(value, perm=[0, 2, 1, 3]), to_3d_shape)
+
+        inputs = [query_3d, key_3d, value_3d]
+        if len(sdpa_node.inputs) > 3:
+            mask = sdpa_node.inputs[3]
+
+            if self._use_mask_broadcast:
+                # Expand (which broadcasts) the mask against shape [1, 1, S, 1].
+                one = op.Constant(value_ints=[1])
+                query_length = op.Shape(query, start=2, end=3)
+                shape_11S1 = op.Concat(one, one, query_length, one, axis=0)
+                mask = op.Expand(mask, shape_11S1)
+
+            # Inputs 3 and 4 are left empty so that the mask lands at input 5.
+            # NOTE(review): presumably bias / key_padding_mask / attention_bias in
+            # the com.microsoft MultiHeadAttention schema -- confirm.
+            inputs.extend([None, None, mask])
+
+        output = op.MultiHeadAttention(
+            *inputs,
+            num_heads=self._num_heads,
+            scale=scale,
+            _domain="com.microsoft",
+        )
+        # Undo the input conversion: (B, S, H*D) -> (B, S, H, D) -> (B, H, S, D).
+        output_4d = op.Reshape(output, to_4d_shape)
+        output = op.Transpose(output_4d, perm=[0, 2, 1, 3])
+        return output
+
+
+_rules = pattern.RewriteRuleSet([SDPAImplementation.rule()])
+
+# Function that applies the SDPA-to-MHA rule set to a model.
+replace_sdpa_by_mha = _fusion_utils.apply_fusion_rules(_rules)