microsoft · gramalingam · Jan 15, 2025 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/onnxscript/rewriter/onnxruntime/xformers/__init__.py b/onnxscript/rewriter/onnxruntime/xformers/__init__.py
@@ -7,9 +7,15 @@
     "fuse_normalization",
     "fuse_rotary_embedding",
     "fuse_cos_sin_cache",
+    "fuse_sdpa",
+    "fuse_mha",
+    "fuse_xformers",
 ]
 
 from onnxscript.rewriter.onnxruntime.xformers.cos_sin_cache import fuse_cos_sin_cache
+from onnxscript.rewriter.onnxruntime.xformers.fuse_xformers import fuse_xformers
+from onnxscript.rewriter.onnxruntime.xformers.mha import fuse_mha
 from onnxscript.rewriter.onnxruntime.xformers.rms_normalization import fuse_rms_normalization
 from onnxscript.rewriter.onnxruntime.xformers.rotary_embedding import fuse_rotary_embedding
-from onnxscript.rewriter.onnxruntime.xformers.skip_normalization import fuse_normalization
+from onnxscript.rewriter.onnxruntime.xformers.sdpa import fuse_sdpa
+from onnxscript.rewriter.onnxruntime.xformers.skip_normalization import fuse_normalization
diff --git a/...er/onnxruntime/xformers/_smollm_1layer.py → ...ewriter/onnxruntime/xformers/_smollm_1.py b/...er/onnxruntime/xformers/_smollm_1layer.py → ...ewriter/onnxruntime/xformers/_smollm_1.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 
 """
-A one-layer SmolLM model test case.
+A one-layer SmolLM model test case, with inputs: input_ids, attention_mask, and position_ids.
 This is an onnxscript version of the model.
 """
 
@@ -234,7 +234,7 @@ def make_model_with_random_weights():
     return model
 
 
-class _SmollmTestData:
+class TestData:
     def get_onnx_model(self):
         if not hasattr(self, "_onnx_model"):
             model_proto = make_model_with_random_weights()

diff --git a/onnxscript/rewriter/onnxruntime/xformers/_smollm_2.py b/onnxscript/rewriter/onnxruntime/xformers/_smollm_2.py
diff --git a/onnxscript/rewriter/onnxruntime/xformers/cos_sin_cache.py b/onnxscript/rewriter/onnxruntime/xformers/cos_sin_cache.py
@@ -41,6 +41,14 @@
         # pass to remove unused nodes.
         super().__init__(name, remove_nodes=False)
         self._max_pos_id = max_pos_id
+<<<<<<< HEAD
+        # map from inv_freq to (cos, sin) values for transformed graph
+        self._inv_freq_cos_sin_cache: dict[ir.Value, tuple[ir.Value, ir.Value]] = {}
+
+    def cleanup(self):
+        """Empty the inv_freq -> (cos, sin) cache kept on this rule.
+
+        The cache maps an ``inv_freq`` value to the pair of (cos, sin) values
+        created for the transformed graph, so that they can be reused.
+        NOTE(review): presumably invoked by the rewriter framework once a
+        rewriting pass is finished -- confirm against the base class.
+        """
+        self._inv_freq_cos_sin_cache.clear()
+=======
+>>>>>>> main
 
     def pattern(self, op, x, inv_freq, position_ids, interleaved, num_heads):
         position_ids_expanded = op.Unsqueeze(position_ids, 1)
@@ -61,7 +69,11 @@
             _domain="ai.onnxruntime.fusion",
         )
 
+<<<<<<< HEAD
+    def check(self, context, inv_freq, position_ids, **_):
+=======
     def check(self, context, inv_freq, position_ids, **_) -> bool:
+>>>>>>> main
         if not _ir_utils.has_rank(position_ids, 2):
             return False
         if not _ir_utils.has_rank(inv_freq, 3):
@@ -72,13 +84,27 @@
         return inv_freq_shape[0] == 1 and inv_freq_shape[2] == 1
 
     def rewrite(self, op, x, inv_freq, position_ids, interleaved, num_heads, **_):
+<<<<<<< HEAD
+        if inv_freq in self._inv_freq_cos_sin_cache:
+            cos_2d, sin_2d = self._inv_freq_cos_sin_cache[inv_freq]
+        else:
+            inv_freq_values = inv_freq.const_value.numpy().reshape(1, -1)
+            pos_id_range = np.arange(self._max_pos_id, dtype=np.float32).reshape(-1, 1)
+            angles = np.matmul(pos_id_range, inv_freq_values)
+            cos_value = np.cos(angles)
+            sin_value = np.sin(angles)
+            cos_2d = op.Constant(value=ir.tensor(cos_value))
+            sin_2d = op.Constant(value=ir.tensor(sin_value))
+            self._inv_freq_cos_sin_cache[inv_freq] = (cos_2d, sin_2d)
+=======
         inv_freq_values = inv_freq.const_value.numpy().reshape(1, -1)
         pos_id_range = np.arange(self._max_pos_id, dtype=np.float32).reshape(-1, 1)
         angles = np.matmul(pos_id_range, inv_freq_values)
         cos_value = np.cos(angles)
         sin_value = np.sin(angles)
         cos_2d = op.Constant(value=ir.tensor(cos_value))
         sin_2d = op.Constant(value=ir.tensor(sin_value))
+>>>>>>> main
         return op.RotaryEmbedding(
             x,
             position_ids,

diff --git a/onnxscript/rewriter/onnxruntime/xformers/cos_sin_cache_test.py b/onnxscript/rewriter/onnxruntime/xformers/cos_sin_cache_test.py
@@ -6,13 +6,21 @@
 
 import onnxscript.optimizer
 from onnxscript.rewriter.onnxruntime.xformers import fuse_cos_sin_cache, fuse_rotary_embedding
+<<<<<<< HEAD
+from onnxscript.rewriter.onnxruntime.xformers._smollm_1 import TestData
+=======
 from onnxscript.rewriter.onnxruntime.xformers._smollm_1layer import _SmollmTestData
+>>>>>>> main
 from onnxscript.rewriter.onnxruntime.xformers._test_utils import assert_allclose, ort_run
 
 
 class TestCosSinCacheTransform(unittest.TestCase):
     def test_smollm(self):
+<<<<<<< HEAD
+        smollm_test = TestData()
+=======
         smollm_test = _SmollmTestData()
+>>>>>>> main
         model = smollm_test.get_onnx_model()
         onnxscript.optimizer.optimize(model)
         inputs = smollm_test.get_ort_inputs()

diff --git a/onnxscript/rewriter/onnxruntime/xformers/fuse_xformers.py b/onnxscript/rewriter/onnxruntime/xformers/fuse_xformers.py
@@ -0,0 +1,19 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from __future__ import annotations
+
+from onnxscript.rewriter.onnxruntime.xformers.cos_sin_cache import fuse_cos_sin_cache
+from onnxscript.rewriter.onnxruntime.xformers.mha import fuse_mha
+from onnxscript.rewriter.onnxruntime.xformers.rms_normalization import fuse_rms_normalization
+from onnxscript.rewriter.onnxruntime.xformers.rotary_embedding import fuse_rotary_embedding
+from onnxscript.rewriter.onnxruntime.xformers.sdpa import fuse_sdpa
+from onnxscript.rewriter.onnxruntime.xformers.skip_normalization import fuse_normalization
+
+
+def fuse_xformers(model):
+    """Run all transformer fusion passes of this package on ``model``.
+
+    The passes are invoked one after another in a fixed order: RMS
+    normalization, normalization, rotary embedding, cos/sin cache, SDPA and
+    finally multi-head attention. Each pass receives the same ``model``
+    object; whatever a pass returns is discarded and this function returns
+    nothing.
+    """
+    # Order matters: later passes are applied to the result of earlier ones.
+    ordered_passes = (
+        fuse_rms_normalization,
+        fuse_normalization,
+        fuse_rotary_embedding,
+        fuse_cos_sin_cache,
+        fuse_sdpa,
+        fuse_mha,
+    )
+    for run_pass in ordered_passes:
+        run_pass(model)
diff --git a/onnxscript/rewriter/onnxruntime/xformers/mha.py b/onnxscript/rewriter/onnxruntime/xformers/mha.py
@@ -0,0 +1,179 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from __future__ import annotations
+
+from typing import Iterable
+
+import onnxscript.ir as ir
+from onnxscript.rewriter import pattern
+
+"""
+The MultiHeadAttention pattern:
+
+B: Batch size
+S: Sequence length
+D: input embedding dimension
+H: number of heads
+d_h: head size (usually, D = H * d_h)
+
+Thus, the Q, K, and V projection weights are each usually of shape (D, D).
+
+for each of Q, K, and V, we have the following pattern:
+   MatMul (Input, W), producing output of shape (B, S, D)
+   Reshape to produce a matrix of shape (B, S, H, d_h)
+   Transpose middle two axes to produce a matrix of shape (B, H, S, d_h)
+
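+This is followed by a RotaryEmbedding pattern for Q and K. The rotated key and the
+value are then concatenated with past_key and past_value, respectively, along axis -2.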
+
+The last two axes of the key-embedding are then swapped (using a Reshape/Transpose/Reshape sequence)
+
+The dot-product attention is then computed using SDPA
+
+Finally, the output is transposed and reshaped back to (B, S, D) shape
+"""
+
+
+def _project_transpose_head(op, input, weight, reshape_var: str):
+    """Build the projection sub-pattern shared by Q, K, and V.
+
+    The sub-pattern is ``MatMul(input, weight)`` followed by a ``Reshape`` and
+    a ``Transpose`` with ``perm=[0, 2, 1, 3]``. The Reshape is allowed to carry
+    further inputs and attributes, and its output is registered under the name
+    ``reshape_var``.
+
+    Returns:
+        The output of the final Transpose.
+    """
+    # Reshape from (B, S, D) to (B, S, H, D/H); the projection feeds it directly.
+    head_split = op.Reshape(
+        op.MatMul(input, weight),
+        _allow_other_inputs=True,
+        _allow_other_attributes=True,
+        _outputs=[reshape_var],
+    )
+    # Transpose from (B, S, H, D/H) to (B, H, S, D/H)
+    return op.Transpose(head_split, perm=[0, 2, 1, 3])
+
+
+def _multi_head_attention_pattern(
+    op,
+    input,
+    query_weight,
+    key_weight,
+    value_weight,
+    mask,
+    cos,
+    sin,
+    past_key,
+    past_value,
+    position_ids,
+):
+    """Describe the unfused multi-head attention subgraph to be matched.
+
+    Query, key and value are each projected from ``input`` with their own
+    weight, reshaped and transposed. Query and key additionally go through a
+    ``com.microsoft`` RotaryEmbedding that shares ``position_ids``, ``cos``
+    and ``sin``. Key and value are concatenated with ``past_key`` /
+    ``past_value`` along axis -2 before attention is computed by the
+    ``ai.onnxruntime.fusion`` SDPA op.
+
+    The Reshape outputs named ``query_mm_reshaped``, ``key_mm_reshaped``,
+    ``value_mm_reshaped``, ``key_reshaped``, ``key_transposed`` and
+    ``attention_reshaped`` are registered by name via ``_outputs``.
+
+    Returns:
+        A 3-tuple: the reshaped attention output, the key after rotary
+        embedding and concatenation with ``past_key``, and the value after
+        concatenation with ``past_value``.
+    """
+    query = _project_transpose_head(op, input, query_weight, "query_mm_reshaped")
+    query_rope = op.RotaryEmbedding(query, position_ids, cos, sin, _domain="com.microsoft")
+    key = _project_transpose_head(op, input, key_weight, "key_mm_reshaped")
+    key_rope = op.RotaryEmbedding(key, position_ids, cos, sin, _domain="com.microsoft")
+    # Prepend the past key along the second-to-last axis.
+    key_rope = op.Concat(past_key, key_rope, axis=-2)
+    # Transpose last two axes of key_rope to compute dot-product via matmul.
+    # The swap is matched as a Reshape -> Transpose(perm=[0, 2, 1]) -> Reshape sequence.
+    key_reshaped = op.Reshape(key_rope, _allow_other_inputs=True, _outputs=["key_reshaped"])
+    key_reshaped_transposed = op.Transpose(key_reshaped, perm=[0, 2, 1])
+    key_transposed = op.Reshape(
+        key_reshaped_transposed, _allow_other_inputs=True, _outputs=["key_transposed"]
+    )
+    value = _project_transpose_head(op, input, value_weight, "value_mm_reshaped")
+    # Prepend the past value along the second-to-last axis (no rotary embedding for V).
+    value = op.Concat(past_value, value, axis=-2)
+    attention = op.SDPA(
+        query_rope, key_transposed, value, mask, _domain="ai.onnxruntime.fusion"
+    )
+    # Transpose back to (B, S, H, D/H)
+    attention_transposed = op.Transpose(attention, perm=[0, 2, 1, 3])
+    # Reshape back to (B, S, D)
+    attention_reshaped = op.Reshape(
+        attention_transposed, _allow_other_inputs=True, _outputs=["attention_reshaped"]
+    )
+    return attention_reshaped, key_rope, value
+
+
+def _check_shape(bindings: dict[str, int], val: ir.Value, shape: Iterable[str]) -> bool:
+    """Check the shape of ``val`` against a list of dimension names.
+
+    Each entry of ``shape`` names one axis. A name seen for the first time is
+    bound in ``bindings`` to the actual dimension found on that axis; a name
+    that is already bound must equal the actual dimension.
+
+    Args:
+        bindings: Name -> dimension map shared across calls; updated in place.
+            Names bound before a mismatch is detected stay bound.
+        val: The value whose ``shape`` is inspected.
+        shape: Expected dimension names, one per axis. Any iterable is
+            accepted, including one-shot iterators.
+
+    Returns:
+        True if ``val`` has a known shape of the expected rank and every axis
+        is consistent with ``bindings``; False otherwise.
+    """
+    if val.shape is None:
+        return False
+    # Materialize once: `shape` is declared as an Iterable, so it may be a
+    # generator, which supports neither len() nor a second traversal.
+    expected_names = tuple(shape)
+    if val.shape.rank() != len(expected_names):
+        return False
+    for actual, expected in zip(val.shape, expected_names):
+        if expected not in bindings:
+            bindings[expected] = actual
+        elif actual != bindings[expected]:
+            return False
+    return True
+
+
+def _mha_validation(
+    op,
+    query_mm_reshaped,
+    key_mm_reshaped,
+    value_mm_reshaped,
+    key_reshaped,
+    key_transposed,
+    attention_reshaped,
+    **_,
+):
+    """Decide whether a matched subgraph has shapes consistent with MHA.
+
+    The named Reshape outputs of the pattern must have these shapes, with
+    equal names denoting equal dimensions:
+
+    * ``query_mm_reshaped``: (B, S, H, d_h)
+    * ``key_mm_reshaped`` and ``value_mm_reshaped``: (B, KVS, H, d_h)
+    * ``key_reshaped``: (B*H, TS, d_h)
+    * ``key_transposed``: (B, H, d_h, TS)
+    * ``attention_reshaped``: (B, S, H*d_h)
+
+    In addition, the dimensions bound to ``B*H`` and ``H*d_h`` must equal
+    the corresponding products.
+
+    Args:
+        op: First positional argument supplied by the rewriter; unused here.
+        query_mm_reshaped, key_mm_reshaped, value_mm_reshaped, key_reshaped,
+            key_transposed, attention_reshaped: Values bound by the pattern.
+        **_: Remaining pattern bindings, ignored.
+
+    Returns:
+        True if the match may be rewritten, False otherwise.
+    """
+    bindings: dict[str, int] = {}
+    expected_shapes = (
+        (query_mm_reshaped, ["B", "S", "H", "d_h"]),
+        (key_mm_reshaped, ["B", "KVS", "H", "d_h"]),
+        (value_mm_reshaped, ["B", "KVS", "H", "d_h"]),
+        (key_reshaped, ["B*H", "TS", "d_h"]),
+        (key_transposed, ["B", "H", "d_h", "TS"]),
+        (attention_reshaped, ["B", "S", "H*d_h"]),
+    )
+    # all() over a generator stops at the first failing check, like an `and` chain.
+    if not all(_check_shape(bindings, value, dims) for value, dims in expected_shapes):
+        return False
+    # The product checks below need concrete integers. A symbolic (non-int)
+    # dimension cannot be multiplied, so reject the match instead of raising.
+    product_operands = ("B", "H", "d_h", "B*H", "H*d_h")
+    if not all(isinstance(bindings[name], int) for name in product_operands):
+        return False
+    if bindings["B"] * bindings["H"] != bindings["B*H"]:
+        return False
+    return bindings["H"] * bindings["d_h"] == bindings["H*d_h"]
+
+
+def _multi_head_attention(
+    op,
+    input,
+    query_weight,
+    key_weight,
+    value_weight,
+    mask,
+    cos,
+    sin,
+    past_key,
+    past_value,
+    position_ids,
+    query_mm_reshaped,
+    **_,
+):
+    """Build the fused replacement for a matched multi-head attention subgraph.
+
+    The projections are kept as plain MatMuls (no Reshape/Transpose); query
+    and key still pass through the ``com.microsoft`` RotaryEmbedding. The
+    result is a single ``com.microsoft`` MultiHeadAttention node with three
+    outputs.
+
+    Args:
+        op: Builder used to create the replacement nodes.
+        input, query_weight, key_weight, value_weight, mask, cos, sin,
+            past_key, past_value, position_ids: Values bound by the pattern.
+        query_mm_reshaped: The matched query Reshape output; only its shape
+            is read, to obtain the number of heads.
+        **_: Remaining pattern bindings, ignored.
+
+    Returns:
+        The three outputs of the MultiHeadAttention node.
+        NOTE(review): presumably these line up with the pattern's
+        (attention, key, value) outputs -- confirm against the contrib-op spec.
+    """
+    # Axis 2 of the (B, S, H, d_h) query reshape is the head count.
+    num_heads = query_mm_reshaped.shape[2]
+    query = op.MatMul(input, query_weight)
+    query_rope = op.RotaryEmbedding(query, position_ids, cos, sin, _domain="com.microsoft")
+    key = op.MatMul(input, key_weight)
+    key_rope = op.RotaryEmbedding(key, position_ids, cos, sin, _domain="com.microsoft")
+    value = op.MatMul(input, value_weight)
+    # Repeat the mask num_heads times along axis 1.
+    # NOTE(review): assumes mask is 4-D with a size-1 head axis -- confirm.
+    tiling_factor = op.Constant(value_ints=[1, num_heads, 1, 1])
+    expanded_mask = op.Tile(mask, tiling_factor)
+    return op.MultiHeadAttention(
+        query_rope,
+        key_rope,
+        value,
+        None,  # bias
+        None,  # key padding mask
+        expanded_mask,  # attention mask/bias
+        past_key,
+        past_value,
+        num_heads=num_heads,
+        _domain="com.microsoft",
+        _outputs=3,
+    )
+
+
+# Rewrite rule built from the pattern function, the replacement function and
+# the validation function, in that order.
+_rule1 = pattern.RewriteRule(
+    _multi_head_attention_pattern, _multi_head_attention, _mha_validation
+)
+
+
+# Rule set applied by fuse_mha; currently holds the single rule above.
+mha_rules = pattern.RewriteRuleSet([_rule1])
+
+
+def fuse_mha(model: ir.Model) -> int:
+    """Apply the multi-head attention fusion rules to ``model``.
+
+    Args:
+        model: The model passed to ``mha_rules.apply_to_model``.
+
+    Returns:
+        The count reported by ``mha_rules.apply_to_model``.
+    """
+    # Function-scope import keeps this change independent of the file's import block.
+    import logging
+
+    count = mha_rules.apply_to_model(model)
+    # Report through logging rather than print(): library code should not
+    # write to the caller's stdout unconditionally.
+    logging.getLogger(__name__).info("MHA count: %s", count)
+    return count
diff --git a/onnxscript/rewriter/onnxruntime/xformers/mha_test.py b/onnxscript/rewriter/onnxruntime/xformers/mha_test.py
@@ -0,0 +1,40 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from __future__ import annotations
+
+import unittest
+
+import onnxscript.optimizer
+import onnxscript.rewriter.onnxruntime.xformers as xformers
+from onnxscript.rewriter.onnxruntime.xformers._smollm_2 import TestData
+from onnxscript.rewriter.onnxruntime.xformers._test_utils import assert_allclose, ort_run
+
+
+class TestMultiHeadAttention(unittest.TestCase):
+    """Checks the SDPA and MHA fusions on the SmolLM test model."""
+
+    def test_smollm(self):
+        """SDPA + MHA fusion must fire and leave the model outputs unchanged."""
+        test_data = TestData()
+        model = test_data.get_onnx_model()
+        onnxscript.optimizer.optimize(model)
+        # Fusions that run before SDPA/MHA; they are part of the baseline.
+        prerequisite_fusions = (
+            xformers.fuse_rms_normalization,
+            xformers.fuse_normalization,
+            xformers.fuse_rotary_embedding,
+            xformers.fuse_cos_sin_cache,
+        )
+        for apply_fusion in prerequisite_fusions:
+            apply_fusion(model)
+
+        # Baseline run: the model after the prerequisite fusions only.
+        ort_inputs = test_data.get_ort_inputs()
+        baseline_outputs = ort_run("original", model, ort_inputs)
+
+        # Each of the two fusions under test must report at least one rewrite.
+        self.assertGreater(xformers.fuse_sdpa(model), 0)
+        self.assertGreater(xformers.fuse_mha(model), 0)
+
+        # The fused model must reproduce the baseline outputs.
+        fused_outputs = ort_run("optimized", model, ort_inputs)
+        assert_allclose(fused_outputs, baseline_outputs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/onnxscript/rewriter/onnxruntime/xformers/rms_normalization_test.py b/onnxscript/rewriter/onnxruntime/xformers/rms_normalization_test.py
@@ -5,14 +5,14 @@
 import unittest
 
 import onnxscript.optimizer
-from onnxscript.rewriter.onnxruntime.xformers._smollm_1layer import _SmollmTestData
+from onnxscript.rewriter.onnxruntime.xformers._smollm_1 import TestData
 from onnxscript.rewriter.onnxruntime.xformers._test_utils import assert_allclose, ort_run
 from onnxscript.rewriter.onnxruntime.xformers.rms_normalization import fuse_rms_normalization
 
 
 class TestRmsNormalization(unittest.TestCase):
     def test_smollm(self):
-        smollm_test = _SmollmTestData()
+        smollm_test = TestData()
         model = smollm_test.get_onnx_model()
         onnxscript.optimizer.optimize(model)
         inputs = smollm_test.get_ort_inputs()

diff --git a/onnxscript/rewriter/onnxruntime/xformers/rotary_embedding_test.py b/onnxscript/rewriter/onnxruntime/xformers/rotary_embedding_test.py
@@ -5,13 +5,21 @@
 import unittest
 
 import onnxscript.optimizer
+<<<<<<< HEAD
+from onnxscript.rewriter.onnxruntime.xformers._smollm_1 import TestData
+=======
 from onnxscript.rewriter.onnxruntime.xformers._smollm_1layer import _SmollmTestData
+>>>>>>> main
 from onnxscript.rewriter.onnxruntime.xformers.rotary_embedding import fuse_rotary_embedding
 
 
 class TestRotaryEmbedding(unittest.TestCase):
     def test_smollm(self):
+<<<<<<< HEAD
+        smollm_test = TestData()
+=======
         smollm_test = _SmollmTestData()
+>>>>>>> main
         model = smollm_test.get_onnx_model()
         onnxscript.optimizer.optimize(model)
         fuse_rotary_embedding(model)