@@ -8612,3 +8612,217 @@ def forward(self, x):
86128612 self .run_compare_torch (x , OuterModel (),
86138613 input_as_shape = False , use_scripting = True ,
86148614 backend = backend , compute_unit = compute_unit )
8615+
class TestScaledDotProductAttention(TorchBaseTest):
    """
    Tests for torch.nn.functional.scaled_dot_product_attention op
    (https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
    """

    @pytest.mark.parametrize(
        "compute_unit, backend, rank",
        itertools.product(
            compute_units,
            backends,
            [2, 3, 4, 5],
        ),
    )
    def test_different_input_ranks_no_mask(self, compute_unit, backend, rank):
        """
        The query/key/value inputs can be any rank 2 or greater.
        """
        batch_size, seq_len, n_heads_1, n_heads_2, d = 2, 10, 3, 4, 7
        if rank == 2:
            input_shape = (seq_len, d)
        elif rank == 3:
            input_shape = (batch_size, seq_len, d)
        elif rank == 4:
            input_shape = (batch_size, n_heads_1, seq_len, d)
        elif rank == 5:
            # use two *distinct* head dims (3 and 4) so that a transposed-axes
            # bug in the conversion cannot slip through unnoticed
            # (original code repeated n_heads_1, leaving n_heads_2 unused)
            input_shape = (batch_size, n_heads_1, n_heads_2, seq_len, d)
        else:
            raise ValueError("invalid rank")

        model = ModuleWrapper(
            function=nn.functional.scaled_dot_product_attention,
            kwargs={
                "attn_mask": None,
                "dropout_p": 0.0,
                "is_causal": False,
            },
        )

        # query, key and value all share the same shape here
        self.run_compare_torch(
            [input_shape] * 3,
            model,
            backend=backend,
            compute_unit=compute_unit,
        )

    @pytest.mark.parametrize(
        "compute_unit, backend, seq_lengths, include_heads",
        itertools.product(
            compute_units,
            backends,
            [(5, 5), (5, 7), (6, 4)],
            [False, True],
        ),
    )
    def test_is_causal_flag(self, compute_unit, backend, seq_lengths, include_heads):
        """
        With is_causal=True (and no explicit mask), the converter must build the
        causal mask itself; verify conversion succeeds and the mask is folded to
        a constant in the resulting MIL program.
        """
        source_seq_len, target_seq_len = seq_lengths
        query_shape = (2, 2, target_seq_len, 7) if include_heads else (2, target_seq_len, 7)
        key_shape = (2, 2, source_seq_len, 7) if include_heads else (2, source_seq_len, 7)
        value_shape = key_shape

        model = ModuleWrapper(
            function=nn.functional.scaled_dot_product_attention,
            kwargs={
                "attn_mask": None,
                "is_causal": True,
            },
        )
        res = self.run_compare_torch(
            [query_shape, key_shape, value_shape],
            model,
            backend=backend,
            compute_unit=compute_unit,
        )
        # the causal mask is fully determined at conversion time, so the "fill"
        # and "band_part" ops used to compute it should have been constant folded
        # away and must not appear in the MIL program
        mil_prog = res[1]._get_mil_internal()
        assert len(mil_prog.find_ops(op_type="fill")) == 0
        assert len(mil_prog.find_ops(op_type="band_part")) == 0

    @pytest.mark.parametrize(
        "compute_unit, backend, seq_lengths, bool_mask",
        itertools.product(
            compute_units,
            backends,
            [(5, 5), (7, 5)],
            [False, True],
        ),
    )
    def test_attn_mask(self, compute_unit, backend, seq_lengths, bool_mask):
        """
        An explicit attn_mask input may be either a float mask (added to the
        attention scores) or a boolean keep/drop mask.
        """
        source_seq_len, target_seq_len = seq_lengths
        query_shape = (2, 3, target_seq_len, 7)
        key_shape = (2, 3, source_seq_len, 7)
        value_shape = key_shape
        mask_shape = (target_seq_len, source_seq_len)

        query = generate_input_data(query_shape)
        key = generate_input_data(key_shape)
        value = generate_input_data(value_shape)
        if bool_mask:
            # the comparison already yields a torch.bool tensor, so no
            # additional .bool() cast is needed
            mask = torch.rand(mask_shape) > 0.5
        else:
            mask = generate_input_data(mask_shape)

        model = ModuleWrapper(function=nn.functional.scaled_dot_product_attention)
        self.run_compare_torch(
            (query, key, value, mask),
            model,
            backend=backend,
            compute_unit=compute_unit,
            input_as_shape=False,
        )

    @pytest.mark.parametrize(
        "compute_unit, backend, mask_as_input",
        itertools.product(
            compute_units,
            backends,
            [True, False],
        ),
    )
    def test_toy_xformer_with_sdpa(self, compute_unit, backend, mask_as_input):
        """
        End-to-end check: a small transformer that uses SDPA inside multi-head
        attention converts correctly, both with an explicit mask input and with
        is_causal=True.
        """
        embedding_size = 32
        seq_length = 16
        n_heads = 4
        batch_size = 2
        num_blocks = 3

        class AttentionBlock(nn.Module):
            def __init__(self, embed_dim=embedding_size, n_head=n_heads):
                super().__init__()
                self.query_proj_op = nn.Linear(embed_dim, embed_dim)
                self.key_proj_op = nn.Linear(embed_dim, embed_dim)
                self.value_proj_op = nn.Linear(embed_dim, embed_dim)
                self.out_proj_op = nn.Linear(embed_dim, embed_dim)
                self.n_head = n_head

            def forward(self, x, mask=None):
                # in comments below for shapes, using following notation:
                # B: batch_size, S: seq_length, E: embedding_size, h: n_heads
                # x: (B,S,E)
                # mask: (S,S)
                batch_size, seq_len, dim = x.shape
                query_proj = self.query_proj_op(x)  # (B,S,E)
                key_proj = self.key_proj_op(x)  # (B,S,E)
                value_proj = self.value_proj_op(x)  # (B,S,E)
                # reshape to (B, h, S, E/h)
                query_proj = query_proj.reshape(
                    batch_size, seq_len, self.n_head, dim // self.n_head
                ).permute(
                    0, 2, 1, 3
                )  # (B, h, S, E/h)
                key_proj = key_proj.reshape(
                    batch_size, seq_len, self.n_head, dim // self.n_head
                ).permute(
                    0, 2, 1, 3
                )  # (B, h, S, E/h)
                value_proj = value_proj.reshape(
                    batch_size, seq_len, self.n_head, dim // self.n_head
                ).permute(
                    0, 2, 1, 3
                )  # (B, h, S, E/h)
                # now do scaled dot product attention
                if mask is None:
                    out = nn.functional.scaled_dot_product_attention(
                        query_proj, key_proj, value_proj, is_causal=True
                    )  # (B, h, S, E/h)
                else:
                    out = nn.functional.scaled_dot_product_attention(
                        query_proj, key_proj, value_proj, mask
                    )  # (B, h, S, E/h)
                # reshape back to (B, S, E)
                out = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, dim)  # (B, S, E)
                return self.out_proj_op(out)

        class MLPBlock(nn.Module):
            def __init__(self, embed_dim=embedding_size):
                super().__init__()
                self.fc1 = nn.Linear(embed_dim, embed_dim)
                self.activation = nn.GELU()
                self.fc2 = nn.Linear(embed_dim, embed_dim)

            def forward(self, x):
                x = self.fc1(x)
                x = self.activation(x)
                return self.fc2(x)

        class ToyTransformer(nn.Module):
            # NOTE: the same attention/MLP/layer-norm modules are reused for all
            # n_blocks iterations (weight sharing) — fine for a conversion test
            def __init__(self, n_blocks=num_blocks, embed_dim=embedding_size):
                super().__init__()
                self.attn_block = AttentionBlock(embed_dim=embed_dim)
                self.mlp = MLPBlock(embed_dim=embed_dim)
                self.n_blocks = n_blocks
                self.lnorm = nn.LayerNorm(embed_dim)

            def forward(self, x, mask=None):
                for i in range(self.n_blocks):
                    x = self.attn_block(x, mask) + x
                    x = self.lnorm(x)
                    x = self.mlp(x) + x
                    x = self.lnorm(x)
                return x

        model = ToyTransformer()
        self.run_compare_torch(
            [(batch_size, seq_length, embedding_size), (seq_length, seq_length)]
            if mask_as_input
            else [(batch_size, seq_length, embedding_size)],
            model,
            backend=backend,
            compute_unit=compute_unit,
        )
0 commit comments