     format_docstring,
     inherit_docstring,
     mult_docstring,
-    variadic_constraint_docstring,
 )
 from .parameter import MupType, Parameter, has_parameter_data

@@ -428,47 +427,40 @@ def forward(self, input: Tensor, target: Tensor) -> Tensor:
         )
 
 
-@format_docstring(binary_constraint_docstring)
 class MLP(nn.Module):
     """A **unit-scaled** implementation of an MLP layer using SwiGLU.
 
     Args:
         hidden_size (int): the hidden dimension size of the input.
         expansion_factor (int): the factor by which the MLP's intermediate size
             increases relative to `hidden_size`.
-        {0}
     """
 
-    def __init__(
-        self,
-        hidden_size: int,
-        expansion_factor: int = 4,
-        constraint: Optional[str] = "to_output_scale",
-    ) -> None:
+    def __init__(self, hidden_size: int, expansion_factor: int = 4) -> None:
         super().__init__()
         intermediate_size = hidden_size * expansion_factor
-        self.linear_1 = Linear(hidden_size, intermediate_size, constraint=constraint)
-        self.linear_gate = Linear(hidden_size, intermediate_size, constraint=constraint)
-        self.linear_2 = Linear(intermediate_size, hidden_size, constraint=constraint)
+        # Note: constraint=None is safe here, because we know that the forward and
+        # backward constraints are mirrored between {linear_1, linear_gate} and
+        # linear_2.
+        self.linear_1 = Linear(hidden_size, intermediate_size, constraint=None)
+        self.linear_gate = Linear(hidden_size, intermediate_size, constraint=None)
+        self.linear_2 = Linear(intermediate_size, hidden_size, constraint=None)
 
     def forward(self, input: Tensor) -> Tensor:
         z = U.silu_glu(self.linear_1(input), self.linear_gate(input))
         return self.linear_2(z)  # type:ignore[no-any-return]
 
 
-@format_docstring(mult_docstring(), variadic_constraint_docstring)
+@format_docstring(mult_docstring())
 class MHSA(nn.Module):
     """A **unit-scaled** implementation of a multi-head self-attention layer.
 
-    Warning: using `constraint=None` here will likely give incorrect gradients.
-
     Args:
         hidden_size (int): the hidden dimension size of the input.
         heads (int): the number of attention heads.
         is_causal (bool): causal masking (for non-padded sequences).
         dropout_p (float, optional): the probability of the post-softmax dropout.
         {0}
-        {1}
     """
 
     def __init__(
@@ -478,16 +470,14 @@ def __init__(
         is_causal: bool,
         dropout_p: float = 0.0,
         mult: float = 1.0,
-        constraint: Optional[str] = "to_output_scale",
     ) -> None:
         super().__init__()
         self.heads = heads
         self.dropout_p = dropout_p
         self.is_causal = is_causal
         self.mult = mult
-        self.linear_qkv = Linear(hidden_size, 3 * hidden_size, constraint=constraint)
-        self.linear_o = Linear(hidden_size, hidden_size, constraint=constraint)
-        self.constraint = constraint
+        self.linear_qkv = Linear(hidden_size, 3 * hidden_size)
+        self.linear_o = Linear(hidden_size, hidden_size)
 
     def forward(self, input: Tensor) -> Tensor:
         q_k_v = self.linear_qkv(input)
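A quick way to sanity-check the comment above about `constraint=None` being safe in MLP: feed the updated layer a unit-variance input and a unit-variance incoming gradient, and both the output and the input gradient should come out with standard deviation close to 1. This is only a sketch, and it assumes MLP is importable from the unit_scaling package top level.

import torch
from unit_scaling import MLP  # assumed import path for the layer defined above

torch.manual_seed(0)
mlp = MLP(hidden_size=256)
x = torch.randn(64, 256, requires_grad=True)  # ~unit-variance activations
y = mlp(x)
y.backward(torch.randn_like(y))  # ~unit-variance incoming gradient
# Both values should be close to 1.0 if the mirrored constraints behave as described.
print(f"output std: {y.std().item():.2f}, input-grad std: {x.grad.std().item():.2f}")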
@@ -499,7 +489,6 @@ def forward(self, input: Tensor) -> Tensor:
         return self.linear_o(qkv)  # type: ignore
 
 
-@format_docstring(variadic_constraint_docstring)
 class TransformerLayer(nn.Module):
     """A **unit-scaled** implementation of a PreNorm
     (see https://arxiv.org/abs/2002.04745) transformer layer.
@@ -516,7 +505,6 @@ class TransformerLayer(nn.Module):
         is_causal (bool): causal masking (for non-padded sequences).
         dropout_p (float, optional): the probability of residual and post-softmax
             dropout.
-        {0}
     """
 
     def __init__(
@@ -527,22 +515,15 @@ def __init__(
         mlp_tau: float,
         is_causal: bool,
         dropout_p: float = 0.0,
-        constraint: Optional[str] = "to_output_scale",
     ) -> None:
         super().__init__()
         self.dropout_p = dropout_p
         self.mhsa_tau = mhsa_tau
         self.mlp_tau = mlp_tau
         self.mhsa_norm = RMSNorm(hidden_size)
-        self.mhsa = MHSA(
-            hidden_size,
-            heads,
-            is_causal=is_causal,
-            dropout_p=dropout_p,
-            constraint=constraint,
-        )
+        self.mhsa = MHSA(hidden_size, heads, is_causal=is_causal, dropout_p=dropout_p)
         self.mlp_norm = RMSNorm(hidden_size)
-        self.mlp = MLP(hidden_size, constraint=constraint)
+        self.mlp = MLP(hidden_size)
 
     def forward(self, input: Tensor) -> Tensor:
         input, skip = U.residual_split(input, tau=self.mhsa_tau)
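For reference, the forward pass that this hunk truncates follows the unit-scaled PreNorm residual pattern. The sketch below reconstructs that pattern for the MHSA branch only; it assumes that U.residual_add is the counterpart of the U.residual_split call shown above (only the split appears in this diff), and that MHSA and RMSNorm are importable from the package top level.

import torch
import unit_scaling.functional as U
from unit_scaling import MHSA, RMSNorm  # assumed import paths

norm = RMSNorm(256)
mhsa = MHSA(256, heads=8, is_causal=True)
x = torch.randn(8, 128, 256)  # (batch, seq, hidden), ~unit variance

residual, skip = U.residual_split(x, tau=0.5)  # as in the forward above
residual = mhsa(norm(residual))                # PreNorm: normalise, then attend
x = U.residual_add(residual, skip, tau=0.5)    # assumed counterpart to residual_split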
@@ -627,16 +608,13 @@ def __init__(
         )
 
 
-@format_docstring(variadic_constraint_docstring)
 class TransformerDecoder(nn.Sequential):  # pragma: no cover
     """A **unit-scaled** implementation of a decoder-type transformer.
 
     Note: this class is currently just for demonstrating scaling and lacks key
     functionality (for example masking, positional embeddings, usage for
     inference).
 
-    Warning: using `constraint=None` here will likely give incorrect gradients.
-
     Args:
         hidden_size (int): the hidden dimension size of the input.
         vocab_size (int): the number of tokens in the vocabulary.
@@ -648,7 +626,6 @@ class TransformerDecoder(nn.Sequential):  # pragma: no cover
             controlling residual weights in the transformer trunk; see
             :func:`unit_scaling.core.functional.transformer_residual_scaling_rule`
             (default).
-        {0}
     """
 
     def __init__(
@@ -659,7 +636,6 @@ def __init__(
         heads: int,
         dropout_p: float = 0.0,
         residual_scaling: ResidualScalingFn = transformer_residual_scaling_rule(),
-        constraint: Optional[str] = "to_output_scale",
     ) -> None:
         super().__init__()
         self.embedding = Embedding(vocab_size, hidden_size)
@@ -670,7 +646,6 @@ def __init__(
             is_causal=True,
             dropout_p=dropout_p,
             residual_scaling=residual_scaling,
-            constraint=constraint,
         )
         self.final_norm = RMSNorm(hidden_size)
         self.projection = LinearReadout(hidden_size, vocab_size)
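Finally, a rough end-to-end sketch of how the simplified decoder might be driven. The keyword names hidden_size, vocab_size, heads and dropout_p come from the docstring and code above; `layers` is a guess at the remaining constructor argument (not visible in this diff), and the call assumes the plain nn.Sequential forward (embedding, transformer trunk, final norm, readout) so that token ids map straight to logits.

import torch
from unit_scaling import TransformerDecoder  # assumed import path

model = TransformerDecoder(
    hidden_size=256, vocab_size=1024, layers=2, heads=8, dropout_p=0.1  # `layers` is assumed
)
ids = torch.randint(0, 1024, (4, 128))  # (batch, seq) token ids
logits = model(ids)  # if the default nn.Sequential forward applies: (4, 128, 1024) logits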