
Commit 50ea961

Merge pull request #67 from graphcore-research/conv1d
Conv1d
2 parents: a85d806 + 013e6c9 · commit 50ea961

5 files changed (+227, -2 lines)

unit_scaling/_modules.py

Lines changed: 71 additions & 1 deletion
@@ -8,6 +8,7 @@

 import einops
 import torch
+import torch.nn.functional as F
 from torch import Tensor, nn

 from . import functional as U
@@ -137,7 +138,7 @@ def __init__(
         self.constraint = constraint
         self.weight = Parameter(self.weight.data, mup_type=weight_mup_type)
         if self.bias is not None:
-            self.bias = Parameter(self.bias, mup_type="bias")
+            self.bias = Parameter(self.bias.data, mup_type="bias")

     def reset_parameters(self) -> None:
         nn.init.normal_(self.weight)
@@ -181,6 +182,75 @@ def forward(self, input: Tensor) -> Tensor:
         return U.linear_readout(input, self.weight, self.bias, self.constraint)


+@inherit_docstring(
+    short_description=(
+        "Applies a **unit-scaled** 1D convolution to the incoming data."
+        "\nNote that this layer sets :code:`bias=False` by default."
+        "We also require padding to be supplied as an integer, not a string."
+    ),
+    add_args=[binary_constraint_docstring],
+)
+class Conv1d(nn.Conv1d):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+        padding_mode: str = "zeros",
+        device: Any = None,
+        dtype: Any = None,
+        constraint: Optional[str] = "to_output_scale",
+        weight_mup_type: MupType = "weight",
+    ) -> None:
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            device,
+            dtype,
+        )
+        assert isinstance(padding, int), "only `int` is supported for padding type"
+        self.kernel_size = kernel_size  # type:ignore[assignment]
+        self.stride = stride  # type:ignore[assignment]
+        self.padding = padding  # type:ignore[assignment]
+        self.dilation = dilation  # type:ignore[assignment]
+        self.constraint = constraint
+        self.weight = Parameter(self.weight.data, mup_type=weight_mup_type)
+        if self.bias is not None:
+            self.bias = Parameter(self.bias.data, mup_type="bias")
+
+    def reset_parameters(self) -> None:
+        nn.init.normal_(self.weight)
+        if self.bias is not None:
+            self.bias.data.zero_()
+
+    def forward(self, input: Tensor) -> Tensor:
+        if self.padding_mode != "zeros":
+            input = F.pad(
+                input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+        return U.conv1d(
+            input,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+
 @inherit_docstring(
     short_description=(
         "Applies a **unit-scaled** Layer Normalization over a mini-batch of inputs."

unit_scaling/functional.py

Lines changed: 43 additions & 0 deletions
@@ -262,6 +262,49 @@ def linear_readout(
     )


+@docstring_from(
+    F.conv1d,
+    short_description="Applies a **unit-scaled** 1D convolution.",
+    add_args=[
+        binary_constraint_docstring,
+        "scale_power ((float, float, float), optional): scaling power"
+        " for each of (output, grad(input), grad(weight|bias))",
+    ],
+)
+def conv1d(
+    input: Tensor,
+    weight: Tensor,
+    bias: Optional[Tensor] = None,
+    stride: int = 1,
+    padding: int = 0,
+    dilation: int = 1,
+    groups: int = 1,
+    constraint: Optional[str] = "to_output_scale",
+    scale_power: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+) -> Tensor:
+    fan_out, fan_in, kernel_size = weight.shape
+    seq_len = input.shape[-1]
+    out_size = (seq_len + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1
+    batch_size = out_size
+    if len(input.shape) > 2:
+        batch_size *= input.shape[:-2].numel()
+
+    output_scale = 1 / (fan_in * kernel_size) ** scale_power[0]
+    grad_input_scale = (stride * groups / (fan_out * kernel_size)) ** scale_power[1]
+    grad_weight_scale = grad_bias_scale = 1 / batch_size ** scale_power[2]
+
+    output_scale, grad_input_scale = apply_constraint(
+        constraint, output_scale, grad_input_scale
+    )
+
+    input = scale_bwd(input, grad_input_scale)
+    weight = scale_bwd(weight, grad_weight_scale)
+    bias = scale_bwd(bias, grad_bias_scale) if bias is not None else None
+    output = F.conv1d(input, weight, bias, stride, padding, dilation, groups)
+    assert out_size == output.shape[-1]
+    return scale_fwd(output, output_scale)
+
+
 @docstring_from(
     F.layer_norm,
     short_description=(
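As a sanity check on these scaling rules, the arithmetic below plugs in the shapes used by the new tests, under the default scale_power = (0.5, 0.5, 0.5). It is illustrative Python only, not part of the commit:

# Shapes from the new tests: stride = dilation = groups = 1, padding = 0.
batch_size, d_in, d_out = 64, 192, 320
kernel_size, seq_len = 11, 448

out_size = (seq_len + 2 * 0 - 1 * (kernel_size - 1) - 1) // 1 + 1  # 438
output_scale = 1 / (d_in * kernel_size) ** 0.5                     # ~0.0218, i.e. 1/sqrt(fan_in * kernel_size)
grad_input_scale = (1 * 1 / (d_out * kernel_size)) ** 0.5          # ~0.0169
grad_weight_scale = 1 / (batch_size * out_size) ** 0.5             # ~0.0060; the "batch" counts sequence positions too
print(out_size, output_scale, grad_input_scale, grad_weight_scale)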

unit_scaling/optim.py

Lines changed: 3 additions & 1 deletion
@@ -36,8 +36,10 @@ def _get_fan_in(param: ParameterData) -> int:
         return param.shape[0]
     if len(param.shape) == 2:
         return param.shape[1]
+    if len(param.shape) == 3:
+        return param.shape[1] * param.shape[2]
     raise ValueError(
-        f"Cannot get fan_in of `ndim >= 3` param, shape={tuple(param.shape)}"
+        f"Cannot get fan_in of `ndim >= 4` param, shape={tuple(param.shape)}"
     )
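The new three-dimensional branch matches the layout of a Conv1d weight, (out_channels, in_channels // groups, kernel_size). A small illustrative sketch using the test shapes, not library code:

import torch

# A Conv1d weight as used in the tests: (fan_out, fan_in, kernel_size) = (320, 192, 11).
weight = torch.empty(320, 192, 11)
fan_in = weight.shape[1] * weight.shape[2]  # what the new ndim == 3 branch returns
print(fan_in)  # 2112 = in_channels * kernel_size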

unit_scaling/tests/test_functional.py

Lines changed: 87 additions & 0 deletions
@@ -6,6 +6,7 @@

 from ..functional import (
     add,
+    conv1d,
     cross_entropy,
     dropout,
     embedding,
@@ -277,6 +278,92 @@ def test_linear_readout() -> None:
     assert_scale(output, target=2**-5)  # 1/sqrt(fan_in)


+# --- test conv1d() ---
+
+
+def test_conv1d() -> None:
+    batch_size = 2**6
+    d_in = 2**6 * 3
+    d_out = 2**6 * 5
+    kernel_size = 11
+    seq_len = 2**6 * 7
+    input = randn(batch_size, d_in, seq_len, requires_grad=True)
+    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
+    bias = zeros(d_out).requires_grad_()
+    output = conv1d(input, weight, bias, constraint=None)
+    unit_backward(output)
+
+    assert_unit_scaled(output, input.grad, weight.grad, bias.grad)
+
+
+def test_conv1d_stride() -> None:
+    batch_size = 2**6
+    d_in = 2**6 * 3
+    d_out = 2**6 * 5
+    kernel_size = 11
+    seq_len = 2**6 * 7
+    stride = 3
+
+    input = randn(batch_size, d_in, seq_len, requires_grad=True)
+    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
+    bias = zeros(d_out).requires_grad_()
+    output = conv1d(input, weight, bias, stride=stride, constraint=None)
+    unit_backward(output)
+
+    assert_unit_scaled(output, input.grad, weight.grad, bias.grad)
+
+
+def test_conv1d_padding() -> None:
+    batch_size = 2**6
+    d_in = 2**6 * 3
+    d_out = 2**6 * 5
+    kernel_size = 11
+    seq_len = 2**6 * 7
+    padding = 23  # If this is large enough wrt seq_len, test fails
+
+    input = randn(batch_size, d_in, seq_len, requires_grad=True)
+    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
+    bias = zeros(d_out).requires_grad_()
+    output = conv1d(input, weight, bias, padding=padding, constraint=None)
+    unit_backward(output)
+
+    assert_unit_scaled(output, input.grad, weight.grad, bias.grad)
+
+
+def test_conv1d_dilation() -> None:
+    batch_size = 2**6
+    d_in = 2**6 * 3
+    d_out = 2**6 * 5
+    kernel_size = 11
+    seq_len = 2**6 * 7
+    dilation = 8
+
+    input = randn(batch_size, d_in, seq_len, requires_grad=True)
+    weight = randn(d_out, d_in, kernel_size, requires_grad=True)
+    bias = zeros(d_out).requires_grad_()
+    output = conv1d(input, weight, bias, dilation=dilation, constraint=None)
+    unit_backward(output)
+
+    assert_unit_scaled(output, input.grad, weight.grad, bias.grad)
+
+
+def test_conv1d_groups() -> None:
+    batch_size = 2**6
+    d_in = 2**6 * 3
+    d_out = 2**6 * 5
+    kernel_size = 11
+    seq_len = 2**6 * 7
+    groups = 32
+
+    input = randn(batch_size, d_in, seq_len, requires_grad=True)
+    weight = randn(d_out, d_in // groups, kernel_size, requires_grad=True)
+    bias = zeros(d_out).requires_grad_()
+    output = conv1d(input, weight, bias, groups=groups, constraint=None)
+    unit_backward(output)
+
+    assert_unit_scaled(output, input.grad, weight.grad, bias.grad)
+
+
 # --- test layer_norm() ---

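The comment in test_conv1d_padding hints at why very large padding would break the unit-scale assertion: outputs whose receptive field overlaps the zero padding see reduced input variance. A rough, illustrative estimate with the test's numbers (stride = dilation = 1):

seq_len, kernel_size, padding = 448, 11, 23
out_size = seq_len + 2 * padding - (kernel_size - 1)  # 484
edge_outputs = 2 * padding                            # output positions whose window touches the zero padding
print(edge_outputs / out_size)                        # ~0.095: a small fraction, so the test still passes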

unit_scaling/tests/test_modules.py

Lines changed: 23 additions & 0 deletions
@@ -8,6 +8,7 @@
     GELU,
     MHSA,
     MLP,
+    Conv1d,
     CrossEntropyLoss,
     DepthModuleList,
     DepthSequential,
@@ -87,6 +88,28 @@ def test_linear() -> None:
     assert_non_zeros(model.bias)


+def test_conv1d() -> None:
+    batch_size = 2**6
+    d_in = 2**6 * 3
+    d_out = 2**6 * 5
+    kernel_size = 11
+    seq_len = 2**6 * 7
+    input = randn(batch_size, d_in, seq_len, requires_grad=True)
+    model = Conv1d(d_in, d_out, kernel_size, bias=True)
+    output = model(input)
+
+    assert_unit_scaled(model.weight)
+    assert_zeros(model.bias)
+
+    unit_backward(output)
+    SGD(model.parameters(), lr=1).step()
+
+    assert float(output.std()) == pytest.approx(1, abs=0.1)
+
+    assert_not_unit_scaled(model.weight)
+    assert_non_zeros(model.bias)
+
+
 def test_linear_readout() -> None:
     input = randn(2**8, 2**10, requires_grad=True)
     model = LinearReadout(2**10, 2**12)

0 commit comments
