Merged (changes from 4 commits)
21 changes: 21 additions & 0 deletions .devcontainer.json
@@ -0,0 +1,21 @@
{
"build": {
"dockerfile": "Dockerfile"
},
"workspaceFolder": "/workspace",
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"ms-toolsai.jupyter"
],
"settings": {
"terminal.integrated.defaultProfile.linux": "zsh",
"terminal.integrated.profiles.linux": { "zsh": { "path": "/bin/zsh" } }
}
}
},
"mounts": [
"source=${localWorkspaceFolder},target=/workspace,type=bind"
]
}
2 changes: 2 additions & 0 deletions .dockerignore
@@ -0,0 +1,2 @@
*
!requirements*.txt
34 changes: 0 additions & 34 deletions .github/workflows/ci-ipu.yaml

This file was deleted.

16 changes: 9 additions & 7 deletions .github/workflows/ci.yaml
@@ -15,17 +15,19 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- uses: actions/checkout@v3
- name: Install dependencies
- name: Checkout code
uses: actions/checkout@v3

- name: Build Docker Image
run: |
sudo apt-get update
sudo apt-get install -y git
pip install -r requirements-dev.txt
docker build -t unit-scaling-dev:latest .

- name: Run CI
run: ./dev ci
run: docker run --rm -v $(pwd):/workspace unit-scaling-dev:latest ./dev ci

- name: Publish documentation
if: ${{github.ref == 'refs/heads/main'}}
uses: Cecilapp/GitHub-Pages-deploy@v3
env: { GITHUB_TOKEN: "${{ github.token }}" }
with:
build_dir: docs/_build/html
build_dir: docker run --rm -v $(pwd):/workspace unit-scaling-dev:latest docs/_build/html
Collaborator: Is this right - I thought it looked like a dir not a command?

Contributor (author): It's completely mad! No idea what's going on there. Removed

39 changes: 39 additions & 0 deletions Dockerfile
@@ -0,0 +1,39 @@
# Use PyTorch base image
FROM pytorch/pytorch:latest

# Install additional dependencies
RUN apt-get update && apt-get install -y \
git \
vim \
sudo \
make \
g++ \
zsh \
&& chsh -s /bin/zsh \
&& apt-get clean && rm -rf /var/lib/apt/lists/* # cleanup (smaller image)

# Set working directory
WORKDIR /workspace

# Install Python dependencies
COPY requirements-dev.txt .
RUN pip install -r requirements-dev.txt

# Configure a non-root user with sudo privileges
# Change this to your preferred username (a trailing comment on the ARG
# line itself would become part of the value)
ARG USERNAME=developer
ARG USER_UID=1001
ARG USER_GID=$USER_UID
RUN groupadd --gid $USER_GID $USERNAME \
&& useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
&& echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \
&& chmod 0440 /etc/sudoers.d/$USERNAME

# Put pip-installed executables on $PATH & set the correct locale
# (declared after ARG USERNAME so the variable is defined here)
ENV PATH="$PATH:/home/$USERNAME/.local/bin" \
LC_ALL=C.UTF-8

USER $USERNAME

# Creates basic .zshrc
RUN sudo cp /etc/zsh/newuser.zshrc.recommended /home/$USERNAME/.zshrc

CMD ["/bin/zsh"]
Collaborator: Why zsh, out of interest? Just personal preference? (No objection, just curious.)

32 changes: 1 addition & 31 deletions analysis/almost_scaled_dot_product_attention/demo_transformer.py
@@ -10,13 +10,6 @@
from torch import nn, Tensor
import tqdm

try:
import poptorch

poptorch_available = True
except ModuleNotFoundError:
poptorch_available = False


class Config(dict):
def __init__(self, *args: Any, **kwargs: Any):
@@ -132,7 +125,7 @@ def forward(self, indices: Tensor) -> Tensor:
)


def train_cpu() -> Tensor:
def train() -> Tensor:
model = Model()
opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr)
losses = []
@@ -143,26 +136,3 @@ def train_cpu() -> Tensor:
opt.step()
losses.append(float(loss))
return torch.tensor(losses)


def train_ipu() -> Tensor:
model = Model()
options = poptorch.Options()
options.showCompilationProgressBar(False)
opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr)
session = poptorch.trainingModel(model, options, opt)
try:
return torch.tensor(
[
float(session(batch.int()))
for batch in tqdm.tqdm(
islice(batches(), CONFIG.steps), total=CONFIG.steps
)
]
)
finally:
session.destroy()


def train() -> Tensor:
return train_ipu() if poptorch_available else train_cpu()
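
With the IPU path removed, the demo's only entry point is the CPU loop above. A minimal usage sketch follows (it assumes the script is importable as a module; `train` is the function in this diff):

```python
# Hedged sketch: run the simplified demo loop and inspect the losses.
# Assumes demo_transformer.py is on the import path.
from demo_transformer import train

losses = train()          # 1-D tensor of per-step training losses
print(float(losses[-1]))  # final loss
```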
2 changes: 1 addition & 1 deletion docs/development.md
@@ -8,7 +8,7 @@ For users who wish to develop using this codebase, the following setup is requir
python3 -m venv .venv
echo "export PYTHONPATH=\${PYTHONPATH}:\$(dirname \${VIRTUAL_ENV})" >> .venv/bin/activate
source .venv/bin/activate
pip install -r requirements-dev.txt # Or requirements-dev-ipu.txt for the ipu
pip install -r requirements-dev.txt
```

**Subsequent setup**:
12 changes: 0 additions & 12 deletions requirements-dev-ipu.txt

This file was deleted.

42 changes: 28 additions & 14 deletions requirements-dev.txt
@@ -1,14 +1,28 @@
-r requirements.txt
black==24.3.0
flake8==6.0.0
isort==5.12.0
mypy==1.2.0
myst-parser==1.0.0
pandas-stubs==2.0.2.230605
pytest==7.3.1
pytest-cov==4.0.0
sphinx==6.2.1
sphinx-rtd-theme==1.2.0
transformers==4.38.0
types-Pygments==2.15.0.0
types-tabulate==0.9.0.2
# Look in pytorch-cpu first, then pypi second
--index-url https://download.pytorch.org/whl/cpu
--extra-index-url=https://pypi.org/simple

# Same as requirements.txt, but with versions locked-in
datasets==3.1.0
docstring-parser==0.16
einops==0.8.0
numpy==2.1.3
seaborn==0.13.2
tabulate==0.9.0
torch==2.5.1+cpu

# Additional dev requirements
black==24.10.0
flake8==7.1.1
isort==5.13.2
mypy==1.13.0
myst-parser==4.0.0
pandas-stubs==2.2.3.241009
pytest==8.3.3
pytest-cov==6.0.0
sphinx==8.1.3
sphinx-rtd-theme==3.0.1
transformers==4.46.1
triton==3.1.0
types-Pygments==2.18.0.20240506
types-tabulate==0.9.0.20240106
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
datasets
docstring-parser
einops
numpy<2.0
numpy
seaborn
tabulate
torch>=2.2
6 changes: 0 additions & 6 deletions setup.cfg
@@ -8,12 +8,6 @@ show_error_codes = true
strict = true
check_untyped_defs = true

[mypy-poptorch.*]
ignore_missing_imports = True

[mypy-poptorch_experimental_addons.*]
ignore_missing_imports = True

# As torch.fx doesn't explicitly export many of its useful modules.
[mypy-torch.fx]
implicit_reexport = True
9 changes: 0 additions & 9 deletions setup.py
@@ -5,15 +5,6 @@
import setuptools

requirements = Path("requirements.txt").read_text().rstrip("\n").split("\n")
try:
import poptorch

# This should match requirements-dev-ipu.txt
requirements.append(
"poptorch-experimental-addons @ git+https://github.com/graphcore-research/poptorch-experimental-addons@beb12678d1e7ea2c033bd061d32167be262dfa58"
)
except ImportError:
pass

version = re.search("__version__ = \"(.+)\"", Path("unit_scaling/_version.py").read_text()).group(1)

4 changes: 2 additions & 2 deletions unit_scaling/analysis.py
@@ -12,8 +12,8 @@
import matplotlib.colors
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns # type: ignore[import]
from datasets import load_dataset # type: ignore[import]
import seaborn as sns # type: ignore[import-untyped]
from datasets import load_dataset # type: ignore[import-untyped]
from torch import Tensor, nn
from torch.fx.graph import Graph
from torch.fx.node import Node
30 changes: 0 additions & 30 deletions unit_scaling/formats.py
@@ -10,14 +10,6 @@

from ._internal_utils import generate__all__

try: # pragma: no cover
import poptorch
import poptorch_experimental_addons as pea

_poptorch_available = True
except ImportError: # pragma: no cover
_poptorch_available = False

Shape = Tuple[int, ...]


@@ -68,14 +60,6 @@ def min_absolute_subnormal(self) -> float:

def quantise(self, x: Tensor) -> Tensor:
"""Non-differentiably quantise the given tensor in this format."""
if _poptorch_available and poptorch.isRunningOnIpu():
return pea.quantise_fpx( # type: ignore[no-any-return]
x,
exponent_bits=self.exponent_bits,
mantissa_bits=self.mantissa_bits,
rounding=self.rounding,
) # pragma: no cover

absmax = self.max_absolute_value
downscale = 2.0 ** (127 - 2 ** (self.exponent_bits - 1))
mask = torch.tensor(2 ** (23 - self.mantissa_bits) - 1, device=x.device)
@@ -108,13 +92,6 @@ def quantise(self, x: Tensor) -> Tensor:

def quantise_fwd(self, x: Tensor) -> Tensor:
"""Quantise the given tensor in the forward pass only."""
if _poptorch_available and poptorch.isRunningOnIpu():
return pea.quantise_fpx_ste( # type: ignore[no-any-return]
x,
exponent_bits=self.exponent_bits,
mantissa_bits=self.mantissa_bits,
rounding=self.rounding,
) # pragma: no cover

class QuantiseForward(torch.autograd.Function):
@staticmethod
@@ -131,13 +108,6 @@ def backward( # type:ignore[override]

def quantise_bwd(self, x: Tensor) -> Tensor:
"""Quantise the given tensor in the backward pass only."""
if _poptorch_available and poptorch.isRunningOnIpu():
return pea.quantise_fpx_grad( # type: ignore[no-any-return]
x,
exponent_bits=self.exponent_bits,
mantissa_bits=self.mantissa_bits,
rounding=self.rounding,
) # pragma: no cover

class QuantiseBackward(torch.autograd.Function):
@staticmethod
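
With the poptorch branches gone, `quantise`, `quantise_fwd` and `quantise_bwd` always take the pure-PyTorch path above. A hedged usage sketch follows; `FPFormat` and its constructor arguments are assumptions pieced together from the `exponent_bits`/`mantissa_bits` attributes in this diff, so check `unit_scaling/formats.py` for the real names:

```python
# Hedged sketch: quantise a tensor with an FP8-like format.
# FPFormat and its constructor are assumed -- this diff only shows the
# exponent_bits / mantissa_bits attributes and the quantise* methods.
import torch
from unit_scaling.formats import FPFormat

fmt = FPFormat(exponent_bits=4, mantissa_bits=3)  # E4M3-style format
x = torch.randn(16)
y = fmt.quantise(x)          # non-differentiable quantisation
y_fwd = fmt.quantise_fwd(x)  # quantise in the forward pass only
y_bwd = fmt.quantise_bwd(x)  # quantise in the backward pass only
```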
10 changes: 5 additions & 5 deletions unit_scaling/optim.py
@@ -123,8 +123,8 @@ def scaled_parameters(
result = []
for entry in params:
group = dict(params=[entry]) if isinstance(entry, Tensor) else entry.copy()
group.setdefault("lr", lr)
group.setdefault("weight_decay", weight_decay)
group.setdefault("lr", lr) # type: ignore[arg-type]
group.setdefault("weight_decay", weight_decay) # type: ignore[arg-type]
if group["lr"] is None:
raise ValueError(
"scaled_params() requires lr to be provided,"
@@ -133,10 +133,10 @@
for param in group["params"]:
# Careful not to overwrite `lr` or `weight_decay`
param_lr = group["lr"]
if has_parameter_data(param):
if has_parameter_data(param): # type: ignore[arg-type]
if isinstance(param_lr, Tensor):
param_lr = param_lr.clone()
param_lr *= lr_scale_func(param)
param_lr *= lr_scale_func(param) # type: ignore[operator]
elif not allow_non_unit_scaling_params:
raise ValueError(
"Non-unit-scaling parameter (no mup_type),"
@@ -145,7 +145,7 @@
param_weight_decay = group["weight_decay"]
if independent_weight_decay:
# Note: only independent of peak LR, not of schedule
param_weight_decay /= float(param_lr)
param_weight_decay /= float(param_lr) # type: ignore

result.append(
dict(
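
For context, here is a hedged sketch of how `scaled_parameters` feeds a stock optimizer. The keyword names come from this diff (`lr_scale_func`, `independent_weight_decay`, `allow_non_unit_scaling_params`), but the full signature is an assumption; see `unit_scaling/optim.py`:

```python
# Hedged sketch: build scaled parameter groups for a standard optimizer.
# The keyword names below appear in this diff, but the exact signature
# of scaled_parameters is an assumption.
import torch
from unit_scaling.optim import scaled_parameters

model = torch.nn.Linear(16, 16)
params = scaled_parameters(
    model.parameters(),
    lr_scale_func=lambda p: 1.0,  # per-parameter LR scaling, as called above
    lr=1e-3,                      # must be provided, else ValueError (see above)
    weight_decay=1e-2,
    # Plain torch parameters carry no mup_type, which would otherwise raise:
    allow_non_unit_scaling_params=True,
)
opt = torch.optim.AdamW(params)
```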