diff --git a/.devcontainer.json b/.devcontainer.json
new file mode 100644
index 0000000..33f5165
--- /dev/null
+++ b/.devcontainer.json
@@ -0,0 +1,24 @@
+{
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "workspaceFolder": "/home/developer/unit-scaling",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-toolsai.jupyter"
+            ],
+            "settings": {
+                "terminal.integrated.defaultProfile.linux": "zsh",
+                "terminal.integrated.profiles.linux": { "zsh": { "path": "/bin/zsh" } }
+            }
+        }
+    },
+    "mounts": [
+        "source=${localEnv:HOME}/.ssh,target=/home/developer/.ssh,type=bind,readonly=true",
+        "source=${localEnv:HOME}/.gitconfig,target=/home/developer/.gitconfig,type=bind,readonly=true",
+        "source=${localWorkspaceFolder},target=/home/developer/unit-scaling,type=bind"
+    ],
+    "remoteUser": "developer"
+}
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..836364a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+*
+!requirements*.txt
diff --git a/.github/workflows/ci-ipu.yaml b/.github/workflows/ci-ipu.yaml
deleted file mode 100644
index 620dced..0000000
--- a/.github/workflows/ci-ipu.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: CI-IPU
-
-on:
-  pull_request:
-    branches:
-      - "**ipu**"
-      - "**poptorch**"
-  workflow_dispatch:
-
-concurrency:
-  # Run everything on main, most-recent on PR builds
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ci-ipu:
-    runs-on: ubuntu-latest
-    container: graphcore/pytorch:3.2.0-ubuntu-20.04
-    timeout-minutes: 10
-    steps:
-      - uses: actions/checkout@v3
-      - name: Install dependencies
-        run: |
-          apt-get update
-          apt-get install -y git
-          pip install -r requirements-dev-ipu.txt
-      - name: Run CI
-        run: ./dev ci
-      - name: Publish documentation
-        if: ${{github.ref == 'refs/heads/main'}}
-        uses: Cecilapp/GitHub-Pages-deploy@v3
-        env: { GITHUB_TOKEN: "${{ github.token }}" }
-        with:
-          build_dir: docs/_build/html
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8c44be9..62bc51f 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -15,14 +15,16 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 10
     steps:
-      - uses: actions/checkout@v3
-      - name: Install dependencies
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Build Docker Image
         run: |
-          sudo apt-get update
-          sudo apt-get install -y git
-          pip install -r requirements-dev.txt
+          docker build -t unit-scaling-dev:latest .
+
       - name: Run CI
-        run: ./dev ci
+        run: docker run --rm -v $(pwd):/home/developer/unit-scaling unit-scaling-dev:latest ./dev ci
+
       - name: Publish documentation
         if: ${{github.ref == 'refs/heads/main'}}
         uses: Cecilapp/GitHub-Pages-deploy@v3
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ec6826a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,39 @@
+# Use PyTorch base image
+FROM pytorch/pytorch:latest
+
+# Install additional dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    vim \
+    sudo \
+    make \
+    g++ \
+    zsh \
+    && chsh -s /bin/zsh \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*   # cleanup (smaller image)
+
+# Configure a non-root user with passwordless sudo (set USERNAME to the preferred username)
+ARG USERNAME=developer
+ARG USER_UID=1001
+ARG USER_GID=$USER_UID
+RUN groupadd --gid $USER_GID $USERNAME \
+    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
+    && echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \
+    && chmod 0440 /etc/sudoers.d/$USERNAME
+USER $USERNAME
+
+# Set working directory
+WORKDIR /home/$USERNAME/unit-scaling
+
+# Puts pip install libs on $PATH & sets correct locale
+ENV PATH="$PATH:/home/$USERNAME/.local/bin" \
+    LC_ALL=C.UTF-8
+
+# Install Python dependencies
+COPY requirements-dev.txt .
+RUN pip install -r requirements-dev.txt
+
+# Creates basic .zshrc
+RUN sudo cp /etc/zsh/newuser.zshrc.recommended /home/$USERNAME/.zshrc
+
+CMD ["/bin/zsh"]
diff --git a/README.md b/README.md
index 9585e4e..24526da 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,22 @@ To install the `unit-scaling` library, run:
 pip install git+https://github.com/graphcore-research/unit-scaling.git
 ```
 
-For development on this repository, see [docs/development.md](docs/development.md).
+## Development
+
+For development in this repository, we recommend using the provided Docker container.
+The image can be built and entered interactively using:
+
+```sh
+docker build -t unit-scaling-dev:latest .
+docker run -it --rm --user developer:developer -v $(pwd):/home/developer/unit-scaling unit-scaling-dev:latest
+# To use git within the container, add `-v ~/.ssh:/home/developer/.ssh:ro -v ~/.gitconfig:/home/developer/.gitconfig:ro`.
+```
+
+For VS Code users, this repo also contains a `.devcontainer.json` file, which enables the container to be used as a full-featured IDE (see the [Dev Container docs](https://code.visualstudio.com/docs/devcontainers/containers) for details on how to use this feature).
+
+Key development functionality is contained within the `./dev` script. This includes running unit tests, linting, formatting, documentation generation and more. Run `./dev --help` for the available options. Running `./dev` without arguments is equivalent to running `./dev ci`, which runs all of the available dev checks. This is the check used for GitHub CI.
+
+We encourage pull requests from the community. Please reach out to us with any questions about contributing.
 
 ## What is u-μP?
 
diff --git a/analysis/almost_scaled_dot_product_attention/demo_transformer.py b/analysis/almost_scaled_dot_product_attention/demo_transformer.py
index ed09916..bcafbad 100644
--- a/analysis/almost_scaled_dot_product_attention/demo_transformer.py
+++ b/analysis/almost_scaled_dot_product_attention/demo_transformer.py
@@ -10,13 +10,6 @@
 from torch import nn, Tensor
 import tqdm
 
-try:
-    import poptorch
-
-    poptorch_available = True
-except ModuleNotFoundError:
-    poptorch_available = False
-
 
 class Config(dict):
     def __init__(self, *args: Any, **kwargs: Any):
@@ -132,7 +125,7 @@ def forward(self, indices: Tensor) -> Tensor:
         )
 
 
-def train_cpu() -> Tensor:
+def train() -> Tensor:
     model = Model()
     opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr)
     losses = []
@@ -143,26 +136,3 @@ def train_cpu() -> Tensor:
         opt.step()
         losses.append(float(loss))
     return torch.tensor(losses)
-
-
-def train_ipu() -> Tensor:
-    model = Model()
-    options = poptorch.Options()
-    options.showCompilationProgressBar(False)
-    opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr)
-    session = poptorch.trainingModel(model, options, opt)
-    try:
-        return torch.tensor(
-            [
-                float(session(batch.int()))
-                for batch in tqdm.tqdm(
-                    islice(batches(), CONFIG.steps), total=CONFIG.steps
-                )
-            ]
-        )
-    finally:
-        session.destroy()
-
-
-def train() -> Tensor:
-    return train_ipu() if poptorch_available else train_cpu()
diff --git a/docs/development.md b/docs/development.md
deleted file mode 100644
index 8f89f6e..0000000
--- a/docs/development.md
+++ /dev/null
@@ -1,40 +0,0 @@
-# Development
-
-For users who wish to develop using this codebase, the following setup is required:
-
-**First-time setup**:
-
-```bash
-python3 -m venv .venv
-echo "export PYTHONPATH=\${PYTHONPATH}:\$(dirname \${VIRTUAL_ENV})" >> .venv/bin/activate
-source .venv/bin/activate
-pip install -r requirements-dev.txt  # Or requirements-dev-ipu.txt for the ipu
-```
-
-**Subsequent setup**:
-
-```bash
-source .venv/bin/activate
-```
-
-**Run pre-flight checks** (or run `./dev --help` to see supported commands):
-
-```bash
-./dev
-```
-
-**IDE recommendations**:
-
-- Python intepreter is set to `.venv/bin/python`
-- Format-on-save enabled
-- Consider a `.env` file for setting `PYTHONPATH`, for example `echo "PYTHONPATH=$(pwd)" > .env`
-  (note that this will be a different path if using devcontainers)
-
-**Docs development**:
-
-```bash
-cd docs/
-make html
-```
-
-then view `docs/_build/html/index.html` in your browser.
\ No newline at end of file
diff --git a/docs/user_guide.rst b/docs/user_guide.rst
index 40c47b9..a8ec322 100644
--- a/docs/user_guide.rst
+++ b/docs/user_guide.rst
@@ -55,13 +55,6 @@ The advantage of using a unit-scaled model is as follows:
    scales have stayed within range for all unit-scaled models tested thus far.
 3. This can enable the use of smaller, more efficient number formats out-of-the-box,
    such as FP16 and even FP8.
-4. As the behaviour of some ops depends on scale, unit-scaling a model can change its
-   training dynamics slightly. In some experiments this has been shown to lead to
-   loss decreasing faster, though further work is needed to validate this.
-
-For a more in-depth treatment of unit scaling, see our paper
-`Unit Scaling: Out-of-the-Box Low-Precision Training (ICML, 2023)
-<https://arxiv.org/abs/2303.11257>`_.
 
 
 How to unit-scale a model
diff --git a/requirements-dev-ipu.txt b/requirements-dev-ipu.txt
deleted file mode 100644
index 4b041dc..0000000
--- a/requirements-dev-ipu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
--r requirements.txt
-black==24.3.0
-flake8==6.0.0
-isort==5.12.0
-mypy==1.0.1
-myst-parser==1.0.0
-poptorch-experimental-addons @ git+https://github.com/graphcore-research/poptorch-experimental-addons@beb12678d1e7ea2c033bd061d32167be262dfa58
-pytest==7.2.1
-pytest-cov==4.0.0
-sphinx==5.3.0
-sphinx-rtd-theme==1.2.0
-types-Pygments==2.15.0.0
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b5b93b3..79b4f12 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,14 +1,28 @@
--r requirements.txt
-black==24.3.0
-flake8==6.0.0
-isort==5.12.0
-mypy==1.2.0
-myst-parser==1.0.0
-pandas-stubs==2.0.2.230605
-pytest==7.3.1
-pytest-cov==4.0.0
-sphinx==6.2.1
-sphinx-rtd-theme==1.2.0
-transformers==4.38.0
-types-Pygments==2.15.0.0
-types-tabulate==0.9.0.2
\ No newline at end of file
+# Search both the pytorch-cpu index and PyPI (pip applies no priority order between them)
+--index-url https://download.pytorch.org/whl/cpu
+--extra-index-url=https://pypi.org/simple
+
+# Same as requirements.txt, but with versions locked-in
+datasets==3.1.0
+docstring-parser==0.16
+einops==0.8.0
+numpy==2.1.3
+seaborn==0.13.2
+tabulate==0.9.0
+torch==2.5.1+cpu
+
+# Additional dev requirements
+black==24.10.0
+flake8==7.1.1
+isort==5.13.2
+mypy==1.13.0
+myst-parser==4.0.0
+pandas-stubs==2.2.3.241009
+pytest==8.3.3
+pytest-cov==6.0.0
+sphinx==8.1.3
+sphinx-rtd-theme==3.0.1
+transformers==4.46.1
+triton==3.1.0
+types-Pygments==2.18.0.20240506
+types-tabulate==0.9.0.20240106
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 98040e2..5271610 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 datasets
 docstring-parser
 einops
-numpy<2.0
+numpy
 seaborn
 tabulate
 torch>=2.2
diff --git a/setup.cfg b/setup.cfg
index dc68e0b..b91579c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -8,12 +8,6 @@ show_error_codes = true
 strict = true
 check_untyped_defs = true
 
-[mypy-poptorch.*]
-ignore_missing_imports = True
-
-[mypy-poptorch_experimental_addons.*]
-ignore_missing_imports = True
-
 # As torch.fx doesn't explicitly export many of its useful modules.
 [mypy-torch.fx]
 implicit_reexport = True
diff --git a/setup.py b/setup.py
index aad8eec..8608eb2 100644
--- a/setup.py
+++ b/setup.py
@@ -5,15 +5,6 @@
 import setuptools
 
 requirements = Path("requirements.txt").read_text().rstrip("\n").split("\n")
-try:
-    import poptorch
-
-    # This should match requirements-dev-ipu.txt
-    requirements.append(
-        "poptorch-experimental-addons @ git+https://github.com/graphcore-research/poptorch-experimental-addons@beb12678d1e7ea2c033bd061d32167be262dfa58"
-    )
-except ImportError:
-    pass
 
 version = re.search("__version__ = \"(.+)\"", Path("unit_scaling/_version.py").read_text()).group(1)
 
diff --git a/unit_scaling/analysis.py b/unit_scaling/analysis.py
index 8b5d526..1b47001 100644
--- a/unit_scaling/analysis.py
+++ b/unit_scaling/analysis.py
@@ -12,8 +12,8 @@
 import matplotlib.colors
 import matplotlib.pyplot as plt
 import pandas as pd
-import seaborn as sns  # type: ignore[import]
-from datasets import load_dataset  # type: ignore[import]
+import seaborn as sns  # type: ignore[import-untyped]
+from datasets import load_dataset  # type: ignore[import-untyped]
 from torch import Tensor, nn
 from torch.fx.graph import Graph
 from torch.fx.node import Node
diff --git a/unit_scaling/formats.py b/unit_scaling/formats.py
index 58ec5a7..5389873 100644
--- a/unit_scaling/formats.py
+++ b/unit_scaling/formats.py
@@ -10,14 +10,6 @@
 
 from ._internal_utils import generate__all__
 
-try:  # pragma: no cover
-    import poptorch
-    import poptorch_experimental_addons as pea
-
-    _poptorch_available = True
-except ImportError:  # pragma: no cover
-    _poptorch_available = False
-
 Shape = Tuple[int, ...]
 
 
@@ -68,14 +60,6 @@ def min_absolute_subnormal(self) -> float:
 
     def quantise(self, x: Tensor) -> Tensor:
         """Non-differentiably quantise the given tensor in this format."""
-        if _poptorch_available and poptorch.isRunningOnIpu():
-            return pea.quantise_fpx(  # type: ignore[no-any-return]
-                x,
-                exponent_bits=self.exponent_bits,
-                mantissa_bits=self.mantissa_bits,
-                rounding=self.rounding,
-            )  # pragma: no cover
-
         absmax = self.max_absolute_value
         downscale = 2.0 ** (127 - 2 ** (self.exponent_bits - 1))
         mask = torch.tensor(2 ** (23 - self.mantissa_bits) - 1, device=x.device)
@@ -108,13 +92,6 @@ def quantise(self, x: Tensor) -> Tensor:
 
     def quantise_fwd(self, x: Tensor) -> Tensor:
         """Quantise the given tensor in the forward pass only."""
-        if _poptorch_available and poptorch.isRunningOnIpu():
-            return pea.quantise_fpx_ste(  # type: ignore[no-any-return]
-                x,
-                exponent_bits=self.exponent_bits,
-                mantissa_bits=self.mantissa_bits,
-                rounding=self.rounding,
-            )  # pragma: no cover
 
         class QuantiseForward(torch.autograd.Function):
             @staticmethod
@@ -131,13 +108,6 @@ def backward(  # type:ignore[override]
 
     def quantise_bwd(self, x: Tensor) -> Tensor:
         """Quantise the given tensor in the backward pass only."""
-        if _poptorch_available and poptorch.isRunningOnIpu():
-            return pea.quantise_fpx_grad(  # type: ignore[no-any-return]
-                x,
-                exponent_bits=self.exponent_bits,
-                mantissa_bits=self.mantissa_bits,
-                rounding=self.rounding,
-            )  # pragma: no cover
 
         class QuantiseBackward(torch.autograd.Function):
             @staticmethod
diff --git a/unit_scaling/optim.py b/unit_scaling/optim.py
index 3183a9d..b80852f 100644
--- a/unit_scaling/optim.py
+++ b/unit_scaling/optim.py
@@ -123,8 +123,8 @@ def scaled_parameters(
     result = []
     for entry in params:
         group = dict(params=[entry]) if isinstance(entry, Tensor) else entry.copy()
-        group.setdefault("lr", lr)
-        group.setdefault("weight_decay", weight_decay)
+        group.setdefault("lr", lr)  # type: ignore[arg-type]
+        group.setdefault("weight_decay", weight_decay)  # type: ignore[arg-type]
         if group["lr"] is None:
             raise ValueError(
                 "scaled_params() requires lr to be provided,"
@@ -133,10 +133,10 @@ def scaled_parameters(
         for param in group["params"]:
             # Careful not to overwrite `lr` or `weight_decay`
             param_lr = group["lr"]
-            if has_parameter_data(param):
+            if has_parameter_data(param):  # type: ignore[arg-type]
                 if isinstance(param_lr, Tensor):
                     param_lr = param_lr.clone()
-                param_lr *= lr_scale_func(param)
+                param_lr *= lr_scale_func(param)  # type: ignore[operator]
             elif not allow_non_unit_scaling_params:
                 raise ValueError(
                     "Non-unit-scaling parameter (no mup_type),"
@@ -145,7 +145,7 @@ def scaled_parameters(
             param_weight_decay = group["weight_decay"]
             if independent_weight_decay:
                 # Note: only independent of peak LR, not of schedule
-                param_weight_decay /= float(param_lr)
+                param_weight_decay /= float(param_lr)  # type: ignore
 
             result.append(
                 dict(
diff --git a/unit_scaling/scale.py b/unit_scaling/scale.py
index b1360e3..a5828eb 100644
--- a/unit_scaling/scale.py
+++ b/unit_scaling/scale.py
@@ -11,14 +11,6 @@
 
 from ._internal_utils import generate__all__
 
-try:  # pragma: no cover
-    import poptorch
-    import poptorch_experimental_addons as pea
-
-    _poptorch_available = True
-except ImportError:  # pragma: no cover
-    _poptorch_available = False
-
 
 class _ScaledGrad(torch.autograd.Function):  # pragma: no cover
     """Enables a custom backward method which has a different scale to forward."""
@@ -51,9 +43,6 @@ def _scale(
     t: Tensor, fwd_scale: float = 1.0, bwd_scale: float = 1.0
 ) -> Tensor:  # pragma: no cover
     """Given a tensor, applies a separate scale in the forward and backward pass."""
-
-    if _poptorch_available and poptorch.isRunningOnIpu():
-        return pea.autograd_proxy(t * fwd_scale, t * bwd_scale)  # type: ignore
     return _ScaledGrad.apply(t, fwd_scale, bwd_scale)  # type: ignore
 
 
diff --git a/unit_scaling/tests/test_analysis.py b/unit_scaling/tests/test_analysis.py
index 0c31637..2b8a9c1 100644
--- a/unit_scaling/tests/test_analysis.py
+++ b/unit_scaling/tests/test_analysis.py
@@ -4,7 +4,7 @@
 
 import torch.nn.functional as F
 from torch import Size, Tensor, nn, randn
-from transformers import AutoTokenizer  # type: ignore[import]
+from transformers import AutoTokenizer  # type: ignore[import-untyped]
 
 from ..analysis import _create_batch, _example_seqs, example_batch, plot, visualiser
 from ..transforms import track_scales