diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 0000000..33f5165 --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,24 @@ +{ + "build": { + "dockerfile": "Dockerfile" + }, + "workspaceFolder": "/home/developer/unit-scaling", + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-toolsai.jupyter" + ], + "settings": { + "terminal.integrated.defaultProfile.linux": "zsh", + "terminal.integrated.profiles.linux": { "zsh": { "path": "/bin/zsh" } } + } + } + }, + "mounts": [ + "source=${localEnv:HOME}/.ssh,target=/home/developer/.ssh,type=bind,readonly=true", + "source=${localEnv:HOME}/.gitconfig,target=/home/developer/.gitconfig,type=bind,readonly=true", + "source=${localWorkspaceFolder},target=/home/developer/unit-scaling,type=bind" + ], + "remoteUser": "developer" +} diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..836364a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +* +!requirements*.txt diff --git a/.github/workflows/ci-ipu.yaml b/.github/workflows/ci-ipu.yaml deleted file mode 100644 index 620dced..0000000 --- a/.github/workflows/ci-ipu.yaml +++ /dev/null @@ -1,34 +0,0 @@ -name: CI-IPU - -on: - pull_request: - branches: - - "**ipu**" - - "**poptorch**" - workflow_dispatch: - -concurrency: - # Run everything on main, most-recent on PR builds - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - ci-ipu: - runs-on: ubuntu-latest - container: graphcore/pytorch:3.2.0-ubuntu-20.04 - timeout-minutes: 10 - steps: - - uses: actions/checkout@v3 - - name: Install dependencies - run: | - apt-get update - apt-get install -y git - pip install -r requirements-dev-ipu.txt - - name: Run CI - run: ./dev ci - - name: Publish documentation - if: ${{github.ref == 'refs/heads/main'}} - uses: Cecilapp/GitHub-Pages-deploy@v3 - env: { GITHUB_TOKEN: "${{ github.token }}" } - with: - build_dir: docs/_build/html diff --git 
a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8c44be9..62bc51f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -15,14 +15,16 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 10 steps: - - uses: actions/checkout@v3 - - name: Install dependencies + - name: Checkout code + uses: actions/checkout@v3 + + - name: Build Docker Image run: | - sudo apt-get update - sudo apt-get install -y git - pip install -r requirements-dev.txt + docker build -t unit-scaling-dev:latest . + - name: Run CI - run: ./dev ci + run: docker run --rm -v $(pwd):/home/developer/unit-scaling unit-scaling-dev:latest ./dev ci + - name: Publish documentation if: ${{github.ref == 'refs/heads/main'}} uses: Cecilapp/GitHub-Pages-deploy@v3 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ec6826a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +# Use PyTorch base image +FROM pytorch/pytorch:latest + +# Install additional dependencies +RUN apt-get update && apt-get install -y \ + git \ + vim \ + sudo \ + make \ + g++ \ + zsh \ + && chsh -s /bin/zsh \ + && apt-get clean && rm -rf /var/lib/apt/lists/* # cleanup (smaller image) + +# Configure a non-root user with sudo privileges +ARG USERNAME=developer # Change this to preferred username +ARG USER_UID=1001 +ARG USER_GID=$USER_UID +RUN groupadd --gid $USER_GID $USERNAME \ + && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ + && echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME +USER $USERNAME + +# Set working directory +WORKDIR /home/$USERNAME/unit-scaling + +# Puts pip install libs on $PATH & sets correct locale +ENV PATH="$PATH:/home/$USERNAME/.local/bin" \ + LC_ALL=C.UTF-8 + +# Install Python dependencies +COPY requirements-dev.txt . 
+RUN pip install -r requirements-dev.txt + +# Creates basic .zshrc +RUN sudo cp /etc/zsh/newuser.zshrc.recommended /home/$USERNAME/.zshrc + +CMD ["/bin/zsh"] diff --git a/README.md b/README.md index 9585e4e..24526da 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,22 @@ To install the `unit-scaling` library, run: pip install git+https://github.com/graphcore-research/unit-scaling.git ``` -For development on this repository, see [docs/development.md](docs/development.md). +## Development + +For development in this repository, we recommend using the provided docker container. +This image can be built and entered interactively using: + +```sh +docker build -t unit-scaling-dev:latest . +docker run -it --rm --user developer:developer -v $(pwd):/home/developer/unit-scaling unit-scaling-dev:latest +# To use git within the container, add `-v ~/.ssh:/home/developer/.ssh:ro -v ~/.gitconfig:/home/developer/.gitconfig:ro`. +``` + +For vscode users, this repo also contains a `.devcontainer.json` file, which enables the container to be used as a full-featured IDE (see the [Dev Container docs](https://code.visualstudio.com/docs/devcontainers/containers) for details on how to use this feature). + +Key development functionality is contained within the `./dev` script. This includes running unit tests, linting, formatting, documentation generation and more. Run `./dev --help` for the available options. Running `./dev` without arguments is equivalent to running `./dev ci`, which runs all of the available dev checks. This is the same command used for GitHub CI. + +We encourage pull requests from the community. Please reach out to us with any questions about contributing. ## What is u-μP? 
diff --git a/analysis/almost_scaled_dot_product_attention/demo_transformer.py b/analysis/almost_scaled_dot_product_attention/demo_transformer.py index ed09916..bcafbad 100644 --- a/analysis/almost_scaled_dot_product_attention/demo_transformer.py +++ b/analysis/almost_scaled_dot_product_attention/demo_transformer.py @@ -10,13 +10,6 @@ from torch import nn, Tensor import tqdm -try: - import poptorch - - poptorch_available = True -except ModuleNotFoundError: - poptorch_available = False - class Config(dict): def __init__(self, *args: Any, **kwargs: Any): @@ -132,7 +125,7 @@ def forward(self, indices: Tensor) -> Tensor: ) -def train_cpu() -> Tensor: +def train() -> Tensor: model = Model() opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr) losses = [] @@ -143,26 +136,3 @@ def train_cpu() -> Tensor: opt.step() losses.append(float(loss)) return torch.tensor(losses) - - -def train_ipu() -> Tensor: - model = Model() - options = poptorch.Options() - options.showCompilationProgressBar(False) - opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr) - session = poptorch.trainingModel(model, options, opt) - try: - return torch.tensor( - [ - float(session(batch.int())) - for batch in tqdm.tqdm( - islice(batches(), CONFIG.steps), total=CONFIG.steps - ) - ] - ) - finally: - session.destroy() - - -def train() -> Tensor: - return train_ipu() if poptorch_available else train_cpu() diff --git a/docs/development.md b/docs/development.md deleted file mode 100644 index 8f89f6e..0000000 --- a/docs/development.md +++ /dev/null @@ -1,40 +0,0 @@ -# Development - -For users who wish to develop using this codebase, the following setup is required: - -**First-time setup**: - -```bash -python3 -m venv .venv -echo "export PYTHONPATH=\${PYTHONPATH}:\$(dirname \${VIRTUAL_ENV})" >> .venv/bin/activate -source .venv/bin/activate -pip install -r requirements-dev.txt # Or requirements-dev-ipu.txt for the ipu -``` - -**Subsequent setup**: - -```bash -source .venv/bin/activate -``` - -**Run 
pre-flight checks** (or run `./dev --help` to see supported commands): - -```bash -./dev -``` - -**IDE recommendations**: - -- Python intepreter is set to `.venv/bin/python` -- Format-on-save enabled -- Consider a `.env` file for setting `PYTHONPATH`, for example `echo "PYTHONPATH=$(pwd)" > .env` - (note that this will be a different path if using devcontainers) - -**Docs development**: - -```bash -cd docs/ -make html -``` - -then view `docs/_build/html/index.html` in your browser. \ No newline at end of file diff --git a/docs/user_guide.rst b/docs/user_guide.rst index 40c47b9..a8ec322 100644 --- a/docs/user_guide.rst +++ b/docs/user_guide.rst @@ -55,13 +55,6 @@ The advantage of using a unit-scaled model is as follows: scales have stayed within range for all unit-scaled models tested thus far. 3. This can enable the use of smaller, more efficient number formats out-of-the-box, such as FP16 and even FP8. -4. As the behaviour of some ops depends on scale, unit-scaling a model can change its - training dynamics slightly. In some experiments this has been shown to lead to - loss decreasing faster, though further work is needed to validate this. - -For a more in-depth treatment of unit scaling, see our paper -`Unit Scaling: Out-of-the-Box Low-Precision Training (ICML, 2023) -`_. 
How to unit-scale a model diff --git a/requirements-dev-ipu.txt b/requirements-dev-ipu.txt deleted file mode 100644 index 4b041dc..0000000 --- a/requirements-dev-ipu.txt +++ /dev/null @@ -1,12 +0,0 @@ --r requirements.txt -black==24.3.0 -flake8==6.0.0 -isort==5.12.0 -mypy==1.0.1 -myst-parser==1.0.0 -poptorch-experimental-addons @ git+https://github.com/graphcore-research/poptorch-experimental-addons@beb12678d1e7ea2c033bd061d32167be262dfa58 -pytest==7.2.1 -pytest-cov==4.0.0 -sphinx==5.3.0 -sphinx-rtd-theme==1.2.0 -types-Pygments==2.15.0.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index b5b93b3..79b4f12 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,14 +1,28 @@ --r requirements.txt -black==24.3.0 -flake8==6.0.0 -isort==5.12.0 -mypy==1.2.0 -myst-parser==1.0.0 -pandas-stubs==2.0.2.230605 -pytest==7.3.1 -pytest-cov==4.0.0 -sphinx==6.2.1 -sphinx-rtd-theme==1.2.0 -transformers==4.38.0 -types-Pygments==2.15.0.0 -types-tabulate==0.9.0.2 \ No newline at end of file +# Look in pytorch-cpu first, then pypi second +--index-url https://download.pytorch.org/whl/cpu +--extra-index-url=https://pypi.org/simple + +# Same as requirements.txt, but with versions locked-in +datasets==3.1.0 +docstring-parser==0.16 +einops==0.8.0 +numpy==2.1.3 +seaborn==0.13.2 +tabulate==0.9.0 +torch==2.5.1+cpu + +# Additional dev requirements +black==24.10.0 +flake8==7.1.1 +isort==5.13.2 +mypy==1.13.0 +myst-parser==4.0.0 +pandas-stubs==2.2.3.241009 +pytest==8.3.3 +pytest-cov==6.0.0 +sphinx==8.1.3 +sphinx-rtd-theme==3.0.1 +transformers==4.46.1 +triton==3.1.0 +types-Pygments==2.18.0.20240506 +types-tabulate==0.9.0.20240106 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 98040e2..5271610 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ datasets docstring-parser einops -numpy<2.0 +numpy seaborn tabulate torch>=2.2 diff --git a/setup.cfg b/setup.cfg index dc68e0b..b91579c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,12 
+8,6 @@ show_error_codes = true strict = true check_untyped_defs = true -[mypy-poptorch.*] -ignore_missing_imports = True - -[mypy-poptorch_experimental_addons.*] -ignore_missing_imports = True - # As torch.fx doesn't explicitly export many of its useful modules. [mypy-torch.fx] implicit_reexport = True diff --git a/setup.py b/setup.py index aad8eec..8608eb2 100644 --- a/setup.py +++ b/setup.py @@ -5,15 +5,6 @@ import setuptools requirements = Path("requirements.txt").read_text().rstrip("\n").split("\n") -try: - import poptorch - - # This should match requirements-dev-ipu.txt - requirements.append( - "poptorch-experimental-addons @ git+https://github.com/graphcore-research/poptorch-experimental-addons@beb12678d1e7ea2c033bd061d32167be262dfa58" - ) -except ImportError: - pass version = re.search("__version__ = \"(.+)\"", Path("unit_scaling/_version.py").read_text()).group(1) diff --git a/unit_scaling/analysis.py b/unit_scaling/analysis.py index 8b5d526..1b47001 100644 --- a/unit_scaling/analysis.py +++ b/unit_scaling/analysis.py @@ -12,8 +12,8 @@ import matplotlib.colors import matplotlib.pyplot as plt import pandas as pd -import seaborn as sns # type: ignore[import] -from datasets import load_dataset # type: ignore[import] +import seaborn as sns # type: ignore[import-untyped] +from datasets import load_dataset # type: ignore[import-untyped] from torch import Tensor, nn from torch.fx.graph import Graph from torch.fx.node import Node diff --git a/unit_scaling/formats.py b/unit_scaling/formats.py index 58ec5a7..5389873 100644 --- a/unit_scaling/formats.py +++ b/unit_scaling/formats.py @@ -10,14 +10,6 @@ from ._internal_utils import generate__all__ -try: # pragma: no cover - import poptorch - import poptorch_experimental_addons as pea - - _poptorch_available = True -except ImportError: # pragma: no cover - _poptorch_available = False - Shape = Tuple[int, ...] 
@@ -68,14 +60,6 @@ def min_absolute_subnormal(self) -> float: def quantise(self, x: Tensor) -> Tensor: """Non-differentiably quantise the given tensor in this format.""" - if _poptorch_available and poptorch.isRunningOnIpu(): - return pea.quantise_fpx( # type: ignore[no-any-return] - x, - exponent_bits=self.exponent_bits, - mantissa_bits=self.mantissa_bits, - rounding=self.rounding, - ) # pragma: no cover - absmax = self.max_absolute_value downscale = 2.0 ** (127 - 2 ** (self.exponent_bits - 1)) mask = torch.tensor(2 ** (23 - self.mantissa_bits) - 1, device=x.device) @@ -108,13 +92,6 @@ def quantise(self, x: Tensor) -> Tensor: def quantise_fwd(self, x: Tensor) -> Tensor: """Quantise the given tensor in the forward pass only.""" - if _poptorch_available and poptorch.isRunningOnIpu(): - return pea.quantise_fpx_ste( # type: ignore[no-any-return] - x, - exponent_bits=self.exponent_bits, - mantissa_bits=self.mantissa_bits, - rounding=self.rounding, - ) # pragma: no cover class QuantiseForward(torch.autograd.Function): @staticmethod @@ -131,13 +108,6 @@ def backward( # type:ignore[override] def quantise_bwd(self, x: Tensor) -> Tensor: """Quantise the given tensor in the backward pass only.""" - if _poptorch_available and poptorch.isRunningOnIpu(): - return pea.quantise_fpx_grad( # type: ignore[no-any-return] - x, - exponent_bits=self.exponent_bits, - mantissa_bits=self.mantissa_bits, - rounding=self.rounding, - ) # pragma: no cover class QuantiseBackward(torch.autograd.Function): @staticmethod diff --git a/unit_scaling/optim.py b/unit_scaling/optim.py index 3183a9d..b80852f 100644 --- a/unit_scaling/optim.py +++ b/unit_scaling/optim.py @@ -123,8 +123,8 @@ def scaled_parameters( result = [] for entry in params: group = dict(params=[entry]) if isinstance(entry, Tensor) else entry.copy() - group.setdefault("lr", lr) - group.setdefault("weight_decay", weight_decay) + group.setdefault("lr", lr) # type: ignore[arg-type] + group.setdefault("weight_decay", weight_decay) # type: 
ignore[arg-type] if group["lr"] is None: raise ValueError( "scaled_params() requires lr to be provided," @@ -133,10 +133,10 @@ def scaled_parameters( for param in group["params"]: # Careful not to overwrite `lr` or `weight_decay` param_lr = group["lr"] - if has_parameter_data(param): + if has_parameter_data(param): # type: ignore[arg-type] if isinstance(param_lr, Tensor): param_lr = param_lr.clone() - param_lr *= lr_scale_func(param) + param_lr *= lr_scale_func(param) # type: ignore[operator] elif not allow_non_unit_scaling_params: raise ValueError( "Non-unit-scaling parameter (no mup_type)," @@ -145,7 +145,7 @@ def scaled_parameters( param_weight_decay = group["weight_decay"] if independent_weight_decay: # Note: only independent of peak LR, not of schedule - param_weight_decay /= float(param_lr) + param_weight_decay /= float(param_lr) # type: ignore result.append( dict( diff --git a/unit_scaling/scale.py b/unit_scaling/scale.py index b1360e3..a5828eb 100644 --- a/unit_scaling/scale.py +++ b/unit_scaling/scale.py @@ -11,14 +11,6 @@ from ._internal_utils import generate__all__ -try: # pragma: no cover - import poptorch - import poptorch_experimental_addons as pea - - _poptorch_available = True -except ImportError: # pragma: no cover - _poptorch_available = False - class _ScaledGrad(torch.autograd.Function): # pragma: no cover """Enables a custom backward method which has a different scale to forward.""" @@ -51,9 +43,6 @@ def _scale( t: Tensor, fwd_scale: float = 1.0, bwd_scale: float = 1.0 ) -> Tensor: # pragma: no cover """Given a tensor, applies a separate scale in the forward and backward pass.""" - - if _poptorch_available and poptorch.isRunningOnIpu(): - return pea.autograd_proxy(t * fwd_scale, t * bwd_scale) # type: ignore return _ScaledGrad.apply(t, fwd_scale, bwd_scale) # type: ignore diff --git a/unit_scaling/tests/test_analysis.py b/unit_scaling/tests/test_analysis.py index 0c31637..2b8a9c1 100644 --- a/unit_scaling/tests/test_analysis.py +++ 
b/unit_scaling/tests/test_analysis.py @@ -4,7 +4,7 @@ import torch.nn.functional as F from torch import Size, Tensor, nn, randn -from transformers import AutoTokenizer # type: ignore[import] +from transformers import AutoTokenizer # type: ignore[import-untyped] from ..analysis import _create_batch, _example_seqs, example_batch, plot, visualiser from ..transforms import track_scales