local_sgd: initial version of fault tolerant LocalSGD

d4l3k · d4l3k · commit a4bb97166f68 · 2024-12-18T11:30:11.000-08:00
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -17,6 +17,7 @@ the entire training job.
     manager
     optim
     ddp
+    local_sgd
     data
     checkpointing
     parameter_server
diff --git a/docs/source/local_sgd.rst b/docs/source/local_sgd.rst
@@ -0,0 +1,4 @@
+.. automodule:: torchft.local_sgd
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/torchft/local_sgd.py b/torchft/local_sgd.py
@@ -0,0 +1,198 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+LocalSGD
+=========
+
+This module implements a fault tolerant version of LocalSGD and related methods.
+"""
+
+from typing import Any, Dict, Mapping, Optional
+
+import torch
+from torch import nn
+
+from torchft.manager import Manager
+
+
+class LocalSGD(nn.Module):
+    """
+    LocalSGD is a model wrapper similar to DistributedDataParallel that
+    implements the algorithm described in https://arxiv.org/pdf/1805.09767
+
+    This will synchronize the model parameters periodically in a fault tolerant
+    way using a torchft Manager.
+
+    This expects you to call step() on every step of the training loop after
+    the optimizer step. This will then allreduce the model weights every
+    ``sync_every`` steps.
+
+    To be safe and fault tolerant, this requires a backup copy of the
+    weights. By default these are stored in CPU memory. If any error occurs
+    during the LocalSGD step, the step will be discarded and the model
+    parameters will reset back to the last time LocalSGD synchronized.
+
+    The backup weights could be eliminated by relaxing the guarantee of exactly
+    ``sync_every`` steps but that would diverge from the LocalSGD algorithm.
+    DiLoCo also needs this backup copy to compute the delta.
+
+    TODO: add DiLoCo support
+
+    The torchft quorum is computed at the beginning of ``sync_every`` steps. If
+    any error occurs, or a worker fails between syncs, ``sync_every`` steps will be
+    discarded and a new quorum will be computed on the next step.
+
+    TODO: add a way via Manager to detect workers heartbeats failing early
+
+    If running in async mode, on a joining worker the first ``sync_every`` steps
+    will be discarded as the model will be recovering during that period. When
+    using sync mode, the checkpoint will be restored prior to the first step.
+    """
+
+    def __init__(
+        self,
+        manager: Manager,
+        model: nn.Module,
+        sync_every: int,
+        backup_device: Optional[torch.device] = None,
+    ) -> None:
+        """
+        Args:
+            manager: The manager to use.
+            model: The model to wrap.
+            sync_every: How many local steps to take between weight
+                synchronizations. Must be >= 1.
+            backup_device: The device to store the backup of the model parameters on. (default cpu)
+        """
+        super().__init__()
+
+        # Validate before any state is stored or backup buffers are allocated.
+        assert sync_every >= 1, "sync_every must be greater than or equal to 1"
+
+        self._manager = manager
+        self._model = model
+        self._local_step = 0
+        self._started_step = False
+        self._sync_every = sync_every
+
+        device = backup_device or torch.device("cpu")
+
+        self._backup_parameters: Dict[str, torch.Tensor] = {}
+
+        for name, p in self._model.named_parameters():
+            t = torch.empty(*tuple(p.shape), dtype=p.dtype, device=device)
+            # Page-locked (pinned) host memory is allocated through the
+            # accelerator runtime, so only request it when CUDA is available;
+            # otherwise keep a regular pageable buffer.
+            if t.device.type == "cpu" and torch.cuda.is_available():
+                t = t.pin_memory()
+            self._backup_parameters[name] = t
+
+        # Need to copy the parameters to the host to be safe if we are on the first step.
+        self._save_parameters()
+
+    def _save_parameters(self) -> None:
+        """
+        Copy the current model parameters into the backup buffers.
+        """
+        # TODO: consider running copy on a separate stream
+        for name, p in self._model.named_parameters():
+            self._backup_parameters[name].copy_(p.data, non_blocking=True)
+
+    def _restore_parameters(self) -> None:
+        """
+        Overwrite the model parameters in place with the backup buffers.
+        """
+        # TODO: consider running copy on a separate stream
+        for name, p in self._model.named_parameters():
+            p.data.copy_(self._backup_parameters[name], non_blocking=True)
+
+    # pyre-fixme[14]: support state_dict args
+    def state_dict(self) -> Dict[str, object]:
+        """
+        state_dict returns the state_dict from the last time LocalSGD
+        synchronized and not the current weights.
+
+        Only the parameter entries are replaced; any other entries of the
+        wrapped model's state_dict are returned as is. The parameter entries
+        are the backup tensors themselves, not copies, so they change the
+        next time the backup is refreshed.
+        """
+        state_dict = self._model.state_dict()
+        for name, p in self._backup_parameters.items():
+            assert name in state_dict
+            state_dict[name] = p
+        return state_dict
+
+    def load_state_dict(
+        self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False
+    ) -> None:
+        """
+        Loads the state dict to the model and the backup parameters.
+
+        This must be called while the model weights aren't being modified to
+        avoid corrupting the backup weights.
+        """
+        self._model.load_state_dict(state_dict, strict=strict, assign=assign)
+        self._save_parameters()
+
+    def forward(self, *args: object, **kwargs: object) -> object:
+        """
+        Run the forward pass of the wrapped model.
+
+        This should be called before the optimizer step.
+
+        This will start the quorum if this is the first step since the last
+        sync.
+        """
+        if self._local_step == 0:
+            self._manager.start_quorum()
+
+        self._started_step = True
+
+        # Call the module instead of .forward() so that hooks registered on
+        # the wrapped model still run.
+        return self._model(*args, **kwargs)
+
+    def step(self) -> None:
+        """
+        This should be called after the optimizer step.
+
+        This will call the allreduce on the model weights every sync_every steps.
+        If any errors occur it will restore to the weights from the previous sync.
+
+        ``forward`` must be called before this function.
+        """
+        assert self._started_step, "forward must be called before step"
+        self._started_step = False
+
+        self._local_step += 1
+
+        if self._local_step >= self._sync_every:
+            self._local_step = 0
+            self._average()
+
+            if self._manager.should_commit():
+                # save the parameters so we can restore from them later if necessary.
+                self._save_parameters()
+            else:
+                # commit failed, restore from the backup parameters
+                self._restore_parameters()
+
+    def _average(self) -> None:
+        """
+        Allreduce every model parameter through the manager and wait for all
+        of the resulting work handles.
+        """
+        # TODO: do we need to broadcast buffers like DDP does?
+
+        # TODO: bucketize parameters
+        # Launch every allreduce before waiting on any of them.
+        works = [self._manager.allreduce_grad(p) for p in self._model.parameters()]
+
+        for work in works:
+            work.wait()
diff --git a/torchft/local_sgd_test.py b/torchft/local_sgd_test.py
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+from unittest import TestCase
+from unittest.mock import create_autospec
+
+import torch
+from torch import nn, optim
+
+from torchft.local_sgd import LocalSGD
+from torchft.manager import Manager
+
+
+class SimpleModel(nn.Module):
+    """Small MLP (3 -> 4 -> 5) used as the model under test."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.model = nn.Sequential(
+            nn.Linear(3, 4),
+            nn.ReLU(),
+            nn.Linear(4, 5),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.model(x)
+
+
+def _params_dict(m: torch.nn.Module) -> Dict[str, torch.Tensor]:
+    """Map each parameter name of ``m`` to its underlying ``.data`` tensor."""
+    return {name: p.data for name, p in m.named_parameters()}
+
+
+def _copy_state_dict(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """Return a copy of ``state_dict`` whose tensors are detached clones."""
+    return {name: value.clone().detach() for name, value in state_dict.items()}
+
+
+def _train_step(m: LocalSGD, optimizer: optim.Optimizer, inp: torch.Tensor) -> None:
+    """Run one forward/backward pass, the optimizer step and ``m.step()``."""
+    loss = m(inp).mean()
+    loss.backward()
+    optimizer.step()
+    m.step()
+
+
+class LocalSGDTest(TestCase):
+    def test_local_sgd_healthy(self) -> None:
+        """A committed sync refreshes the backup and allreduces each parameter."""
+        base_m = SimpleModel()
+        optimizer = optim.SGD(base_m.parameters())
+        manager = create_autospec(Manager)
+        manager.should_commit.return_value = True
+
+        m = LocalSGD(manager, base_m, sync_every=2)
+
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
+
+        inp = torch.rand(2, 3)
+
+        _train_step(m, optimizer, inp)
+        self.assertEqual(m._local_step, 1)
+        self.assertEqual(manager.start_quorum.call_count, 1)
+        # No sync is due yet, so nothing may have been committed or reduced.
+        self.assertEqual(manager.should_commit.call_count, 0)
+        self.assertEqual(manager.allreduce_grad.call_count, 0)
+
+        _train_step(m, optimizer, inp)
+        self.assertEqual(m._local_step, 0)
+
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
+        self.assertEqual(manager.should_commit.call_count, 1)
+        # SimpleModel has two Linear layers, each with a weight and a bias.
+        self.assertEqual(manager.allreduce_grad.call_count, 4)
+
+    def test_local_sgd_commit_failure(self) -> None:
+        """A failed commit rolls the model back to the last synced weights."""
+        base_m = SimpleModel()
+        optimizer = optim.SGD(base_m.parameters())
+        manager = create_autospec(Manager)
+        manager.should_commit.return_value = False
+
+        m = LocalSGD(manager, base_m, sync_every=2)
+        og_state_dict = _copy_state_dict(base_m.state_dict())
+
+        inp = torch.rand(2, 3)
+
+        _train_step(m, optimizer, inp)
+        _train_step(m, optimizer, inp)
+        self.assertEqual(m._local_step, 0)
+        self.assertEqual(manager.should_commit.call_count, 1)
+
+        torch.testing.assert_close(_params_dict(base_m), og_state_dict)
+        torch.testing.assert_close(m._backup_parameters, og_state_dict)
+
+    def test_local_sgd_recovery(self) -> None:
+        """state_dict() exposes the last synced weights and round-trips on load."""
+        base_m = SimpleModel()
+        optimizer = optim.SGD(base_m.parameters())
+        manager = create_autospec(Manager)
+
+        m = LocalSGD(manager, base_m, sync_every=2)
+
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
+        og_state_dict = _copy_state_dict(base_m.state_dict())
+
+        inp = torch.rand(2, 3)
+
+        _train_step(m, optimizer, inp)
+        self.assertEqual(m._local_step, 1)
+
+        state_dict = m.state_dict()
+        torch.testing.assert_close(state_dict, m._backup_parameters)
+        torch.testing.assert_close(state_dict, og_state_dict)
+
+        m.load_state_dict(state_dict)
+        torch.testing.assert_close(_params_dict(base_m), state_dict)
+        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
diff --git a/torchft/manager.py b/torchft/manager.py
@@ -324,12 +324,11 @@ def start_quorum(self, allow_heal: bool = True) -> None:
         It's best practice to call this before the forwards pass of each step for
         performance as computing quorum may take some time.
 
-        If allow_heal is set, the manager will attempt to heal either
-        synchronously before returning or asynchronously prior to any network
-        calls.
-
         Args:
-            allow_heal: whether to allow healing at the beginning of the step
+            allow_heal: (experimental) whether to allow healing at the beginning of the step.
+                If allow_heal is set, the manager will attempt to heal either
+                synchronously before returning or asynchronously prior to any network
+                calls. All replicas must pass the same value to allow_heal.
         """
 
         # wait for previous quorum to complete