2020import threading
2121from abc import ABC
2222from datetime import timedelta
23- from typing import Callable , List , Optional , Tuple , Type
23+ from typing import Callable , List , Optional , Tuple , Type , TYPE_CHECKING
2424
2525import torch
2626import torch .distributed as dist
4444
4545from torch .futures import Future
4646
47+ if TYPE_CHECKING :
48+ from torchft .manager import Manager
49+
4750logger = logging .getLogger (__name__ )
4851
4952# TODO: use non strings which are cheaper
@@ -177,18 +180,25 @@ def unregister(self) -> None:
177180 """
178181 dist .destroy_process_group (self )
179182
183+ def __repr__ (self ) -> str :
184+ return f"{ self .__class__ .__name__ } ()"
185+
180186
181187class ProcessGroupWrapper (ProcessGroup ):
182188 PG_CLASS : Type [BaseProcessGroup ]
183189 """
184190 This is a wrapper around any ProcessGroup with a reconfiguration method.
185191 """
186192
187- def __init__ (self ) -> None :
193+ def __init__ (self , pg : Optional [ ProcessGroup ] = None ) -> None :
188194 super ().__init__ (0 , 1 )
189- self ._pg = None
195+ self ._pg = pg
190196
191197 def configure (self , store_addr : str , rank : int , world_size : int ) -> None :
198+ if isinstance (self ._pg , ProcessGroup ):
199+ self ._pg .configure (store_addr , rank , world_size )
200+ return
201+
192202 if self ._pg is not None :
193203 if hasattr (self ._pg , "abort" ):
194204 self ._pg .abort ()
@@ -216,6 +226,12 @@ def broadcast(self, tensor_list: List[torch.Tensor], opts: object) -> Work:
216226 def size (self ) -> int :
217227 return self ._pg .size ()
218228
229+ def parent (self ) -> ProcessGroup :
230+ return self ._pg
231+
232+ def __repr__ (self ) -> str :
233+ return f"{ self .__class__ .__name__ } (pg={ self ._pg } )"
234+
219235
220236class ProcessGroupGloo (ProcessGroupWrapper ):
221237 """
@@ -252,7 +268,7 @@ def __init__(self, result):
252268 self .future_ = torch .futures .Future ()
253269 self .future_ .set_result (result )
254270
255- def wait (self , timeout ):
271+ def wait (self , timeout = None ):
256272 return True
257273
258274 def get_future (self ):
@@ -278,6 +294,10 @@ def __init__(self, rank: int, world: int) -> None:
278294 self .wait_count = 0
279295 self .get_future_count = 0
280296 self ._work = []
297+ self .configure_count = 0
298+
299+ def configure (self , store_addr : str , rank : int , world_size : int ) -> None :
300+ self .configure_count += 1
281301
282302 def broadcast (self , tensor_list , opts ):
283303 res = _DummyWork (tensor_list )
@@ -304,6 +324,102 @@ def getBackendName(self):
304324 return "torchft-dummy"
305325
306326
327+ class _ErrorSwallowingWork (Work ):
328+ def __init__ (
329+ self ,
330+ pg : "ErrorSwallowingProcessGroup" ,
331+ work : Work ,
332+ default_result : object ,
333+ ):
334+ super ().__init__ ()
335+
336+ self ._pg = pg
337+ self ._work = work
338+ self ._default_result = default_result
339+
340+ def wait (self , timeout = None ) -> bool :
341+ try :
342+ self ._work .wait ()
343+ except Exception as e :
344+ self ._pg .report_error (e )
345+
346+ return True
347+
348+ def get_future (self ) -> Future :
349+ fut = self ._work .get_future ()
350+
351+ # schedule error handling as a continuation on the Future
352+ def callback (
353+ fut : torch .futures .Future [List [torch .Tensor ]],
354+ ) -> torch .futures .Future [torch .Tensor ]:
355+ try :
356+ return fut .value ()
357+ except Exception as e :
358+ logger .exception (f"got exception in future -- skipping remaining: { e } " )
359+ self ._pg .report_error (e )
360+ return self ._default_result
361+
362+ fut = fut .then (callback )
363+ return fut
364+
365+
class ErrorSwallowingProcessGroupWrapper(ProcessGroupWrapper):
    """
    This is a wrapper around any ProcessGroup that will swallow errors and
    return dummy results on error.

    This is intended to allow handling errors outside of the training loop to
    avoid having to modify modeling code to support error handling.

    After an error occurs all future operations will be skipped until the
    process group is reconfigured via ``configure``.
    """

    def __init__(self, pg: "ProcessGroup") -> None:
        super().__init__(pg)

        # Last reported error; while set, collectives short-circuit to
        # dummy results until configure() clears it.
        self._error: Optional[Exception] = None

    def configure(self, store_addr: str, rank: int, world_size: int) -> None:
        """Reconfigure the wrapped group; clears any previously reported error."""
        self._error = None

        super().configure(store_addr, rank, world_size)

    def report_error(self, e: Exception) -> None:
        """
        Report an error to this process group. This will cause all future
        operations to be skipped until the process group is reconfigured via
        ``configure``.

        Args:
            e: exception to report
        """
        self._error = e

    def error(self) -> Optional[Exception]:
        """
        Returns the error that was reported to this process group.

        Returns:
            exception that was reported
        """
        return self._error

    def allreduce(self, tensors: List[torch.Tensor], opts: object) -> "Work":
        """Allreduce that degrades to a no-op dummy once an error is seen."""
        # Skip the collective entirely if a prior operation already failed.
        if self._error is not None:
            return _DummyWork(tensors)

        try:
            # Wrap the real work so its failures are swallowed and reported.
            return _ErrorSwallowingWork(
                self,
                super().allreduce(tensors, opts),
                tensors,
            )
        except Exception as e:
            self.report_error(e)
            return _DummyWork(tensors)
422+
307423class _BabyWork (Work ):
308424 def __init__ (
309425 self ,
0 commit comments