
Commit 83957be

option 2 - call work.wait inside wrapped work
1 parent a77b84b commit 83957be

File tree

3 files changed (+89, -54 lines)


torchft/local_sgd.py

Lines changed: 5 additions & 6 deletions
@@ -521,10 +521,6 @@ def _bucketize_and_allreduce(
             pack_offset += numel
             flat_index += 1
 
-        work = self._manager.allreduce(
-            flat_buffer, should_quantize=self.should_quantize
-        )
-
         def callback(fut: torch.futures.Future[torch.Tensor]) -> None:
             with torch.cuda.stream(self._stream) if self._stream else nullcontext():
                 nonlocal bucket_tensors, flat_buffer
@@ -535,8 +531,11 @@ def callback(fut: torch.futures.Future[torch.Tensor]) -> None:
                         flat_buffer[pack_offset : pack_offset + numel].view_as(t)
                     )
 
-        fut = work.get_future()
-        fut = fut.then(callback)
+        work = self._manager.allreduce(
+            flat_buffer,
+            should_quantize=self.should_quantize,
+            callback=callback,
+        )
 
         self._allreduce_work.append(work)
 

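Because the unpack callback is now handed to Manager.allreduce rather than chained onto the allreduce future at the call site, it only runs once the queued work is waited on. A minimal sketch of a consumer of self._allreduce_work; the loop below is illustrative and not part of this diff:

```python
# Illustrative only (not part of this diff): draining the queued allreduce
# work. With this commit, waiting on each wrapped work is what attaches and
# triggers the continuations: normalization, manager.wrap_future, and the
# unpack callback passed to allreduce() above.
for work in self._allreduce_work:
    work.wait()
self._allreduce_work.clear()
```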
torchft/manager.py

Lines changed: 84 additions & 32 deletions
@@ -38,13 +38,14 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar, cast
 
 import torch
+import torch.distributed as dist
 from torch.distributed import ReduceOp, TCPStore
 from torch.distributed.distributed_c10d import AllreduceOptions, ReduceOp, Work
 
 from torchft._torchft import ManagerClient, ManagerServer
 from torchft.checkpointing import CheckpointTransport, HTTPTransport
 from torchft.futures import future_timeout
-from torchft.work import _DummyWork, _WorkWrapper
+from torchft.work import _DummyWork
 
 if TYPE_CHECKING:
     from torchft.process_group import ProcessGroup
@@ -74,6 +75,7 @@
 QUORUM_RETRIES_ENV: str = "TORCHFT_QUORUM_RETRIES"
 
 T = TypeVar("T")
+type AllReduceCallback = Callable[[torch.futures.Future[torch.Tensor]], None]
 
 
 def get_timeout(
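The new AllReduceCallback alias (a PEP 695 type statement, which requires Python 3.12+) describes a function that receives the completed future for the reduced tensor and returns nothing. A hypothetical conforming callback:

```python
import torch


# Hypothetical example of a function matching the new AllReduceCallback alias:
# it receives the completed future for the reduced tensor and returns None.
def on_allreduce_done(fut: torch.futures.Future[torch.Tensor]) -> None:
    reduced = fut.value()
    print(f"allreduce finished, shape={tuple(reduced.shape)}")
```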
@@ -350,7 +352,12 @@ def shutdown(self, wait: bool = True) -> None:
         self._executor.shutdown(wait=wait)
 
     @torch.profiler.record_function("torchft::manager::allreduce")
-    def allreduce(self, tensor: torch.Tensor, should_quantize: bool = False) -> Work:
+    def allreduce(
+        self,
+        tensor: torch.Tensor,
+        should_quantize: bool = False,
+        callback: Optional[AllReduceCallback] = None,
+    ) -> Work:
         """
         Fault tolerant allreduce the tensor and return a Future that will be completed when
         the tensor is ready.
@@ -388,37 +395,8 @@ def allreduce(self, tensor: torch.Tensor, should_quantize: bool = False) -> Work
                 )
             else:
                 work = self._pg.allreduce([tensor], ReduceOp.SUM)
-                work.block_current_stream()
-
-            fut = work.get_future()
-
-            stream: Optional[torch.cuda.Stream] = (
-                torch.cuda.current_stream() if torch.cuda.is_available() else None
-            )
-
-            # schedule grad normalization as a continuation
-            # on the Future
-            @torch.profiler.record_function("torchft::manager::allreduce::callback")
-            def callback(
-                fut: torch.futures.Future[List[torch.Tensor]],
-            ) -> torch.Tensor:
-                nonlocal tensor, stream, num_participants
-
-                # change the stream to avoid making the callback stream
-                # dependent on process group stream running the allreduce
-                with torch.cuda.stream(stream) if stream is not None else nullcontext():
-                    # Setup stream dependency
-                    fut.wait()
-                    fut.value()
-                    tensor /= num_participants
-
-                return tensor
-
-            fut = fut.then(callback)
-
-            fut = self.wrap_future(fut, tensor)
 
-            return _WorkWrapper(work, fut)
+            return _WorkWrapper(work, self, tensor, num_participants, callback)
 
         except Exception as e:
             self._logger.exception(
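After this hunk, allreduce no longer builds the continuation chain eagerly; it wraps the process group work and returns immediately. A hedged usage sketch of the new signature, assuming an already-initialized torchft Manager named manager:

```python
import torch

# Assumes `manager` is an initialized torchft Manager; the tensor and callback
# below are illustrative.
grad = torch.ones(10)


def on_reduced(fut: torch.futures.Future[torch.Tensor]) -> None:
    _ = fut.value()  # optional continuation supplied by the caller


work = manager.allreduce(grad)                       # existing call, unchanged
work = manager.allreduce(grad, callback=on_reduced)  # new optional parameter

# Nothing is chained onto the underlying future yet: normalization by the
# number of participants, manager.wrap_future error handling, and the optional
# callback are all attached once the work is waited on.
work.wait()
```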
@@ -938,3 +916,77 @@ def warn(self, msg: str) -> None:
 
     def exception(self, msg: str) -> None:
         self._logger.exception(f"{self.prefix()} {msg}")
+
+
+class _WorkWrapper(dist._Work):
+    def __init__(
+        self,
+        work: dist._Work,
+        manager: Manager,
+        tensor: torch.Tensor,
+        num_participants: int,
+        callback: Optional[AllReduceCallback],
+    ) -> None:
+        super().__init__()
+        self._manager = manager
+        self._work = work
+        self._tensor = tensor
+        self._num_participants = num_participants
+        self._callback = callback
+
+        self._stream: Optional[torch.cuda.Stream] = (
+            torch.cuda.current_stream() if torch.cuda.is_available() else None
+        )
+
+    def _set_future_callback(
+        self,
+    ) -> None:
+        # schedule grad normalization as a continuation
+        # on the Future
+        @torch.profiler.record_function("torchft::manager::allreduce::callback")
+        def callback(
+            fut: torch.futures.Future[List[torch.Tensor]],
+        ) -> torch.Tensor:
+            # change the stream to avoid making the callback stream
+            # dependent on process group stream running the allreduce
+            with (
+                torch.cuda.stream(self._stream)
+                if self._stream is not None
+                else nullcontext()
+            ):
+                # Setup stream dependency
+                fut.wait()
+                self._tensor /= self._num_participants
+
+            return self._tensor
+
+        fut = self._work.get_future()
+        fut = fut.then(callback)
+        fut = self._manager.wrap_future(fut, self._tensor)
+        fut = fut.then(self._callback) if self._callback else fut
+
+    def wait(self, timeout: Optional[timedelta] = None) -> bool:
+        with (
+            torch.cuda.stream(self._stream)
+            if self._stream is not None
+            else nullcontext()
+        ):
+            self._work.wait()
+
+        self._set_future_callback()
+
+        return True
+
+    def block_current_stream(self, timeout: Optional[timedelta] = None) -> None:
+        with (
+            torch.cuda.stream(self._stream)
+            if self._stream is not None
+            else nullcontext()
+        ):
+            self._work.block_current_stream()
+
+        self._set_future_callback()
+
+    def get_future(self) -> torch.futures.Future[torch.Tensor]:
+        self.block_current_stream()
+        return self._work.get_future()
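To summarize the new wrapper's control flow: wait() and block_current_stream() first delegate to the underlying work under the CUDA stream captured when the wrapper was constructed, then attach the continuation chain via _set_future_callback(); get_future() routes through block_current_stream() and returns the underlying work's future. A hedged trace of what the diff above implies for a caller, with illustrative names:

```python
# Hedged trace; `manager`, `t`, and `my_callback` are illustrative names.
work = manager.allreduce(t, callback=my_callback)
# The wrapper captures torch.cuda.current_stream() here (None on CPU).

work.wait()
# 1. self._work.wait() runs under the captured stream (nullcontext on CPU).
# 2. _set_future_callback() then chains, in order:
#      a. t /= num_participants, on the captured stream
#      b. manager.wrap_future(fut, t) for fault-tolerant error handling
#      c. my_callback, because one was passed to allreduce()
#
# get_future() instead goes through block_current_stream(), attaches the same
# chain, and returns the underlying work's future.
```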

torchft/work.py

Lines changed: 0 additions & 16 deletions
@@ -18,19 +18,3 @@ def wait(self, timeout: Optional[timedelta] = None) -> bool:
 
     def get_future(self) -> torch.futures.Future[object]:
         return self.future_
-
-
-class _WorkWrapper(dist._Work):
-    def __init__(
-        self, work: dist._Work, fut: torch.futures.Future[torch.Tensor]
-    ) -> None:
-        super().__init__()
-        self._work = work
-        self._fut = fut
-
-    def wait(self, timeout: Optional[timedelta] = None) -> bool:
-        self._fut.wait()
-        return True
-
-    def get_future(self) -> torch.futures.Future[torch.Tensor]:
-        return self._fut
