
Commit 43ef175

manager_integ_tests: added multi rank recovery
1 parent a52d746 commit 43ef175

File tree

2 files changed: +123 -32 lines changed

  torchft/manager.py
  torchft/manager_integ_test.py

torchft/manager.py

Lines changed: 6 additions & 3 deletions
@@ -33,7 +33,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from datetime import timedelta
 from enum import Enum
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, TypeVar, cast
+from typing import Callable, cast, Dict, List, Optional, TYPE_CHECKING, TypeVar
 
 import torch
 from torch.distributed import ReduceOp, TCPStore
@@ -374,16 +374,19 @@ def _async_quorum(self) -> None:
             self._participating_rank = None
 
         if quorum_id != self._quorum_id:
-            logger.info(f"{replica_rank=} reconfiguring for quorum_id {quorum_id}")
             store_prefixed_addr = f"{store_address}/torchft/{quorum_id}/{self._rank}"
+
+            logger.info(
+                f"{replica_rank=} reconfiguring for {quorum_id=} {store_prefixed_addr=}"
+            )
             # We use the replica rank and world as we want all replicas in the PG.
             self._pg.configure(store_prefixed_addr, replica_rank, replica_world_size)
             self._quorum_id = quorum_id
 
         # See manager.rs for healing conditions
         if heal:
             self._healing = True
-            logger.info(f"{replica_rank}= healing required")
+            logger.info(f"{replica_rank=} healing required")
 
         logger.info(f"fetching checkpoint server address from {address}")
         primary_client = ManagerClient(address, timeout=self._timeout)
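Note: the reconfigure path now builds and logs a store prefix that includes both the quorum id and the local rank, so keys written to the shared TCPStore by different quorums and ranks cannot collide. A minimal sketch of the prefix scheme in plain Python, just mirroring the f-string above (the example values are assumptions, not torchft API):

    store_address = "localhost:29500"  # assumed example value
    rank = 1

    # Each new quorum gets its own namespace on the shared store.
    for quorum_id in (7, 8):
        store_prefixed_addr = f"{store_address}/torchft/{quorum_id}/{rank}"
        print(store_prefixed_addr)
    # localhost:29500/torchft/7/1
    # localhost:29500/torchft/8/1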

torchft/manager_integ_test.py

Lines changed: 117 additions & 29 deletions
@@ -1,6 +1,7 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+from concurrent.futures import as_completed, ThreadPoolExecutor
 from contextlib import ExitStack
-from typing import Dict, Set, Tuple
+from typing import Dict, List, Set, Tuple
 from unittest import TestCase
 
 import torch
@@ -32,32 +33,74 @@ class InjectedFailure(Exception):
 
 class FailureInjector:
     def __init__(self) -> None:
-        self._failures: Set[int] = set()
+        self._lock = threading.Lock()
+        self._failures: Set[Tuple[int, int]] = set()
         self.count = 0
 
-    def fail_at(self, step: int) -> "FailureInjector":
-        self._failures.add(step)
-        return self
+    def fail_at(self, rank: int, step: int) -> "FailureInjector":
+        with self._lock:
+            self._failures.add((rank, step))
+            return self
 
-    def check(self, step: int) -> None:
-        if step in self._failures:
-            self.count += 1
-            self._failures.remove(step)
-            print(f"injecting failure {step=}")
-            raise InjectedFailure(f"injected failure {step=}")
+    def check(self, rank: int, step: int) -> None:
+        with self._lock:
+            key = (rank, step)
+            if key in self._failures:
+                self.count += 1
+                self._failures.remove(key)
+                print(f"injecting failure {rank=} {step=}")
+                raise InjectedFailure(f"injected failure {rank=} {step=}")
+
+
+def replica_main(
+    replica_id: int,
+    lighthouse_address: str,
+    failure_injector: FailureInjector,
+    world_size: int,
+) -> List[Dict[str, Dict[str, object]]]:
+    store = dist.TCPStore(
+        host_name="localhost",
+        port=0,
+        is_master=True,
+        wait_for_workers=False,
+    )
+
+    with ThreadPoolExecutor(
+        max_workers=world_size, thread_name_prefix=f"replica{replica_id}"
+    ) as executor:
+        futures = []
+        for rank in range(world_size):
+            futures.append(
+                executor.submit(
+                    train_loop,
+                    replica_id,
+                    lighthouse_address,
+                    failure_injector=failure_injector,
+                    rank=rank,
+                    world_size=world_size,
+                    store_port=store.port,
+                )
+            )
+
+        return [fut.result() for fut in as_completed(futures)]
 
 
 def worker_manager(
     replica_id: int,
     lighthouse_address: str,
     failure_injector: FailureInjector,
     attempts: int = 3,
-) -> Dict[str, Dict[str, object]]:
+    world_size: int = 1,
+) -> List[Dict[str, Dict[str, object]]]:
+
     for i in range(attempts):
         try:
-            print(f"starting worker {replica_id} attempt {i}")
-            return train_loop(
-                replica_id, lighthouse_address, failure_injector=failure_injector
+            print(f"starting replica group {replica_id=} {world_size=} attempt {i}")
+            return replica_main(
+                replica_id,
+                lighthouse_address,
+                failure_injector=failure_injector,
+                world_size=world_size,
             )
         except InjectedFailure as e:
             print("got injected failure", i, e)
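Note: the new replica_main owns one TCPStore per replica group and fans out into world_size threads that each run train_loop against the shared store port. A rough standalone sketch of that fan-out pattern, with a stub worker standing in for train_loop (none of the names below are torchft API):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    import torch.distributed as dist


    def stub_worker(rank: int, world_size: int, store_port: int) -> str:
        # A real worker would build its ProcessGroup/Manager here, pointing
        # every rank of the group at the same store via store_port.
        return f"rank {rank}/{world_size} using store port {store_port}"


    def group_main(world_size: int = 2) -> list:
        # One store per replica group; only its port is handed to the ranks.
        store = dist.TCPStore(
            host_name="localhost",
            port=0,
            is_master=True,
            wait_for_workers=False,
        )
        with ThreadPoolExecutor(max_workers=world_size) as executor:
            futures = [
                executor.submit(stub_worker, rank, world_size, store.port)
                for rank in range(world_size)
            ]
            return [fut.result() for fut in as_completed(futures)]


    print(group_main())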
@@ -69,15 +112,14 @@ def worker_manager(
 
 
 def train_loop(
-    replica_id: int, lighthouse_address: str, failure_injector: FailureInjector
+    replica_id: int,
+    lighthouse_address: str,
+    failure_injector: FailureInjector,
+    rank: int,
+    world_size: int,
+    store_port: int,
 ) -> Dict[str, Dict[str, object]]:
     with ExitStack() as stack:
-        store = dist.TCPStore(
-            host_name="localhost",
-            port=0,
-            is_master=True,
-            wait_for_workers=False,
-        )
 
         def load_state_dict(state_dict: Dict[str, Dict[str, object]]) -> None:
             m.load_state_dict(state_dict["model"])
@@ -89,6 +131,8 @@ def state_dict() -> Dict[str, Dict[str, object]]:
                 "optim": optimizer.state_dict(),
             }
 
+        print(f"worker {replica_id=} {rank=} {world_size=} starting")
+
         pg = ProcessGroupGloo()
         manager = Manager(
             pg=pg,
@@ -97,9 +141,9 @@ def state_dict() -> Dict[str, Dict[str, object]]:
             state_dict=state_dict,
             replica_id=str(replica_id),
             store_addr="localhost",
-            store_port=store.port,
-            rank=0,
-            world_size=1,
+            store_port=store_port,
+            rank=rank,
+            world_size=world_size,
             lighthouse_addr=lighthouse_address,
             port=19530 + replica_id,
         )
@@ -112,7 +156,9 @@ def state_dict() -> Dict[str, Dict[str, object]]:
         criterion = nn.CrossEntropyLoss()
 
         while True:
-            print(f"worker {replica_id} starting step {manager.current_step()}")
+            print(
+                f"worker {replica_id=} {rank=} {world_size=} starting step {manager.current_step()}"
+            )
             inputs = torch.rand(2, 3)
             labels = torch.randint(4, (2,))
 
@@ -126,7 +172,7 @@ def state_dict() -> Dict[str, Dict[str, object]]:
             if manager.current_step() >= 5:
                 break
 
-            failure_injector.check(manager.current_step())
+            failure_injector.check(rank, manager.current_step())
 
         # return state_dict so we can check consistency
         return state_dict()
@@ -173,7 +219,7 @@ def test_ddp_recovery(self) -> None:
 
         failure_injectors = [
             FailureInjector(),
-            FailureInjector().fail_at(2),
+            FailureInjector().fail_at(0, 2),
         ]
 
         with ThreadPoolExecutor(max_workers=num_replicas) as executor:
@@ -200,3 +246,45 @@ def test_ddp_recovery(self) -> None:
             torch.testing.assert_close(state_dict, state_dicts[0])
 
         self.assertEqual(failure_injectors[1].count, 1)
+
+    def test_ddp_recovery_multi_rank(self) -> None:
+        lighthouse = Lighthouse(
+            bind="[::]:0",
+            min_replicas=2,
+        )
+        num_replicas = 2
+        world_size = 2
+        futures = []
+
+        failure_injectors = [
+            FailureInjector(),
+            FailureInjector().fail_at(0, 2).fail_at(1, 2),
+        ]
+
+        with ThreadPoolExecutor(max_workers=num_replicas) as executor:
+            for replica_id, failure_injector in zip(
+                range(num_replicas), failure_injectors
+            ):
+                futures.append(
+                    executor.submit(
+                        worker_manager,
+                        replica_id,
+                        lighthouse.address(),
+                        failure_injector=failure_injector,
+                        world_size=world_size,
+                    )
+                )
+
+            state_dicts = []
+
+            for fut in as_completed(futures):
+                try:
+                    state_dicts.append(fut.result())
+                except Exception as e:
+                    print(e)
+                    raise
+
+        lighthouse.shutdown()
+
+        for state_dict in state_dicts:
+            torch.testing.assert_close(state_dict, state_dicts[0])
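Note: in the new test, both ranks of the second replica group are told to fail at the same step, so the whole group goes down together and worker_manager's retry loop has to bring it back; that is the multi rank recovery being exercised, and the final assert checks the recovered group converges to the same state as the healthy one. Read in isolation (assuming the test module is importable from a torchft checkout), the failure schedule is just:

    from torchft.manager_integ_test import FailureInjector

    # Fail rank 0 and rank 1 of this replica group at step 2; the other
    # replica group (an empty FailureInjector) keeps running while this one
    # restarts and heals.
    injector = FailureInjector().fail_at(0, 2).fail_at(1, 2)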
