Commit 8e4bbfe ("lint and feedback")
1 parent: 20660dd

2 files changed: +24 -17 lines

examples/monarch/README.md (10 additions, 10 deletions)

```diff
@@ -1,23 +1,23 @@
-Monarch-TorchFT-TorchTitan Distributed Training Orchestrator
+### Monarch-TorchFT-TorchTitan Distributed Training Orchestrator

+#### Overview
 This script orchestrates fault-tolerant distributed training using TorchTitan and TorchMonarch
 frameworks. It manages multiple training replicas across SLURM-scheduled compute nodes
 with automatic failure recovery and TorchFT lighthouse coordination.

-PREREQUISITES:
+##### PREREQUISITES
 - Access to a SLURM cluster with GPU nodes
-- Environment with nightly TorchFT, TorchTitan, and Monarch libraries installed.
 - TorchTitan training configuration file in script directory (debug_model.toml)
 - A training dataset (c4_test) and tokenizer in script directory

-CONFIGURATION:
+##### CONFIGURATION
 Before running, update the cluster-specific constants:
 - MACHINE: TorchX named resource for your cluster (currently: "gpu.xlarge")
 - MACHINE_MEMORY: Memory per machine in MB (currently: 2062607)
 You can also override the resource configuration manually:
 - https://docs.pytorch.org/torchx/main/specs.html#resource

-USAGE:
+##### USAGE
 python train_distributed.py --help

 Basic usage with 2 replicas, each with 1 node and 8 GPUs:
@@ -30,21 +30,21 @@ USAGE:
 With remote TorchFT lighthouse:
 python train_distributed.py --remote-lighthouse

-KEY COMPONENTS:
+##### KEY COMPONENTS
 - LighthouseActor: Coordination server for fault tolerance
 - TrainingActor: Individual trainer processes
 - ReplicaActor: Manages groups of trainers
 - OrchestrationManager: Top-level orchestration and failure recovery

-FAILURE RECOVERY:
-- Automatic replica retry with configurable delays (PER_ATTEMPT_DELAY)
+##### FAILURE RECOVERY
+- Automatic retry with configurable delays (PER_ATTEMPT_DELAY)
 - New allocations after repeated failures (PROC_ATTEMPTS)
 - Maximum attempts per replica (MAX_ATTEMPT)

-OUTPUT:
+##### OUTPUT
 - Training outputs saved to ./outputs directory
 - Logs streamed from all distributed processes
 - TensorBoard metrics enabled by default

-CLEANUP:
+##### CLEANUP
 All SLURM jobs are automatically terminated at script completion.
```
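
The FAILURE RECOVERY section above names three tuning constants. A rough illustration of the retry policy they imply follows; the constant names come from the README, while the values and the control flow are assumptions, not code taken from train_distributed.py.

```python
# Illustrative retry loop for the policy described under FAILURE RECOVERY.
# Values and control flow are assumed; only the constant names come from the README.
import asyncio

PER_ATTEMPT_DELAY = 30  # seconds to wait between attempts (assumed value)
PROC_ATTEMPTS = 2       # failures tolerated before requesting a fresh allocation (assumed value)
MAX_ATTEMPT = 5         # hard cap on attempts per replica (assumed value)


async def run_with_retries(run_replica, reallocate, replica_id: int) -> None:
    for attempt in range(1, MAX_ATTEMPT + 1):
        try:
            await run_replica(replica_id)
            return
        except Exception:
            if attempt % PROC_ATTEMPTS == 0:
                # After repeated failures, tear down and request a new allocation.
                await reallocate(replica_id)
            await asyncio.sleep(PER_ATTEMPT_DELAY)
    raise RuntimeError(f"replica {replica_id} failed after {MAX_ATTEMPT} attempts")
```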

examples/monarch/train_distributed.py (14 additions, 7 deletions)

```diff
@@ -7,23 +7,20 @@

 import argparse
 import asyncio
+import atexit
 import os
-
 from copy import deepcopy
 from dataclasses import dataclass
 from typing import Dict
-import atexit

 import torch
-
 from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
 from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer
-from monarch.actor import Actor, current_rank, endpoint, ProcMesh, this_host
+from monarch.actor import Actor, ProcMesh, current_rank, endpoint, this_host
 from monarch.tools import commands
 from monarch.tools.components import hyperactor
 from monarch.tools.config import Config
 from monarch.utils import setup_env_for_distributed
-
 from torchtitan.config import ConfigManager, JobConfig
 from torchtitan.tools.logging import init_logger, logger
 from torchtitan.train import Trainer
@@ -73,7 +70,9 @@ def proc_mesh(
     ) -> ProcMesh:
         allocator = RemoteAllocator(
             world_id=MonarchSlurm.job_name_prefix,
-            initializer=TorchXRemoteAllocInitializer(f"slurm:///{cls.job_handles[mesh_name]}"),
+            initializer=TorchXRemoteAllocInitializer(
+                f"slurm:///{cls.job_handles[mesh_name]}"
+            ),
         )
         alloc = allocator.allocate(
             AllocSpec(AllocConstraints(), hosts=num_hosts, gpus=num_gpus)
```
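
For readers new to this API, a minimal sketch isolating the allocation pattern from the hunk above; the helper name and world id are illustrative, and the step that turns the allocation into a ProcMesh falls outside this hunk.

```python
# Sketch: the SLURM-backed allocation path shown in the hunk above, as a helper.
# RemoteAllocator, TorchXRemoteAllocInitializer, AllocSpec, and AllocConstraints are
# used as in the diff; the world id is a placeholder (the script uses
# MonarchSlurm.job_name_prefix), and the final allocation-to-ProcMesh conversion is
# not part of this hunk.
from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
from monarch._src.actor.allocator import RemoteAllocator, TorchXRemoteAllocInitializer


def slurm_allocation(job_handle: str, num_hosts: int, num_gpus: int):
    allocator = RemoteAllocator(
        world_id="example_world",  # placeholder world id
        initializer=TorchXRemoteAllocInitializer(f"slurm:///{job_handle}"),
    )
    # One process per GPU on each host of the existing SLURM job.
    return allocator.allocate(
        AllocSpec(AllocConstraints(), hosts=num_hosts, gpus=num_gpus)
    )
```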

```diff
@@ -84,13 +83,16 @@ def proc_mesh(

 # ==== allocation boilerplate ====

+
 class LighthouseActor(Actor):
     def __init__(self) -> None:
         self.lighthouse = None

     @endpoint
     def start_lighthouse(self) -> str:
+        # inline import because of https://github.com/meta-pytorch/monarch/issues/804
         from torchft.coordination import LighthouseServer
+
         self.lighthouse = LighthouseServer(
             bind="[::]:0", min_replicas=1, join_timeout_ms=10000
         )
```
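
As a rough illustration of how an actor like this is typically brought up, the sketch below spawns a single LighthouseActor locally and reads back its address; the spawn shape and the `.call_one()` endpoint call follow Monarch's actor API, but the exact usage here is an assumption rather than code from this file.

```python
# Sketch: start one LighthouseActor on the local host and return its address.
# this_host().spawn_procs(...) and endpoint .call_one() are Monarch actor APIs;
# the per_host shape and variable names are assumptions for illustration only.
from monarch.actor import this_host


async def bring_up_lighthouse() -> str:
    procs = this_host().spawn_procs(per_host={"gpus": 1})  # one local process
    lighthouse = procs.spawn("lighthouse", LighthouseActor)
    # start_lighthouse returns the address trainers use to join the quorum.
    return await lighthouse.start_lighthouse.call_one()
```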

```diff
@@ -217,7 +219,9 @@ async def start_training(self) -> None:
         )

         for replica_id in range(self.spec.replica_count):
-            await MonarchSlurm.get_or_create_job(f"replica_{replica_id}", self.spec.hosts_per_replica)
+            await MonarchSlurm.get_or_create_job(
+                f"replica_{replica_id}", self.spec.hosts_per_replica
+            )

         mesh_futures = {}
         for i in range(self.spec.replica_count):
```
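
The loop above creates (or reuses) one SLURM job per replica before meshes are built. A small sketch of that fan-out pattern follows; the job and mesh factories are passed in as callables because the real helpers are not shown in this hunk.

```python
# Sketch: one job per replica, then build all replica meshes concurrently.
# Both callables stand in for script internals (MonarchSlurm.get_or_create_job and
# the per-replica proc_mesh construction); they are parameters here, not real names.
import asyncio
from typing import Awaitable, Callable, Dict


async def build_replica_meshes(
    replica_count: int,
    hosts_per_replica: int,
    get_or_create_job: Callable[[str, int], Awaitable[None]],
    make_mesh: Callable[[int], Awaitable[object]],
) -> Dict[int, object]:
    for replica_id in range(replica_count):
        await get_or_create_job(f"replica_{replica_id}", hosts_per_replica)
    # Kick off mesh construction for every replica, then wait for all of them.
    tasks = {i: asyncio.create_task(make_mesh(i)) for i in range(replica_count)}
    return {i: await t for i, t in tasks.items()}
```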

```diff
@@ -305,6 +309,7 @@ async def _teardown(self, replica_id: int) -> None:

 # === CLI / CONFIG === #

+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Monarch-TorchFT Distributed Training Example"
@@ -398,6 +403,8 @@ def make_job_spec(args: argparse.Namespace) -> JobSpec:
         hosts_per_replica=args.host_per_replica,
         gpus_per_node=args.gpu_per_node,
     )
+
+
 # === CLI / CONFIG === #

```
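
For orientation, the fields referenced across these hunks (replica_count, hosts_per_replica, gpus_per_node) suggest a job spec along the lines below; this is a sketch shaped only by the visible hunks, and the real JobSpec in the script may carry additional fields.

```python
# Sketch of a job spec limited to the fields visible in this diff; the actual
# JobSpec dataclass in train_distributed.py may differ.
from dataclasses import dataclass


@dataclass
class JobSpecSketch:
    replica_count: int
    hosts_per_replica: int
    gpus_per_node: int


# Matches the README's basic example: 2 replicas, 1 host each, 8 GPUs per node.
spec = JobSpecSketch(replica_count=2, hosts_per_replica=1, gpus_per_node=8)
```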
