3 files changed +15 -7
lines changed Original file line number Diff line number Diff line change @@ -13,7 +13,7 @@ Easy Per Step Fault Tolerance for PyTorch
1313 | <a href =" https://pytorch.org/torchft/ " ><b >Documentation</b ></a >
1414 | <a href =" https://github.com/pytorch-labs/torchft/blob/main/media/fault_tolerance_poster.pdf " ><b >Poster</b ></a >
1515 | <a href =" https://docs.google.com/document/d/1OZsOsz34gRDSxYXiKkj4WqcD9x0lP9TcsfBeu_SsOY4/edit " ><b >Design Doc</b ></a >
16- |
16+ |
1717</p >
1818<p align =" center " >
1919 <a href =" https://pypi.org/project/torchft-nightly/ " ><img alt =" PyPI - Version " src =" https://img.shields.io/pypi/v/torchft-nightly " ></a >
@@ -98,7 +98,7 @@ when using synchronous training.
9898You can start a lighthouse server by running:
9999
100100``` sh
101- $ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 1000
101+ $ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000
102102```
103103
104104### Example Training Loop (DDP)
@@ -108,7 +108,7 @@ See [train_ddp.py](./train_ddp.py) for the full example.
108108Invoke with:
109109
110110``` sh
111- $ TORCHFT_MANAGER_PORT=29512 TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port 29501 --nnodes 1 --nproc_per_node 1 train.py
111+ $ TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port 29501 --nnodes 1 --nproc_per_node 1 train.py
112112```
113113
114114train.py:
Original file line number Diff line number Diff line change @@ -77,7 +77,7 @@ pub struct LighthouseOpt {
7777 #[ structopt(
7878 long = "join_timeout_ms" ,
7979 default_value = "60000" ,
80- help = "How long to wait for new replicas to join before considering a quorum"
80+ help = "How long to wait for heartbeating stragglers to join before issuing quorum"
8181 ) ]
8282 pub join_timeout_ms : u64 ,
8383
@@ -90,14 +90,14 @@ pub struct LighthouseOpt {
9090 #[ structopt(
9191 long = "quorum_tick_ms" ,
9292 default_value = "100" ,
93- help = "How frequently to check for quorum when waiting for workers ."
93+ help = "How frequently to check for quorum when waiting for stragglers ."
9494 ) ]
9595 pub quorum_tick_ms : u64 ,
9696
9797 #[ structopt(
9898 long = "heartbeat_timeout_ms" ,
9999 default_value = "5000" ,
100- help = "how long to wait for a heartbeat before considering a replica dead."
100+ help = "How long to wait for a heartbeat before considering a replica dead."
101101 ) ]
102102 pub heartbeat_timeout_ms : u64 ,
103103}
Original file line number Diff line number Diff line change 77import logging
88import os
99import sys
10+ from datetime import timedelta
1011
1112import torch
1213import torch .nn .functional as F
@@ -70,14 +71,21 @@ def state_dict():
7071 }
7172
7273 device = "cuda" if torch .cuda .is_available () else "cpu"
73- pg = ProcessGroupBabyNCCL () if torch .cuda .is_available () else ProcessGroupGloo ()
74+ pg = (
75+ ProcessGroupBabyNCCL (
76+ timeout = timedelta (seconds = 5 ),
77+ )
78+ if torch .cuda .is_available ()
79+ else ProcessGroupGloo (timeout = timedelta (seconds = 5 ))
80+ )
7481
7582 manager = Manager (
7683 pg = pg ,
7784 min_replica_size = 1 ,
7885 load_state_dict = load_state_dict ,
7986 state_dict = state_dict ,
8087 replica_id = f"train_ddp_{ REPLICA_GROUP_ID } " ,
88+ timeout = timedelta (seconds = 10 ),
8189 )
8290
8391 class Net (nn .Module ):
You can’t perform that action at this time.
0 commit comments