We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 949a981, commit fef4abc (Copy full SHA for fef4abc)
train_diloco.py
@@ -34,7 +34,7 @@
34
ProcessGroupGloo,
35
ProcessGroupNCCL,
36
)
37
-from torchft.checkpointing.pg_transport import PGTransport
+from torchft.checkpointing.http_transport import HTTPTransport
38
from torchft.local_sgd import DiLoCo
39
40
logging.basicConfig(level=logging.INFO)
@@ -67,13 +67,12 @@ def state_dict():
67
timeout=timedelta(seconds=10),
68
69
if torch.cuda.is_available() and USE_NCCL
70
- else ProcessGroupGloo(timeout=timedelta(seconds=5))
+ else ProcessGroupGloo(timeout=timedelta(seconds=10))
71
72
73
- transport = PGTransport(
74
- pg,
+ transport = HTTPTransport(
75
76
- device=device,
+ num_chunks=0,
77
78
79
manager = Manager(
0 commit comments