Add HTA critical-path trace analysis script

tushar00jain · tushar00jain · commit 03702a4674b4 · 2025-08-19T10:39:08.000-07:00
Test Plan:
```
$ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000

$ USE_NCCL=True LOG_LEVEL=DEBUG RUST_LOG=error USE_STREAMING=True torchx run ./torchft/torchx.py:hsdp --script='train_diloco.py'

$ python trace_analysis.py
```
diff --git a/trace_analysis.py b/trace_analysis.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+from hta.trace_analysis import TraceAnalysis
+
+_PROFILES_DIR = "output/replica-0/profiles/step-120"  # per-step trace dir written by train_diloco.py
+
+
+def main():  # Run critical-path analysis on rank 0's trace and write the overlaid trace.
+    analyzer = TraceAnalysis(trace_dir=_PROFILES_DIR)  # loads traces from the hard-coded step dir
+    cp_graph, success = analyzer.critical_path_analysis(
+        rank=0, annotation="", instance_id=None  # NOTE(review): presumably "no annotation filter" - confirm in HTA docs
+    )
+    if not success:  # no overlay is written when the analysis reports failure
+        print("Critical path analysis failed")
+        return
+    analyzer.overlay_critical_path_analysis(0, cp_graph, output_dir=_PROFILES_DIR)
+
+
+if __name__ == "__main__":  # run the analysis only when executed as a script, not on import
+    main()
diff --git a/train_diloco.py b/train_diloco.py
@@ -43,6 +43,7 @@
 @record
 def main() -> None:
     REPLICA_GROUP_ID = int(os.environ.get("REPLICA_GROUP_ID", 0))
+    RANK = int(os.environ.get("RANK", 0))
     RUN = int(os.environ.get("RUN", 0))
 
     output_folder = f"output/replica-{REPLICA_GROUP_ID}"
@@ -177,11 +178,11 @@ def forward(self, x):
     print(f"Total number of parameters: {num_params}")
 
     def trace_handler(p):
-        dir = f"{output_folder}/profiles"
+        dir = f"{output_folder}/profiles/step-{p.step_num}"  # one directory per profiler step
         if not os.path.exists(dir):
             os.makedirs(dir, exist_ok=True)
 
-        p.export_chrome_trace(f"{dir}/step-{p.step_num}.json")
+        p.export_chrome_trace(f"{dir}/rank-{RANK}.json")  # one trace file per rank inside the step dir
 
     # You can use an epoch based training but with faults it's easier to use step
     # based training.
@@ -190,6 +191,9 @@ def trace_handler(p):
         on_trace_ready=trace_handler,
         record_shapes=False,
         profile_memory=False,
+        experimental_config=torch.profiler._ExperimentalConfig(  # type: ignore
+            enable_cuda_sync_events=True
+        ),
     )
 
     prof.start()