2 files changed: 32 additions, 2 deletions.
New file (script that runs HTA critical path analysis on the collected traces):

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+from hta.trace_analysis import TraceAnalysis
+
+_PROFILES_DIR = "output/replica-0/profiles/step-120"
+
+
+def main():
+    analyzer = TraceAnalysis(trace_dir=_PROFILES_DIR)
+    cp_graph, success = analyzer.critical_path_analysis(
+        rank=0, annotation="", instance_id=None
+    )
+    if not success:
+        print("Critical path analysis failed")
+        return
+    analyzer.overlay_critical_path_analysis(0, cp_graph, output_dir=_PROFILES_DIR)
+
+
+if __name__ == "__main__":
+    main()
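The script above hard-codes rank 0 and the step-120 profile directory. As a rough sketch using the same HTA calls (not part of this change; the --trace-dir/--rank flags and the run_critical_path name are made up for illustration), the analysis could be parameterized instead of edited per run:

import argparse

from hta.trace_analysis import TraceAnalysis


def run_critical_path(trace_dir: str, rank: int) -> None:
    # Load the traces found in trace_dir (one JSON per rank, as exported by
    # trace_handler in the training script) and compute one rank's critical path.
    analyzer = TraceAnalysis(trace_dir=trace_dir)
    cp_graph, success = analyzer.critical_path_analysis(
        rank=rank, annotation="", instance_id=None
    )
    if not success:
        print(f"Critical path analysis failed for rank {rank}")
        return
    # Write the overlaid trace next to the originals so it can be opened
    # in a trace viewer with the critical path highlighted.
    analyzer.overlay_critical_path_analysis(rank, cp_graph, output_dir=trace_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--trace-dir", default="output/replica-0/profiles/step-120")
    parser.add_argument("--rank", type=int, default=0)
    args = parser.parse_args()
    run_critical_path(args.trace_dir, args.rank)

The overlay call should leave an annotated trace in output_dir, alongside the per-rank traces it was computed from.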
Modified file (training script):

@@ -43,6 +43,7 @@
 @record
 def main() -> None:
     REPLICA_GROUP_ID = int(os.environ.get("REPLICA_GROUP_ID", 0))
+    RANK = int(os.environ.get("RANK", 0))
     RUN = int(os.environ.get("RUN", 0))
 
     output_folder = f"output/replica-{REPLICA_GROUP_ID}"
@@ -177,11 +178,11 @@ def forward(self, x):
     print(f"Total number of parameters: {num_params}")
 
     def trace_handler(p):
-        dir = f"{output_folder}/profiles"
+        dir = f"{output_folder}/profiles/step-{p.step_num}"
         if not os.path.exists(dir):
             os.makedirs(dir, exist_ok=True)
 
-        p.export_chrome_trace(f"{dir}/step-{p.step_num}.json")
+        p.export_chrome_trace(f"{dir}/rank-{RANK}.json")
 
     # You can use an epoch based training but with faults it's easier to use step
     # based training.
@@ -190,6 +191,9 @@ def trace_handler(p):
         on_trace_ready=trace_handler,
         record_shapes=False,
         profile_memory=False,
+        experimental_config=torch.profiler._ExperimentalConfig(  # type: ignore
+            enable_cuda_sync_events=True
+        ),
     )
 
     prof.start()
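Putting the training-script changes together, here is a minimal, self-contained sketch of how the per-step, per-rank traces read by the analysis script get produced. The schedule values and the bare training loop are illustrative assumptions; only the RANK lookup, the trace_handler body, and the experimental_config argument mirror the diff above:

import os

import torch

REPLICA_GROUP_ID = int(os.environ.get("REPLICA_GROUP_ID", 0))
RANK = int(os.environ.get("RANK", 0))
output_folder = f"output/replica-{REPLICA_GROUP_ID}"


def trace_handler(p):
    # One directory per profiled step, one Chrome trace per rank, matching the
    # layout the analysis script expects (e.g. output/replica-0/profiles/step-120).
    dir = f"{output_folder}/profiles/step-{p.step_num}"
    os.makedirs(dir, exist_ok=True)
    p.export_chrome_trace(f"{dir}/rank-{RANK}.json")


prof = torch.profiler.profile(
    # Illustrative schedule: capture 2 active steps out of every 30.
    schedule=torch.profiler.schedule(wait=20, warmup=8, active=2),
    on_trace_ready=trace_handler,
    record_shapes=False,
    profile_memory=False,
    # Record CUDA synchronization events in the exported trace.
    experimental_config=torch.profiler._ExperimentalConfig(  # type: ignore
        enable_cuda_sync_events=True
    ),
)

prof.start()
for step in range(100):
    # ... one training step here ...
    prof.step()
prof.stop()

The enable_cuda_sync_events flag makes the trace include CUDA synchronization events, which HTA's critical path analysis uses to recover CPU-GPU dependencies.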