# _registrations.py (8.7 KB)
  1. # flake8: noqa: B950
  2. from ._internal import register_artifact, register_log
  3. DYNAMIC = [
  4. "torch.fx.experimental.symbolic_shapes",
  5. "torch.fx.experimental.sym_node",
  6. "torch.fx.experimental.recording",
  7. ]
  8. DISTRIBUTED = [
  9. "torch.distributed",
  10. "torch._dynamo.backends.distributed",
  11. "torch.nn.parallel.distributed",
  12. ]
  13. register_log(
  14. "async_compile",
  15. [
  16. "torch._inductor.async_compile",
  17. "torch._inductor.compile_worker.tracked_process_pool",
  18. ],
  19. )
  20. register_log(
  21. "cache", ("torch._inductor.remote_cache", "torch._inductor.fb.remote_cache")
  22. )
  23. register_log("dynamo", ["torch._dynamo", *DYNAMIC])
  24. register_log("fake_tensor", ["torch._subclasses.fake_tensor"])
  25. register_log("aot", ["torch._functorch.aot_autograd", "torch._functorch._aot_autograd"])
  26. register_log("autograd", "torch.autograd")
  27. register_log("inductor", ["torch._inductor", "torch._inductor.cudagraph_trees"])
  28. register_artifact(
  29. "cudagraphs",
  30. "Logs information from wrapping inductor generated code with cudagraphs.",
  31. )
  32. register_log("dynamic", DYNAMIC)
  33. register_log("torch", "torch")
  34. register_log("distributed", DISTRIBUTED)
  35. register_log(
  36. "c10d", ["torch.distributed.distributed_c10d", "torch.distributed.rendezvous"]
  37. )
  38. register_log(
  39. "ddp", ["torch.nn.parallel.distributed", "torch._dynamo.backends.distributed"]
  40. )
  41. register_log("pp", ["torch.distributed.pipelining"])
  42. register_log("fsdp", ["torch.distributed.fsdp", "torch.distributed._composable.fsdp"])
  43. register_log("dtensor", ["torch.distributed._tensor", "torch.distributed.tensor"])
  44. register_log("onnx", "torch.onnx")
  45. register_log(
  46. "export",
  47. [
  48. "torch._dynamo",
  49. "torch.export",
  50. "torch.export.dynamic_shapes",
  51. *DYNAMIC,
  52. "torch._export.converter",
  53. "torch._export.non_strict_utils",
  54. "torch._export.serde.serialize",
  55. "torch.fx.experimental.proxy_tensor",
  56. ],
  57. )
  58. register_artifact(
  59. "guards",
  60. "This prints the guards for every compiled Dynamo frame. It does not tell you where the guards come from.",
  61. visible=True,
  62. )
  63. register_artifact("verbose_guards", "", off_by_default=True)
  64. register_artifact(
  65. "bytecode",
  66. "Prints the original and modified bytecode from Dynamo. Mostly useful if you're debugging our bytecode generation in Dynamo.",
  67. off_by_default=True,
  68. )
  69. register_artifact(
  70. "graph",
  71. "Prints the dynamo traced graph (prior to AOTDispatch) in a table. If you prefer python code use `graph_code` instead. ",
  72. )
  73. register_artifact("graph_code", "Like `graph`, but gives you the Python code instead.")
  74. register_artifact(
  75. "graph_code_verbose",
  76. "Verbose FX pass logs, e.g. from tensorify_python_scalars and runtime_assert.",
  77. )
  78. register_artifact(
  79. "graph_sizes", "Prints the sizes of all FX nodes in the dynamo graph."
  80. )
  81. register_artifact(
  82. "trace_source",
  83. "As we execute bytecode, prints the file name / line number we are processing and the actual source code. Useful with `bytecode`",
  84. )
  85. register_artifact(
  86. "trace_call",
  87. "Like trace_source, but it will give you the per-expression blow-by-blow if your Python is recent enough.",
  88. )
  89. register_artifact(
  90. "trace_bytecode",
  91. "As we trace bytecode, prints the instruction and the current stack.",
  92. )
  93. register_artifact(
  94. "aot_graphs",
  95. "Prints the FX forward and backward graph generated by AOTDispatch, after partitioning. Useful to understand what's being given to Inductor",
  96. visible=True,
  97. )
  98. register_artifact(
  99. "aot_joint_graph",
  100. "Print FX joint graph from AOTAutograd, prior to partitioning. Useful for debugging partitioning",
  101. )
  102. register_artifact(
  103. "aot_graphs_effects",
  104. "Prints the FX forward and backward graph generated by AOTDispatch, useful for debugging effects processing.",
  105. visible=True,
  106. )
  107. register_artifact(
  108. "pre_grad_graphs",
  109. "Prints the FX graph before inductor pre grad passes. Useful to understand what's being given to Inductor before grad passes",
  110. )
  111. register_artifact(
  112. "post_grad_graphs",
  113. "Prints the FX graph generated by post grad passes. Useful to understand what's being given to Inductor after post grad passes",
  114. )
  115. register_artifact(
  116. "ir_pre_fusion",
  117. "Prints the IR before inductor fusion passes.",
  118. off_by_default=True,
  119. )
  120. register_artifact(
  121. "ir_post_fusion",
  122. "Prints the IR after inductor fusion passes.",
  123. off_by_default=True,
  124. )
  125. register_artifact(
  126. "compiled_autograd",
  127. "Prints various logs in compiled_autograd, including but not limited to the graphs. Useful for debugging compiled_autograd.",
  128. visible=True,
  129. )
  130. register_artifact(
  131. "compiled_autograd_verbose",
  132. "Will affect performance. Prints compiled_autograd logs with C++ info e.g. autograd node -> fx node mapping",
  133. off_by_default=True,
  134. )
  135. register_artifact(
  136. "ddp_graphs",
  137. "Only relevant for compiling DDP. DDP splits into multiple graphs to trigger comms early. This will print each individual graph here.",
  138. )
  139. register_artifact(
  140. "recompiles",
  141. "Prints the reason why we recompiled a graph. Very, very useful.",
  142. visible=True,
  143. )
  144. register_artifact(
  145. "recompiles_verbose",
  146. "Prints all guard checks that fail during a recompilation. "
  147. "At runtime, Dynamo will stop at the first failed check for each failing guard. "
  148. "So not all logged failing checks are actually ran by Dynamo.",
  149. visible=True,
  150. off_by_default=True,
  151. )
  152. register_artifact(
  153. "graph_breaks",
  154. "Prints whenever Dynamo decides that it needs to graph break (i.e. create a new graph). Useful for debugging why torch.compile has poor performance",
  155. visible=True,
  156. )
  157. register_artifact(
  158. "side_effects",
  159. "Prints all side effects that Dynamo codegenerates, including mutations to variables, attributes, cells, and globals. Useful for debugging side effect handling",
  160. visible=True,
  161. )
  162. register_artifact(
  163. "not_implemented",
  164. "Prints log messages whenever we return NotImplemented in a multi-dispatch, letting you trace through each object we attempted to dispatch to",
  165. )
  166. register_artifact(
  167. "output_code",
  168. "Prints the code that Inductor generates (either Triton or C++)",
  169. off_by_default=True,
  170. visible=True,
  171. )
  172. register_artifact(
  173. "kernel_code",
  174. "Prints the code that Inductor generates (on a per-kernel basis)",
  175. off_by_default=True,
  176. visible=True,
  177. )
  178. register_artifact(
  179. "schedule",
  180. "Inductor scheduler information. Useful if working on Inductor fusion algo",
  181. off_by_default=True,
  182. )
  183. register_artifact("perf_hints", "", off_by_default=True)
  184. register_artifact("onnx_diagnostics", "", off_by_default=True)
  185. register_artifact("compute_dependencies", "", off_by_default=True)
  186. register_artifact(
  187. "fusion",
  188. "Detailed Inductor fusion decisions. More detailed than 'schedule'",
  189. off_by_default=True,
  190. )
  191. register_artifact(
  192. "loop_ordering",
  193. "Logs related to loop ordering",
  194. off_by_default=True,
  195. )
  196. register_artifact(
  197. "loop_tiling",
  198. "Logs related to loop ordering",
  199. off_by_default=True,
  200. )
  201. register_artifact(
  202. "auto_chunker",
  203. "Logs related to the auto chunker",
  204. off_by_default=True,
  205. )
  206. register_artifact(
  207. "overlap",
  208. "Detailed Inductor compute/comm overlap decisions",
  209. off_by_default=True,
  210. )
  211. register_artifact(
  212. "overlap_scheduling",
  213. "Detailed Inductor overlap scheduling pass information",
  214. off_by_default=True,
  215. )
  216. register_artifact(
  217. "sym_node",
  218. "Logs extra info for various SymNode operations",
  219. off_by_default=True,
  220. )
  221. register_artifact(
  222. "trace_shape_events",
  223. "Logs traces for every ShapeEnv operation that we record for replay",
  224. off_by_default=True,
  225. )
  226. register_artifact(
  227. "cudagraph_static_inputs",
  228. "Logs static inputs handling in dynamo, AOT, and cudagraphs",
  229. off_by_default=True,
  230. )
  231. register_artifact(
  232. "benchmarking",
  233. "Detailed Inductor benchmarking information.",
  234. off_by_default=True,
  235. )
  236. register_artifact(
  237. "node_runtime_estimation",
  238. "Node runtime estimation for compile-time optimization decisions.",
  239. off_by_default=True,
  240. )
  241. register_artifact(
  242. "autotuning",
  243. "Autotuning choice logs, such as kernel source, perf, and tuning parameters.",
  244. off_by_default=True,
  245. )
  246. register_artifact(
  247. "graph_region_expansion",
  248. "Logs detailed steps of the duplicate graph region tracker expansion algorithm",
  249. off_by_default=True,
  250. )
  251. register_artifact(
  252. "inductor_metrics",
  253. "Logs Inductor metrics, such as num_bytes, nodes_num_elem, node_runtimes",
  254. off_by_default=True,
  255. )
  256. register_artifact(
  257. "hierarchical_compile",
  258. "Logs debug info for hierarchical compilation",
  259. off_by_default=True,
  260. )
  261. register_artifact(
  262. "annotation",
  263. "Logs detailed steps of the creating annotation on graph nodes",
  264. off_by_default=True,
  265. )
  266. register_artifact("custom_format_test_artifact", "Testing only", log_format="")
  267. register_artifact(
  268. "caching",
  269. "Detailed Inductor caching information.",
  270. off_by_default=True,
  271. )