Config.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. /*
  3. * Copyright (c) Meta Platforms, Inc. and affiliates.
  4. * All rights reserved.
  5. *
  6. * This source code is licensed under the BSD-style license found in the
  7. * LICENSE file in the root directory of this source tree.
  8. */
  9. #pragma once
  10. #include "AbstractConfig.h"
  11. #include "ActivityType.h"
  12. #include <cassert>
  13. #include <chrono>
  14. #include <functional>
  15. #include <set>
  16. #include <string>
  17. #include <vector>
  18. namespace libkineto {
  19. class Config : public AbstractConfig {
  20. public:
  21. Config();
  22. Config& operator=(const Config&) = delete;
  23. Config(Config&&) = delete;
  24. Config& operator=(Config&&) = delete;
  25. ~Config() override = default;
  26. // Return a full copy including feature config object
  27. [[nodiscard]] std::unique_ptr<Config> clone() const {
  28. auto cfg = std::unique_ptr<Config>(new Config(*this));
  29. cloneFeaturesInto(*cfg);
  30. return cfg;
  31. }
  32. bool handleOption(const std::string& name, std::string& val) override;
  33. void setClientDefaults() override;
  34. // Log events to this file
  35. [[nodiscard]] const std::string& eventLogFile() const {
  36. return eventLogFile_;
  37. }
  38. [[nodiscard]] bool activityProfilerEnabled() const {
  39. return activityProfilerEnabled_ ||
  40. activitiesOnDemandTimestamp_.time_since_epoch().count() > 0;
  41. }
  42. // Log activitiy trace to this file
  43. [[nodiscard]] const std::string& activitiesLogFile() const {
  44. return activitiesLogFile_;
  45. }
  46. // Log activitiy trace to this url
  47. [[nodiscard]] const std::string& activitiesLogUrl() const {
  48. return activitiesLogUrl_;
  49. }
  50. void setActivitiesLogUrl(const std::string& url) {
  51. activitiesLogUrl_ = url;
  52. }
  53. [[nodiscard]] bool activitiesLogToMemory() const {
  54. return activitiesLogToMemory_;
  55. }
  56. [[nodiscard]] bool eventProfilerEnabled() const {
  57. return !eventNames_.empty() || !metricNames_.empty();
  58. }
  59. // Is profiling enabled for the given device?
  60. [[nodiscard]] bool eventProfilerEnabledForDevice(uint32_t dev) const {
  61. return 0 != (eventProfilerDeviceMask_ & (1 << dev));
  62. }
  63. // Take a sample (read hardware counters) at this frequency.
  64. // This controls how often counters are read - if all counters cannot
  65. // be collected simultaneously then multiple samples are needed to
  66. // collect all requested counters - see multiplex period.
  67. [[nodiscard]] std::chrono::milliseconds samplePeriod() const {
  68. return samplePeriod_;
  69. }
  70. void setSamplePeriod(std::chrono::milliseconds period) {
  71. samplePeriod_ = period;
  72. }
  73. // When all requested counters cannot be collected simultaneously,
  74. // counters will be multiplexed at this frequency.
  75. // Multiplexing can have a large performance impact if done frequently.
  76. // To avoid a perf impact, keep this at 1s or above.
  77. [[nodiscard]] std::chrono::milliseconds multiplexPeriod() const {
  78. return multiplexPeriod_;
  79. }
  80. void setMultiplexPeriod(std::chrono::milliseconds period) {
  81. multiplexPeriod_ = period;
  82. }
  83. // Report counters at this frequency. Note that several samples can
  84. // be reported each time, see samplesPerReport.
  85. [[nodiscard]] std::chrono::milliseconds reportPeriod() const {
  86. return reportPeriod_;
  87. }
  88. void setReportPeriod(std::chrono::milliseconds msecs);
  89. // Number of samples dispatched each report period.
  90. // Must be in the range [1, report period / sample period].
  91. // In other words, aggregation is supported but not interpolation.
  92. [[nodiscard]] int samplesPerReport() const {
  93. return samplesPerReport_;
  94. }
  95. void setSamplesPerReport(int count) {
  96. samplesPerReport_ = count;
  97. }
  98. // The names of events to collect
  99. [[nodiscard]] const std::set<std::string>& eventNames() const {
  100. return eventNames_;
  101. }
  102. // Add additional events to be profiled
  103. void addEvents(const std::set<std::string>& names) {
  104. eventNames_.insert(names.begin(), names.end());
  105. }
  106. // The names of metrics to collect
  107. [[nodiscard]] const std::set<std::string>& metricNames() const {
  108. return metricNames_;
  109. }
  110. // Add additional metrics to be profiled
  111. void addMetrics(const std::set<std::string>& names) {
  112. metricNames_.insert(names.begin(), names.end());
  113. }
  114. [[nodiscard]] const std::vector<int>& percentiles() const {
  115. return eventReportPercentiles_;
  116. }
  117. // Profile for this long, then revert to base config
  118. [[nodiscard]] std::chrono::seconds eventProfilerOnDemandDuration() const {
  119. return eventProfilerOnDemandDuration_;
  120. }
  121. void setEventProfilerOnDemandDuration(std::chrono::seconds duration) {
  122. eventProfilerOnDemandDuration_ = duration;
  123. }
  124. // Too many event profilers on a single system can overload the driver.
  125. // At some point, latencies shoot through the roof and collection of samples
  126. // becomes impossible. To avoid this situation we have a limit of profilers
  127. // per GPU.
  128. // NOTE: Communication with a daemon is needed for this feature.
  129. // Library must be built with an active DaemonConfigLoader.
  130. [[nodiscard]] int maxEventProfilersPerGpu() const {
  131. return eventProfilerMaxInstancesPerGpu_;
  132. }
  133. // On Cuda11 we've seen occasional hangs when reprogramming counters
  134. // Monitor profiling threads and report when a thread is not responding
  135. // for a given number of seconds.
  136. // A period of 0 means disable.
  137. [[nodiscard]] std::chrono::seconds eventProfilerHeartbeatMonitorPeriod()
  138. const {
  139. return eventProfilerHeartbeatMonitorPeriod_;
  140. }
  141. // The types of activities selected in the configuration file
  142. [[nodiscard]] const std::set<ActivityType>& selectedActivityTypes() const {
  143. return selectedActivityTypes_;
  144. }
  145. // Set the types of activities to be traced
  146. [[nodiscard]] bool perThreadBufferEnabled() const {
  147. return perThreadBufferEnabled_;
  148. }
  149. void setSelectedActivityTypes(const std::set<ActivityType>& types) {
  150. selectedActivityTypes_ = types;
  151. }
  152. [[nodiscard]] bool isReportInputShapesEnabled() const {
  153. return enableReportInputShapes_;
  154. }
  155. [[nodiscard]] bool isProfileMemoryEnabled() const {
  156. return enableProfileMemory_;
  157. }
  158. [[nodiscard]] bool isWithStackEnabled() const {
  159. return enableWithStack_;
  160. }
  161. [[nodiscard]] bool isWithFlopsEnabled() const {
  162. return enableWithFlops_;
  163. }
  164. [[nodiscard]] bool isWithModulesEnabled() const {
  165. return enableWithModules_;
  166. }
  167. // Trace for this long
  168. [[nodiscard]] std::chrono::milliseconds activitiesDuration() const {
  169. return activitiesDuration_;
  170. }
  171. // Trace for this many iterations, determined by external API
  172. [[nodiscard]] int activitiesRunIterations() const {
  173. return activitiesRunIterations_;
  174. }
  175. [[nodiscard]] int activitiesMaxGpuBufferSize() const {
  176. return activitiesMaxGpuBufferSize_;
  177. }
  178. [[nodiscard]] std::chrono::seconds activitiesWarmupDuration() const {
  179. return activitiesWarmupDuration_;
  180. }
  181. [[nodiscard]] int activitiesWarmupIterations() const {
  182. return activitiesWarmupIterations_;
  183. }
  184. // Show CUDA Synchronization Stream Wait Events
  185. [[nodiscard]] bool activitiesCudaSyncWaitEvents() const {
  186. return activitiesCudaSyncWaitEvents_;
  187. }
  188. void setActivitiesCudaSyncWaitEvents(bool enable) {
  189. activitiesCudaSyncWaitEvents_ = enable;
  190. }
  191. // Timestamp at which the profiling to start, requested by the user.
  192. [[nodiscard]] std::chrono::time_point<std::chrono::system_clock>
  193. requestTimestamp() const {
  194. if (profileStartTime_.time_since_epoch().count()) {
  195. return profileStartTime_;
  196. }
  197. // If no one requested timestamp, return 0.
  198. if (requestTimestamp_.time_since_epoch().count() == 0) {
  199. return requestTimestamp_;
  200. }
  201. // TODO(T94634890): Deprecate requestTimestamp
  202. return requestTimestamp_ + maxRequestAge() + activitiesWarmupDuration();
  203. }
  204. [[nodiscard]] bool hasProfileStartTime() const {
  205. return requestTimestamp_.time_since_epoch().count() > 0 ||
  206. profileStartTime_.time_since_epoch().count() > 0;
  207. }
  208. [[nodiscard]] int profileStartIteration() const {
  209. return profileStartIteration_;
  210. }
  211. [[nodiscard]] bool hasProfileStartIteration() const {
  212. return profileStartIteration_ >= 0 && activitiesRunIterations_ > 0;
  213. }
  214. void setProfileStartIteration(int iter) {
  215. profileStartIteration_ = iter;
  216. }
  217. [[nodiscard]] int profileStartIterationRoundUp() const {
  218. return profileStartIterationRoundUp_;
  219. }
  220. // calculate the start iteration accounting for warmup
  221. [[nodiscard]] int startIterationIncludingWarmup() const {
  222. if (!hasProfileStartIteration()) {
  223. return -1;
  224. }
  225. return profileStartIteration_ - activitiesWarmupIterations_;
  226. }
  227. [[nodiscard]] std::chrono::seconds maxRequestAge() const;
  228. // All VLOG* macros will log if the verbose log level is >=
  229. // the verbosity specified for the verbose log message.
  230. // Default value is -1, so messages with log level 0 will log by default.
  231. [[nodiscard]] int verboseLogLevel() const {
  232. return verboseLogLevel_;
  233. }
  234. // Modules for which verbose logging is enabled.
  235. // If empty, logging is enabled for all modules.
  236. [[nodiscard]] const std::vector<std::string>& verboseLogModules() const {
  237. return verboseLogModules_;
  238. }
  239. [[nodiscard]] bool sigUsr2Enabled() const {
  240. return enableSigUsr2_;
  241. }
  242. [[nodiscard]] bool ipcFabricEnabled() const {
  243. return enableIpcFabric_;
  244. }
  245. [[nodiscard]] std::chrono::seconds onDemandConfigUpdateIntervalSecs() const {
  246. return onDemandConfigUpdateIntervalSecs_;
  247. }
  248. static std::chrono::milliseconds alignUp(
  249. std::chrono::milliseconds duration,
  250. std::chrono::milliseconds alignment) {
  251. duration += alignment;
  252. return duration - (duration % alignment);
  253. }
  254. [[nodiscard]] std::chrono::time_point<std::chrono::system_clock>
  255. eventProfilerOnDemandStartTime() const {
  256. return eventProfilerOnDemandTimestamp_;
  257. }
  258. [[nodiscard]] std::chrono::time_point<std::chrono::system_clock>
  259. eventProfilerOnDemandEndTime() const {
  260. return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_;
  261. }
  262. [[nodiscard]] std::chrono::time_point<std::chrono::system_clock>
  263. activityProfilerRequestReceivedTime() const {
  264. return activitiesOnDemandTimestamp_;
  265. }
  266. static constexpr std::chrono::milliseconds kControllerIntervalMsecs{1000};
  267. // Users may request and set trace id and group trace id.
  268. [[nodiscard]] const std::string& requestTraceID() const {
  269. return requestTraceID_;
  270. }
  271. void setRequestTraceID(const std::string& tid) {
  272. requestTraceID_ = tid;
  273. }
  274. [[nodiscard]] const std::string& requestGroupTraceID() const {
  275. return requestGroupTraceID_;
  276. }
  277. void setRequestGroupTraceID(const std::string& gtid) {
  278. requestGroupTraceID_ = gtid;
  279. }
  280. [[nodiscard]] size_t cuptiDeviceBufferSize() const {
  281. return cuptiDeviceBufferSize_;
  282. }
  283. [[nodiscard]] size_t cuptiDeviceBufferPoolLimit() const {
  284. return cuptiDeviceBufferPoolLimit_;
  285. }
  286. [[nodiscard]] bool memoryProfilerEnabled() const {
  287. return memoryProfilerEnabled_;
  288. }
  289. [[nodiscard]] int profileMemoryDuration() const {
  290. return profileMemoryDuration_;
  291. }
  292. void updateActivityProfilerRequestReceivedTime();
  293. void printActivityProfilerConfig(std::ostream& s) const override;
  294. void setActivityDependentConfig() override;
  295. void validate(
  296. const std::chrono::time_point<std::chrono::system_clock>&
  297. fallbackProfileStartTime) override;
  298. static void addConfigFactory(
  299. std::string name,
  300. std::function<AbstractConfig*(Config&)> factory);
  301. void print(std::ostream& s) const;
  302. // Config relies on some state with global static lifetime. If other
  303. // threads are using the config, it's possible that the global state
  304. // is destroyed before the threads stop. By hanging onto this handle,
  305. // correct destruction order can be ensured.
  306. static std::shared_ptr<void> getStaticObjectsLifetimeHandle();
  307. [[nodiscard]] bool getTSCTimestampFlag() const {
  308. return useTSCTimestamp_;
  309. }
  310. void setTSCTimestampFlag(bool flag) {
  311. useTSCTimestamp_ = flag;
  312. }
  313. [[nodiscard]] const std::string& getCustomConfig() const {
  314. return customConfig_;
  315. }
  316. [[nodiscard]] uint32_t maxEvents() const {
  317. return maxEvents_;
  318. }
  319. private:
  320. explicit Config(const Config& other) = default;
  321. AbstractConfig* cloneDerived(AbstractConfig& parent) const override {
  322. // Clone from AbstractConfig not supported
  323. assert(false);
  324. return nullptr;
  325. }
  326. uint8_t createDeviceMask(const std::string& val);
  327. // Adds valid activity types from the user defined string list in the
  328. // configuration file
  329. void setActivityTypes(const std::vector<std::string>& selected_activities);
  330. // Sets the default activity types to be traced
  331. void selectDefaultActivityTypes() {
  332. // If the user has not specified an activity list, add all types
  333. for (ActivityType t : defaultActivityTypes()) {
  334. selectedActivityTypes_.insert(t);
  335. }
  336. }
  337. int verboseLogLevel_;
  338. std::vector<std::string> verboseLogModules_;
  339. // Event profiler
  340. // These settings are also supported in on-demand mode
  341. std::chrono::milliseconds samplePeriod_;
  342. std::chrono::milliseconds reportPeriod_;
  343. int samplesPerReport_;
  344. std::set<std::string> eventNames_;
  345. std::set<std::string> metricNames_;
  346. // On-demand duration
  347. std::chrono::seconds eventProfilerOnDemandDuration_;
  348. // Last on-demand request
  349. std::chrono::time_point<std::chrono::system_clock>
  350. eventProfilerOnDemandTimestamp_;
  351. int eventProfilerMaxInstancesPerGpu_;
  352. // Monitor whether event profiler threads are stuck
  353. // at this frequency
  354. std::chrono::seconds eventProfilerHeartbeatMonitorPeriod_;
  355. // These settings can not be changed on-demand
  356. std::string eventLogFile_;
  357. std::vector<int> eventReportPercentiles_ = {5, 25, 50, 75, 95};
  358. uint8_t eventProfilerDeviceMask_ = ~0;
  359. std::chrono::milliseconds multiplexPeriod_;
  360. // Activity profiler
  361. bool activityProfilerEnabled_;
  362. // Enable per-thread buffer
  363. bool perThreadBufferEnabled_;
  364. std::set<ActivityType> selectedActivityTypes_;
  365. // The activity profiler settings are all on-demand
  366. std::string activitiesLogFile_;
  367. std::string activitiesLogUrl_;
  368. // Log activities to memory buffer
  369. bool activitiesLogToMemory_{false};
  370. int activitiesMaxGpuBufferSize_;
  371. std::chrono::seconds activitiesWarmupDuration_;
  372. int activitiesWarmupIterations_;
  373. bool activitiesCudaSyncWaitEvents_;
  374. // Enable Profiler Config Options
  375. // Temporarily disable shape collection until we re-roll out the feature for
  376. // on-demand cases
  377. bool enableReportInputShapes_{false};
  378. bool enableProfileMemory_{false};
  379. bool enableWithStack_{false};
  380. bool enableWithFlops_{false};
  381. bool enableWithModules_{false};
  382. // Profile for specified iterations and duration
  383. std::chrono::milliseconds activitiesDuration_;
  384. int activitiesRunIterations_;
  385. // Below are not used
  386. // Use this net name for iteration count
  387. std::string activitiesExternalAPIIterationsTarget_;
  388. // Only profile nets that includes this in the name
  389. std::vector<std::string> activitiesExternalAPIFilter_;
  390. // Only profile nets with at least this many operators
  391. int activitiesExternalAPINetSizeThreshold_;
  392. // Only profile nets with at least this many GPU operators
  393. int activitiesExternalAPIGpuOpCountThreshold_;
  394. // Last activity profiler request
  395. std::chrono::time_point<std::chrono::system_clock>
  396. activitiesOnDemandTimestamp_;
  397. // ActivityProfilers are triggered by either:
  398. // Synchronized start timestamps
  399. std::chrono::time_point<std::chrono::system_clock> profileStartTime_;
  400. // Or start iterations.
  401. int profileStartIteration_;
  402. int profileStartIterationRoundUp_;
  403. // DEPRECATED
  404. std::chrono::time_point<std::chrono::system_clock> requestTimestamp_;
  405. // Enable profiling via SIGUSR2
  406. bool enableSigUsr2_;
  407. // Enable IPC Fabric instead of thrift communication
  408. bool enableIpcFabric_;
  409. std::chrono::seconds onDemandConfigUpdateIntervalSecs_;
  410. // Logger Metadata
  411. std::string requestTraceID_;
  412. std::string requestGroupTraceID_;
  413. // CUPTI Device Buffer
  414. size_t cuptiDeviceBufferSize_;
  415. size_t cuptiDeviceBufferPoolLimit_;
  416. // CUPTI Timestamp Format
  417. bool useTSCTimestamp_{true};
  418. // Memory Profiler
  419. bool memoryProfilerEnabled_{false};
  420. int profileMemoryDuration_{1000};
  421. // Used to flexibly configure some custom options, especially for custom
  422. // backends. How to parse this string is handled by the custom backend.
  423. std::string customConfig_;
  424. // Roctracer settings
  425. uint32_t maxEvents_{5000000};
  426. };
// Environment variable name "KINETO_USE_DAEMON".
// NOTE(review): presumably the variable inspected by isDaemonEnvVarSet() -
// confirm against its definition, which is not in this header.
constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON";
// Declared here, defined elsewhere.
// NOTE(review): by its name, reports whether the daemon environment variable
// is set - confirm against the definition.
bool isDaemonEnvVarSet();
// Returns a reference to the protobuf trace enabled flag.
// This allows the flag to be set externally (e.g., from JustKnobs in FBConfig)
// and read in other components (e.g., ChromeTraceLogger).
// The reference is non-const, so callers can assign through it. Only the
// declaration is visible here; the flag's storage is defined elsewhere.
bool& get_protobuf_trace_enabled();
  433. } // namespace libkineto
  434. #else
  435. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  436. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)