// CUDADeviceAssertionHost.h
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #include <c10/cuda/CUDAMacros.h>
  4. #include <cstdint>
  5. #include <memory>
  6. #include <mutex>
  7. #include <string>
  8. #include <utility>
  9. #include <vector>
  // TORCH_USE_CUDA_DSA is defined whenever either USE_CUDA or USE_ROCM is
  // defined; code later in this header tests it with #ifdef.
  #if defined(USE_CUDA) || defined(USE_ROCM)
  #define TORCH_USE_CUDA_DSA
  #endif

  /// Number of assertion failure messages we can store. If this is too small
  /// threads will fail silently.
  constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;

  /// Length of each fixed-size character buffer (assertion message, file name,
  /// function name) stored in c10::cuda::DeviceAssertionData.
  constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
  17. namespace c10::cuda {
  18. /// Holds information about any device-side assertions that fail.
  19. /// Held in managed memory and access by both the CPU and the GPU.
  20. struct DeviceAssertionData {
  21. /// Stringification of the assertion
  22. // NOLINTNEXTLINE(*-c-arrays)
  23. char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
  24. /// File the assertion was in
  25. // NOLINTNEXTLINE(*-c-arrays)
  26. char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
  27. /// Name of the function the assertion was in
  28. // NOLINTNEXTLINE(*-c-arrays)
  29. char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
  30. /// Line number the assertion was at
  31. int line_number{};
  32. /// Number uniquely identifying the kernel launch that triggered the assertion
  33. uint32_t caller{};
  34. /// block_id of the thread that failed the assertion
  35. // NOLINTNEXTLINE(*-c-arrays)
  36. int32_t block_id[3]{};
  37. /// third_id of the thread that failed the assertion
  38. // NOLINTNEXTLINE(*-c-arrays)
  39. int32_t thread_id[3]{};
  40. };
  41. /// Used to hold assertions generated by the device
  42. /// Held in managed memory and access by both the CPU and the GPU.
  43. struct DeviceAssertionsData {
  44. /// Total number of assertions found; a subset of these will be recorded
  45. /// in `assertions`
  46. int32_t assertion_count{};
  47. /// An array of assertions that will be written to in a race-free manner
  48. // NOLINTNEXTLINE(*-c-arrays)
  49. DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
  50. };
  /// Used to hold info about kernel launches so that we can run kernels
  /// asynchronously and still associate launches with device-side
  /// assertion failures.
  ///
  /// Every member has a default initializer so that a default-constructed
  /// instance never exposes indeterminate pointers or integers. The struct
  /// remains an aggregate (C++14 and later), so brace initialization with all
  /// eight fields in declaration order keeps working.
  struct CUDAKernelLaunchInfo {
    /// Filename of the code where the kernel was launched from
    const char* launch_filename = nullptr;

    /// Function from which the kernel was launched
    const char* launch_function = nullptr;

    /// Line number of where the code was launched from
    uint32_t launch_linenum = 0;

    /// Backtrace of where the kernel was launched from, only populated if
    /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is true
    std::string launch_stacktrace;

    /// Kernel that was launched
    const char* kernel_name = nullptr;

    /// Device the kernel was launched on
    int device = 0;

    /// Stream the kernel was launched on
    int32_t stream = 0;

    /// A number that uniquely identifies the kernel launch
    uint64_t generation_number = 0;
  };
  73. /// Circular buffer used to hold information about kernel launches
  74. /// this is later used to reconstruct how a device-side kernel assertion failure
  75. /// occurred CUDAKernelLaunchRegistry is used as a singleton
  76. class C10_CUDA_API CUDAKernelLaunchRegistry {
  77. private:
  78. /// Assume that this is the max number of kernel launches that might ever be
  79. /// enqueued across all streams on a single device
  80. static constexpr int max_kernel_launches = 1024;
  81. /// How many kernel launch infos we've inserted. Used to ensure that circular
  82. /// queue doesn't provide false information by always increasing, but also to
  83. /// mark where we are inserting into the queue
  84. #ifdef TORCH_USE_CUDA_DSA
  85. uint64_t generation_number = 0;
  86. #endif
  87. /// Shared mutex between writer and accessor to ensure multi-threaded safety.
  88. mutable std::mutex read_write_mutex;
  89. /// Used to ensure prevent race conditions in GPU memory allocation
  90. mutable std::mutex gpu_alloc_mutex;
  91. /// Pointer to managed memory keeping track of device-side assertions. There
  92. /// is one entry for each possible device the process might work with. Unused
  93. /// entries are nullptrs. We could also use an unordered_set here, but this
  94. /// vector design will be faster and the wasted memory is small since we
  95. /// expect the number of GPUs per node will always be small
  96. std::vector<
  97. std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
  98. uvm_assertions;
  99. /// A single circular buffer holds information about every kernel launch the
  100. /// process makes across all devices.
  101. std::vector<CUDAKernelLaunchInfo> kernel_launches;
  102. bool check_env_for_enable_launch_stacktracing() const;
  103. bool check_env_for_dsa_enabled() const;
  104. public:
  105. CUDAKernelLaunchRegistry();
  106. /// Register a new kernel launch and obtain a generation number back to be
  107. /// passed to the kernel
  108. uint32_t insert(
  109. const char* launch_filename,
  110. const char* launch_function,
  111. const uint32_t launch_linenum,
  112. const char* kernel_name,
  113. const int32_t stream_id);
  114. /// Get copies of the kernel launch registry and each device's assertion
  115. /// failure buffer so they can be inspected without raising race conditions
  116. std::
  117. pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
  118. snapshot() const;
  119. /// Get a pointer to the current device's assertion failure buffer. If no such
  120. /// buffer exists then one is created. This means that the first kernel launch
  121. /// made on each device will be slightly slower because memory allocations are
  122. /// required
  123. DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
  124. /// Gets the global singleton of the registry
  125. static CUDAKernelLaunchRegistry& get_singleton_ref();
  126. /// If not all devices support DSA, we disable it
  127. const bool do_all_devices_support_managed_memory = false;
  128. /// Whether or not to gather stack traces when launching kernels
  129. bool gather_launch_stacktrace = false;
  130. /// Whether or not host-side DSA is enabled or disabled at run-time
  131. /// Note: Device-side code cannot be enabled/disabled at run-time
  132. bool enabled_at_runtime = false;
  133. /// Whether or not a device has indicated a failure
  134. bool has_failed() const;
  135. #ifdef TORCH_USE_CUDA_DSA
  136. const bool enabled_at_compile_time = true;
  137. #else
  138. const bool enabled_at_compile_time = false;
  139. #endif
  140. };
  141. C10_CUDA_API std::string c10_retrieve_device_side_assertion_info();
  142. } // namespace c10::cuda
  // Each kernel launched with TORCH_DSA_KERNEL_LAUNCH requires the same
  // input arguments. TORCH_DSA_KERNEL_ARGS standardizes them: place it in a
  // kernel's parameter list to declare the assertion buffer pointer and the
  // caller id. Both are marked [[maybe_unused]].
  #define TORCH_DSA_KERNEL_ARGS                                              \
    [[maybe_unused]] c10::cuda::DeviceAssertionsData* const assertions_data, \
        [[maybe_unused]] uint32_t assertion_caller_id

  // Forwards the DSA arguments declared by TORCH_DSA_KERNEL_ARGS onward to
  // another function.
  #define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id
  152. #else
  153. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  154. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)