CUDAException.h 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #include <c10/cuda/CUDADeviceAssertionHost.h>
  4. #include <c10/cuda/CUDAMacros.h>
  5. #include <c10/cuda/CUDAMiscFunctions.h>
  6. #include <c10/macros/Macros.h>
  7. #include <c10/util/Exception.h>
  8. #include <c10/util/irange.h>
  9. #include <cuda.h>
  10. // Note [CHECK macro]
  11. // ~~~~~~~~~~~~~~~~~~
  12. // This is a macro so that AT_ERROR can get accurate __LINE__
  13. // and __FILE__ information. We could split this into a short
  14. // macro and a function implementation if we pass along __LINE__
  15. // and __FILE__, but no one has found this worth doing.
  16. // Used to denote errors from CUDA framework.
  17. // This needs to be declared here instead util/Exception.h for proper conversion
  18. // during hipify.
  19. namespace c10 {
  20. class C10_CUDA_API CUDAError : public c10::Error {
  21. using Error::Error;
  22. };
  23. } // namespace c10
  24. #define C10_CUDA_CHECK(EXPR) \
  25. do { \
  26. const cudaError_t __err = EXPR; \
  27. c10::cuda::c10_cuda_check_implementation( \
  28. static_cast<int32_t>(__err), \
  29. __FILE__, \
  30. __func__, /* Line number data type not well-defined between \
  31. compilers, so we perform an explicit cast */ \
  32. static_cast<uint32_t>(__LINE__), \
  33. true); \
  34. } while (0)
  35. // backwards compat due to hipify v2 changes, for extension projects
  36. #define C10_HIP_CHECK C10_CUDA_CHECK
  37. #define C10_CUDA_CHECK_WARN(EXPR) \
  38. do { \
  39. const cudaError_t __err = EXPR; \
  40. if (C10_UNLIKELY(__err != cudaSuccess)) { \
  41. [[maybe_unused]] auto error_unused = cudaGetLastError(); \
  42. TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \
  43. } \
  44. } while (0)
  45. // Indicates that a CUDA error is handled in a non-standard way
  46. #define C10_CUDA_ERROR_HANDLED(EXPR) EXPR
  47. // Intentionally ignore a CUDA error
  48. #define C10_CUDA_IGNORE_ERROR(EXPR) \
  49. do { \
  50. const cudaError_t __err = EXPR; \
  51. if (C10_UNLIKELY(__err != cudaSuccess)) { \
  52. [[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
  53. } \
  54. } while (0)
  55. // Clear the last CUDA error
  56. #define C10_CUDA_CLEAR_ERROR() \
  57. do { \
  58. [[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
  59. } while (0)
  60. // This should be used directly after every kernel launch to ensure
  61. // the launch happened correctly and provide an early, close-to-source
  62. // diagnostic if it didn't.
  63. #define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError())
  64. /// Launches a CUDA kernel appending to it all the information need to handle
  65. /// device-side assertion failures. Checks that the launch was successful.
  66. #define TORCH_DSA_KERNEL_LAUNCH( \
  67. kernel, blocks, threads, shared_mem, stream, ...) \
  68. do { \
  69. auto& launch_registry = \
  70. c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref(); \
  71. kernel<<<blocks, threads, shared_mem, stream>>>( \
  72. __VA_ARGS__, \
  73. launch_registry.get_uvm_assertions_ptr_for_current_device(), \
  74. launch_registry.insert( \
  75. __FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \
  76. C10_CUDA_KERNEL_LAUNCH_CHECK(); \
  77. } while (0)
  78. namespace c10::cuda {
  79. /// In the event of a CUDA failure, formats a nice error message about that
  80. /// failure and also checks for device-side assertion failures
  81. C10_CUDA_API void c10_cuda_check_implementation(
  82. const int32_t err,
  83. const char* filename,
  84. const char* function_name,
  85. const uint32_t line_number,
  86. const bool include_device_assertions);
  87. } // namespace c10::cuda
  88. #else
  89. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  90. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)