| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
- #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
- #pragma once
- #include <c10/cuda/CUDAException.h>
- #include <c10/macros/Macros.h>
- namespace c10::cuda {
- #ifdef TORCH_USE_CUDA_DSA
- C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")
- // Copy string from `src` to `dst`
- static __device__ void dstrcpy(char* dst, const char* src) {
- int i = 0;
- // Copy string from source to destination, ensuring that it
- // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
- while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
- *dst++ = *src++;
- }
- *dst = '\0';
- }
- static __device__ void dsa_add_new_assertion_failure(
- DeviceAssertionsData* assertions_data,
- const char* assertion_msg,
- const char* filename,
- const char* function_name,
- const int line_number,
- const uint32_t caller,
- const dim3 block_id,
- const dim3 thread_id) {
- // `assertions_data` may be nullptr if device-side assertion checking
- // is disabled at run-time. If it is disabled at compile time this
- // function will never be called
- if (!assertions_data) {
- return;
- }
- // Atomically increment so other threads can fail at the same time
- // Note that incrementing this means that the CPU can observe that
- // a failure has happened and can begin to respond before we've
- // written information about that failure out to the buffer.
- const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
- if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
- // At this point we're ran out of assertion buffer space.
- // We could print a message about this, but that'd get
- // spammy if a lot of threads did it, so we just silently
- // ignore any other assertion failures. In most cases the
- // failures will all probably be analogous anyway.
- return;
- }
- // Write information about the assertion failure to memory.
- // Note that this occurs only after the `assertion_count`
- // increment broadcasts that there's been a problem.
- auto& self = assertions_data->assertions[nid];
- dstrcpy(self.assertion_msg, assertion_msg);
- dstrcpy(self.filename, filename);
- dstrcpy(self.function_name, function_name);
- self.line_number = line_number;
- self.caller = caller;
- self.block_id[0] = block_id.x;
- self.block_id[1] = block_id.y;
- self.block_id[2] = block_id.z;
- self.thread_id[0] = thread_id.x;
- self.thread_id[1] = thread_id.y;
- self.thread_id[2] = thread_id.z;
- }
- C10_CLANG_DIAGNOSTIC_POP()
- // Emulates a kernel assertion. The assertion won't stop the kernel's progress,
- // so you should assume everything the kernel produces is garbage if there's an
- // assertion failure.
- // NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
- // arguments of the kernel and therefore accessible.
- #define CUDA_KERNEL_ASSERT2(condition) \
- do { \
- if (C10_UNLIKELY(!(condition))) { \
- /* Has an atomic element so threads can fail at the same time */ \
- c10::cuda::dsa_add_new_assertion_failure( \
- assertions_data, \
- C10_STRINGIZE(condition), \
- __FILE__, \
- __FUNCTION__, \
- __LINE__, \
- assertion_caller_id, \
- blockIdx, \
- threadIdx); \
- /* Now that the kernel has failed we early exit the kernel, but */ \
- /* otherwise keep going and rely on the host to check UVM and */ \
- /* determine we've had a problem */ \
- return; \
- } \
- } while (false)
- #else
- #define CUDA_KERNEL_ASSERT2(condition) assert(condition)
- #endif
- } // namespace c10::cuda
- #else
- #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
- #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
|