ApplyGridUtils.cuh 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #include <ATen/cuda/CUDAContext.h>
  3. #include <cuda_runtime.h>
  4. namespace at::cuda {
  5. /**
  6. Computes ceil(a / b)
  7. */
  8. template <typename T>
  9. __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) {
  10. return (a + b - 1) / b;
  11. }
  12. namespace {
  13. // Threads per block for our apply kernel
  14. // FIXME: use occupancy calculator instead
  15. constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512;
  16. constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4;
  17. template <int step = 1>
  18. inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
  19. if (curDevice == -1) return false;
  20. uint64_t numel_per_thread = static_cast<uint64_t>(max_threads_per_block) * static_cast<uint64_t>(step);
  21. uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread);
  22. uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0];
  23. if (numBlocks > maxGridX)
  24. numBlocks = maxGridX;
  25. grid = dim3(numBlocks);
  26. return true;
  27. }
  28. constexpr int getApplyBlocksPerSM() {
  29. return AT_APPLY_BLOCKS_PER_SM;
  30. }
  31. constexpr int getApplyBlockSize() {
  32. return AT_APPLY_THREADS_PER_BLOCK;
  33. }
  34. inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
  35. return dim3(max_threads_per_block);
  36. }
  37. } // anonymous namespace
  38. } // namespace at::cuda
  39. #else
  40. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  41. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)