CUDAApplyUtils.cuh 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #include <ATen/cuda/ApplyGridUtils.cuh>
  4. #include <ATen/cuda/detail/IndexUtils.cuh>
  5. #include <ATen/core/TensorBase.h>
  6. #include <ATen/ceil_div.h>
  7. #include <ATen/cuda/Atomic.cuh>
  8. #include <ATen/cuda/CUDAContext.h>
  9. #include <c10/macros/Macros.h>
  10. #include <ATen/native/Copy.h>
  11. #include <math.h>
  12. //
  13. // This file contains pointwise operation functions and kernels that
  14. // work on both contiguous and non-contiguous tensor arguments of
  15. // arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without
  16. // copying or temporary storage.
  17. //
  18. /*
  19. NOTE [ CUDA_tensor_applyN helpers ]
  20. The following CUDA_tensor_applyN (where N currently can be 1, 2, 3, or 4)
  21. functions apply a pointwise operator to N tensor(s).
  22. The calling convention is
  23. 1. The template arguments should be, sequentially,
  24. - First N typename args specify the scalar types of each of the N tensors.
  25. - (Optional) `int step` arg specifies the number of elements processed
  26. together at the same time.
  27. Default is 1.
  28. - A usually omitted (i.e., inferred) typename arg specifies the type of the
  29. function/functor applied on `N * step` values in each iteration of each
  30. CUDA thread.
  31. 2. The arguments should be, sequentially,
  32. - N tensors
  33. - op: a function/functor that processes `N * step` values at the same time.
  34. - If `step == 1`, it must have signature
  35. `void(*)(scalar1_t&, scalar2_t&, ..., scalarN_t&)`, where
  36. `scalar*_t`s are the first N typename template args, and the inputs
  37. are the `N` values from the `N` tensors retrieved at a common index.
  38. - Otherwise, it must must have signature
  39. void(*)(int n, scalar1_t&, scalar1_t&, ..., scalar1_t&, // repeat `step` times
  40. scalar2_t&, scalar2_t&, ..., scalar2_t&, // repeat `step` times
  41. ...,
  42. scalarN_t&, scalarN_t&, ..., scalarN_t&) // repeat `step` times
  43. Different from `step == 1` case, it processes `N * step` values taken
  44. from `step` common indices. Moreover, the first input `n` represents the
  45. number of valid indices (it will always have `0 < n <= step`). It will
  46. almost always be `step`, but at the boundary we may not have full `step`
  47. elements and `n` can be a lesser value.
  48. E.g., if `step == 4` and `N == 2`, `op` could be
  49. [](int n, scalar1_t &u1, scalar1_t &u2, scalar1_t &u3, scalar1_t &u4,
  50. scalar2_t &v1, scalar2_t &v2, scalar2_t &v3, scalar2_t &v4) {
  51. // Only process u1, ..., un and v1, ..., vn.
  52. // So if `n == 3`, `u4` and `v4` need not to be considered.
  53. }
  54. In both cases, the references can actually be const, but at least one of
  55. them should be non-const in order to write the output.
  56. - (Optional, but recommended) N TensorArgType args that specify for each
  57. tensor whether `op` reads AND writes ] (i.e., TensorArgType::ReadWrite),
  58. or only reads (i.e., TensorArgType::ReadOnly).
  59. Default is TensorArgType::ReadWrite for first Tensor, and
  60. TensorArgType::ReadOnly for the rest.
  61. E.g.,
  62. to compute a = b^2 for a and b of same dtype, we can call
  63. CUDA_tensor_apply2<scalar, scalar>(
  64. a, b,
  65. [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; }
  66. );
  67. to work on 2 values at the same time, we can call
  68. CUDA_tensor_apply2<scalar1, scalar2, 2>(
  69. a, b,
  70. [] __device__ (int n, scalar1 &a_val1, scalar1 &a_val2,
  71. const scalar2 &b_val1, const scalar2 &b_val2) {
  72. // call special vectorized op here, or just do elementwise and enjoy unrolling...
  73. // if n == 1, only process a_val1 and b_val1
  74. }
  75. );
  76. */
  77. namespace at::cuda {
  78. // TODO: combine with TensorArg? So far that's been for debugging, and this is functional...
  79. enum class TensorArgType { ReadWrite, ReadOnly };
  80. namespace {
  81. // Rearrange dimensions for pointwise operations so that strides are in
  82. // decreasing order as much as possible, so that kernels have better memory
  83. // access patterns.
  84. //
  85. // For example, consider a binary operation on two "transposed" 2-dim tensors:
  86. // sizes: 256 512
  87. // aInfo->strides: 1 256
  88. // bInfo->strides: 1 256
  89. //
  90. // Given this, each concurrent memory access inside kernelPointwiseApply2() is
  91. // exactly 256 elements apart, resulting in poor performance.
  92. //
  93. // This function exchanges dimensions so that memory access is contiguous:
  94. // sizes: 512 256
  95. // aInfo->strides: 256 1
  96. // bInfo->strides: 256 1
  97. //
  98. // (Actually, it becomes even better because now collapseDims() can turn each
  99. // input into one contiguous array.)
  100. //
  101. // In general, given M (<=4) TensorInfo's with N dimensions, we can view each
  102. // strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange
  103. // strides[i] and [j] if
  104. // (1) strides[i][k] < strides[j][k] for some k (0 <= k < M)
  105. // (exchanging them will benefit input #k), and
  106. // (2) strides[i][k] <= strieds[j][k] for all k
  107. // (exchanging them will not make any input worse).
  108. template <typename T1, typename IndexType,
  109. typename T2 = void, typename T3 = void, typename T4 = void>
  110. inline void rearrangeDims(detail::TensorInfo<T1, IndexType>* aInfo,
  111. detail::TensorInfo<T2, IndexType>* bInfo = nullptr,
  112. detail::TensorInfo<T3, IndexType>* cInfo = nullptr,
  113. detail::TensorInfo<T4, IndexType>* dInfo = nullptr) {
  114. int numInfos = 1;
  115. int dims = aInfo->dims;
  116. IndexType *sizes[4] = { aInfo->sizes, };
  117. IndexType *strides[4] = { aInfo->strides, };
  118. if (bInfo != nullptr) {
  119. ++numInfos;
  120. if (bInfo->dims != dims) return;
  121. sizes[1] = bInfo->sizes;
  122. strides[1] = bInfo->strides;
  123. }
  124. if (cInfo != nullptr) {
  125. ++numInfos;
  126. if (cInfo->dims != dims) return;
  127. sizes[2] = cInfo->sizes;
  128. strides[2] = cInfo->strides;
  129. }
  130. if (dInfo != nullptr) {
  131. ++numInfos;
  132. if (dInfo->dims != dims) return;
  133. sizes[3] = dInfo->sizes;
  134. strides[3] = dInfo->strides;
  135. }
  136. // Bail out if sizes do not match: we are using "deprecated pointwise
  137. // behavior" among tensors of different shapes but same number of elements.
  138. for (int i = 1; i < numInfos; ++i) {
  139. for (int j = 0; j < dims; ++j) {
  140. if (sizes[i][j] != sizes[0][j]) return;
  141. }
  142. }
  143. for (int i = 0; i < dims - 1; ++i) {
  144. // No need to consider dimensions of size 1.
  145. if (sizes[0][i] == 1) continue;
  146. for (int j = i + 1; j < dims; ++j) {
  147. if (sizes[0][j] == 1) continue;
  148. // Compare the relative sizes of strides between dim #i and dim #j.
  149. bool hasIncreasingStrides = false;
  150. bool hasDecreasingStrides = false;
  151. for (int k = 0; k < numInfos; k++) {
  152. IndexType stride_i = strides[k][i];
  153. IndexType stride_j = strides[k][j];
  154. if (stride_i < stride_j) {
  155. hasIncreasingStrides = true;
  156. } else if (stride_i > stride_j) {
  157. hasDecreasingStrides = true;
  158. }
  159. }
  160. if (hasIncreasingStrides && !hasDecreasingStrides) {
  161. for (int k = 0; k < numInfos; k++) {
  162. IndexType size = sizes[k][i];
  163. sizes[k][i] = sizes[k][j];
  164. sizes[k][j] = size;
  165. IndexType stride = strides[k][i];
  166. strides[k][i] = strides[k][j];
  167. strides[k][j] = stride;
  168. }
  169. }
  170. }
  171. }
  172. }
  173. // The `remaining_steps` argument is used to support Op that operates on
  174. // multiple elements at the same time. Generally, the strategy of ApplyOpN is to
  175. // 1. Initialize `remaining_steps = step`, where `step` is the template arg of
  176. // CUDA_tensor_applyN helpers. The input arg `n` to `apply()` represents the
  177. // number of elements in bound for this call. It will almost always equal to
  178. // `step` except at boundaries.
  179. // 2. If `remaining_steps > 0` convert the current linearIndex to offset (if in
  180. // bound), and recursively call `ApplyOpN` with `remaining_steps - 1`.
  181. // 3. At `remaining_steps = 0`,
  182. // if `step = 1`, call `op(tensor1_val, tensor2_val, ...)`;
  183. // if `step > 1`, call `op(n, tensor1_val1, tensor1_val2, ..., tesor1_valstep,
  184. // tensor2_val1, tensor2_val2, ..., tesor2_valstep,
  185. // ...
  186. // tensorN_val1, tensorN_val2, ..., tesorN_valstep);`
  187. //
  188. // See NOTE [ CUDA_tensor_applyN helpers ] above for how Op may look like.
  189. template <typename Op,
  190. typename scalar,
  191. typename IndexType,
  192. int ADims,
  193. int remaining_steps,
  194. typename... Offsets>
  195. struct ApplyOp1 {
  196. __device__ __forceinline__
  197. static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
  198. IndexType linearIndex, Offsets... aOffsets) {
  199. // Convert `linearIndex` into an offset of `a`
  200. const IndexType aOffset = sizeof...(Offsets) < n ?
  201. detail::IndexToOffset<scalar, IndexType, ADims>::get(linearIndex, a) : 0;
  202. ApplyOp1<Op, scalar, IndexType, ADims, remaining_steps - 1, const IndexType, Offsets...>::apply(
  203. a, op, n, linearIndex + 1, aOffsets..., aOffset
  204. );
  205. }
  206. };
  207. // Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`).
  208. // We don't need to pass in how many elements need to processed in this case.
  209. template <typename Op,
  210. typename scalar,
  211. typename IndexType,
  212. int ADims,
  213. typename Offset>
  214. struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offset> {
  215. __device__ __forceinline__
  216. static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op,
  217. int n, IndexType linearIndex, Offset offset) {
  218. op(a.data[offset]);
  219. }
  220. };
  221. template <typename Op,
  222. typename scalar,
  223. typename IndexType,
  224. int ADims,
  225. typename... Offsets>
  226. struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offsets...> {
  227. __device__ __forceinline__
  228. static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
  229. IndexType linearIndex, Offsets... offsets) {
  230. op(n, a.data[offsets]...);
  231. }
  232. };
  233. template <typename Op,
  234. typename scalar,
  235. typename IndexType,
  236. int ADims,
  237. int step>
  238. C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
  239. __global__ void kernelPointwiseApply1(detail::TensorInfo<scalar, IndexType> a,
  240. IndexType totalElements, const Op op) {
  241. for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
  242. linearIndex < totalElements;
  243. linearIndex += gridDim.x * blockDim.x * step) {
  244. ApplyOp1<Op, scalar, IndexType, ADims, step>::apply(
  245. a, op, ::min(step, static_cast<int>(totalElements - linearIndex)), linearIndex);
  246. }
  247. }
  248. template <typename Op,
  249. typename scalar1,
  250. typename scalar2,
  251. typename IndexType,
  252. int ADims,
  253. int BDims,
  254. int remaining_steps,
  255. typename... Offsets>
  256. struct ApplyOp2 {
  257. __device__ __forceinline__
  258. static void apply(detail::TensorInfo<scalar1, IndexType> &a,
  259. detail::TensorInfo<scalar2, IndexType> &b,
  260. const Op &op, int64_t n, IndexType linearIndex,
  261. Offsets... aOffsets, Offsets... bOffsets) {
  262. // Convert `linearIndex` into an offset of `a`
  263. const IndexType aOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
  264. detail::IndexToOffset<scalar1, IndexType, ADims>::get(linearIndex, a) : 0;
  265. // Convert `linearIndex` into an offset of `b`
  266. const IndexType bOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
  267. detail::IndexToOffset<scalar2, IndexType, BDims>::get(linearIndex, b) : 0;
  268. ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, remaining_steps - 1, const IndexType, Offsets...>::apply(
  269. a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset
  270. );
  271. }
  272. };
  273. // Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`).
  274. // We don't need to pass in how many elements need to processed in this case.
  275. template <typename Op,
  276. typename scalar1,
  277. typename scalar2,
  278. typename IndexType,
  279. int ADims,
  280. int BDims,
  281. typename Offset>
  282. struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offset> {
  283. __device__ __forceinline__
  284. static void apply(detail::TensorInfo<scalar1, IndexType> &a,
  285. detail::TensorInfo<scalar2, IndexType> &b,
  286. const Op &op, int /*n*/, IndexType /*linearIndex*/,
  287. Offset aOffset, Offset bOffset) {
  288. op(a.data[aOffset], b.data[bOffset]);
  289. }
  290. };
  291. template <typename Op,
  292. typename scalar1,
  293. typename scalar2,
  294. typename IndexType,
  295. int ADims,
  296. int BDims,
  297. typename... Offsets>
  298. struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offsets...> {
  299. __device__ __forceinline__
  300. static void apply(detail::TensorInfo<scalar1, IndexType> &a,
  301. detail::TensorInfo<scalar2, IndexType> &b,
  302. const Op &op, int n, IndexType linearIndex,
  303. Offsets... aOffsets, Offsets... bOffsets) {
  304. op(n, a.data[aOffsets]..., b.data[bOffsets]...);
  305. }
  306. };
  307. template <typename Op,
  308. typename scalar1,
  309. typename scalar2,
  310. typename IndexType,
  311. int ADims, int BDims,
  312. int step,
  313. int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
  314. int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
  315. C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm)
  316. __global__ void
  317. kernelPointwiseApply2(detail::TensorInfo<scalar1, IndexType> a,
  318. detail::TensorInfo<scalar2, IndexType> b,
  319. IndexType totalElements,
  320. const Op op) {
  321. for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
  322. linearIndex < totalElements;
  323. linearIndex += gridDim.x * blockDim.x * step) {
  324. ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, step>::apply(
  325. a, b, op, ::min(step, static_cast<int>(totalElements - linearIndex)),
  326. linearIndex);
  327. }
  328. }
  329. } // anonymous namespace
  330. template <typename scalar1, typename scalar2, int step, typename Op,
  331. int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
  332. int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
  333. inline bool CUDA_tensor_apply2(at::TensorBase a,
  334. at::TensorBase b,
  335. const Op op,
  336. TensorArgType aType = TensorArgType::ReadWrite,
  337. TensorArgType bType = TensorArgType::ReadOnly) {
  338. TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(),
  339. "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got "
  340. "tensors with type ", a.device().type(), " and ", b.device().type());
  341. int64_t totalElements = a.numel();
  342. if (totalElements != b.numel()) {
  343. return false;
  344. }
  345. if (a.dim() > MAX_TENSORINFO_DIMS ||
  346. b.dim() > MAX_TENSORINFO_DIMS) {
  347. return false;
  348. }
  349. if (a.numel() == 0) {
  350. // Empty tensor; do nothing
  351. return true;
  352. }
  353. const dim3 block = getApplyBlock(max_threads_per_block);
  354. dim3 grid;
  355. auto curDevice = current_device();
  356. if (curDevice == -1) return false;
  357. if (!getApplyGrid<step>(totalElements, grid, curDevice, max_threads_per_block)) {
  358. return false;
  359. }
  360. /*
  361. Expands readable/writable tensors whose indices may be "overlapped."
  362. This ensures that each element of the tensor is operated on once and only
  363. once.
  364. */
  365. TensorBase oldA;
  366. TensorBase oldB;
  367. if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
  368. // Must perform in contiguous space
  369. oldA = std::exchange(a, a.contiguous());
  370. }
  371. if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) {
  372. // Must perform in contiguous space
  373. oldB = std::exchange(b, b.contiguous());
  374. }
  375. // It is possible that the tensor dimensions are able to be collapsed,
  376. // and thus we can reduce the actual code complexity of the copy by
  377. // exploiting this knowledge statically, since the div/mod is the
  378. // most expensive part of the operation, more so than memory accesses.
  379. // For instance, when copying a non-contiguous to a contiguous tensor
  380. // (or vice versa), the contiguous tensor can be collapsed to one
  381. // dimension, and the loop to translate the linear index to the array
  382. // index can be similarly collapsed. That is what this unrolling is for.
  383. #define HANDLE_CASE(TYPE, A, B) \
  384. kernelPointwiseApply2<Op, \
  385. scalar1, \
  386. scalar2, \
  387. TYPE, A, B, step, \
  388. max_threads_per_block, \
  389. min_blocks_per_sm> \
  390. <<<grid, block, 0, at::cuda::getCurrentCUDAStream(curDevice)>>>( \
  391. aInfo, bInfo, static_cast<TYPE>(totalElements), op); \
  392. C10_CUDA_KERNEL_LAUNCH_CHECK();
  393. #define HANDLE_B_CASE(TYPE, A, B) { \
  394. switch (B) { \
  395. case 1: \
  396. HANDLE_CASE(TYPE, A, 1); \
  397. break; \
  398. case 2: \
  399. HANDLE_CASE(TYPE, A, 2); \
  400. break; \
  401. default: \
  402. HANDLE_CASE(TYPE, A, -1); \
  403. break; \
  404. } \
  405. }
  406. #define HANDLE_A_CASE(TYPE, A, B) { \
  407. switch (A) { \
  408. case 1: \
  409. HANDLE_B_CASE(TYPE, 1, B); \
  410. break; \
  411. case 2: \
  412. HANDLE_B_CASE(TYPE, 2, B); \
  413. break; \
  414. default: \
  415. HANDLE_B_CASE(TYPE, -1, B); \
  416. break; \
  417. } \
  418. }
  419. if (detail::canUse32BitIndexMath(a) &&
  420. detail::canUse32BitIndexMath(b)) {
  421. detail::TensorInfo<scalar1, unsigned int> aInfo =
  422. detail::getTensorInfo<scalar1, unsigned int>(a);
  423. detail::TensorInfo<scalar2, unsigned int> bInfo =
  424. detail::getTensorInfo<scalar2, unsigned int>(b);
  425. rearrangeDims(&aInfo, &bInfo);
  426. aInfo.collapseDims();
  427. bInfo.collapseDims();
  428. HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
  429. } else {
  430. detail::TensorInfo<scalar1, uint64_t> aInfo =
  431. detail::getTensorInfo<scalar1, uint64_t>(a);
  432. detail::TensorInfo<scalar2, uint64_t> bInfo =
  433. detail::getTensorInfo<scalar2, uint64_t>(b);
  434. rearrangeDims(&aInfo, &bInfo);
  435. aInfo.collapseDims();
  436. bInfo.collapseDims();
  437. /*
  438. Only instantiates the all 1D special case and the fallback all nD case for
  439. large (64-bit indexed) tensors to reduce compilation time.
  440. */
  441. if (aInfo.dims == 1 && bInfo.dims == 1) {
  442. HANDLE_CASE(uint64_t, 1, 1);
  443. } else {
  444. HANDLE_CASE(uint64_t, -1, -1);
  445. }
  446. }
  447. #undef HANDLE_CASE
  448. #undef HANDLE_B_CASE
  449. #undef HANDLE_A_CASE
  450. if (oldA.defined()) {
  451. at::native::copy_ignoring_overlaps(oldA, a);
  452. }
  453. if (oldB.defined()) {
  454. at::native::copy_ignoring_overlaps(oldB, b);
  455. }
  456. return true;
  457. }
  458. /* Provides default step = 1 to CUDA_tensor_apply2. */
  459. template <typename scalar1, typename scalar2, typename Op,
  460. int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
  461. int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
  462. inline bool CUDA_tensor_apply2(const at::TensorBase &a,
  463. const at::TensorBase &b,
  464. const Op op,
  465. TensorArgType aType = TensorArgType::ReadWrite,
  466. TensorArgType bType = TensorArgType::ReadOnly) {
  467. return CUDA_tensor_apply2<scalar1, scalar2, 1, Op,
  468. max_threads_per_block, min_blocks_per_sm>(a, b, op, aType, bType);
  469. }
  470. } // namespace at::cuda
  471. #else
  472. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  473. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)