fxdiv.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #ifndef FXDIV_H
  4. #define FXDIV_H
  5. #if defined(__cplusplus) && (__cplusplus >= 201103L)
  6. #include <cstddef>
  7. #include <cstdint>
  8. #include <climits>
  9. #elif !defined(__OPENCL_VERSION__)
  10. #include <stddef.h>
  11. #include <stdint.h>
  12. #include <limits.h>
  13. #endif
  14. #if defined(_MSC_VER)
  15. #include <intrin.h>
  16. #if defined(_M_IX86) || defined(_M_X64)
  17. #include <immintrin.h>
  18. #endif
  19. #endif
  20. #ifndef FXDIV_USE_INLINE_ASSEMBLY
  21. #define FXDIV_USE_INLINE_ASSEMBLY 0
  22. #endif
  23. static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
  24. #if defined(_MSC_VER) && defined(_M_IX86)
  25. return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
  26. #else
  27. return (uint64_t) a * (uint64_t) b;
  28. #endif
  29. }
  30. static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
  31. #if defined(__OPENCL_VERSION__)
  32. return mul_hi(a, b);
  33. #elif defined(__CUDA_ARCH__)
  34. return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
  35. #elif defined(_MSC_VER) && defined(_M_IX86)
  36. return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
  37. #elif defined(_MSC_VER) && defined(_M_ARM)
  38. return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
  39. #else
  40. return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
  41. #endif
  42. }
  43. static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
  44. #if defined(__OPENCL_VERSION__)
  45. return mul_hi(a, b);
  46. #elif defined(__CUDA_ARCH__)
  47. return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
  48. #elif defined(_MSC_VER) && defined(_M_X64)
  49. return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
  50. #elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
  51. return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
  52. #else
  53. const uint32_t a_lo = (uint32_t) a;
  54. const uint32_t a_hi = (uint32_t) (a >> 32);
  55. const uint32_t b_lo = (uint32_t) b;
  56. const uint32_t b_hi = (uint32_t) (b >> 32);
  57. const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
  58. (uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
  59. return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
  60. ((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
  61. #endif
  62. }
  63. static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
  64. #if SIZE_MAX == UINT32_MAX
  65. return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
  66. #elif SIZE_MAX == UINT64_MAX
  67. return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
  68. #else
  69. #error Unsupported platform
  70. #endif
  71. }
  72. struct fxdiv_divisor_uint32_t {
  73. uint32_t value;
  74. uint32_t m;
  75. uint8_t s1;
  76. uint8_t s2;
  77. };
  78. struct fxdiv_result_uint32_t {
  79. uint32_t quotient;
  80. uint32_t remainder;
  81. };
  82. struct fxdiv_divisor_uint64_t {
  83. uint64_t value;
  84. uint64_t m;
  85. uint8_t s1;
  86. uint8_t s2;
  87. };
  88. struct fxdiv_result_uint64_t {
  89. uint64_t quotient;
  90. uint64_t remainder;
  91. };
  92. struct fxdiv_divisor_size_t {
  93. size_t value;
  94. size_t m;
  95. uint8_t s1;
  96. uint8_t s2;
  97. };
  98. struct fxdiv_result_size_t {
  99. size_t quotient;
  100. size_t remainder;
  101. };
  102. static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
  103. struct fxdiv_divisor_uint32_t result = { d };
  104. if (d == 1) {
  105. result.m = UINT32_C(1);
  106. result.s1 = 0;
  107. result.s2 = 0;
  108. } else {
  109. #if defined(__OPENCL_VERSION__)
  110. const uint32_t l_minus_1 = 31 - clz(d - 1);
  111. #elif defined(__CUDA_ARCH__)
  112. const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
  113. #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
  114. unsigned long l_minus_1;
  115. _BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
  116. #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
  117. uint32_t l_minus_1;
  118. __asm__("BSRL %[d_minus_1], %[l_minus_1]"
  119. : [l_minus_1] "=r" (l_minus_1)
  120. : [d_minus_1] "r" (d - 1)
  121. : "cc");
  122. #elif defined(__GNUC__)
  123. const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
  124. #else
  125. /* Based on Algorithm 2 from Hacker's delight */
  126. uint32_t l_minus_1 = 0;
  127. uint32_t x = d - 1;
  128. uint32_t y = x >> 16;
  129. if (y != 0) {
  130. l_minus_1 += 16;
  131. x = y;
  132. }
  133. y = x >> 8;
  134. if (y != 0) {
  135. l_minus_1 += 8;
  136. x = y;
  137. }
  138. y = x >> 4;
  139. if (y != 0) {
  140. l_minus_1 += 4;
  141. x = y;
  142. }
  143. y = x >> 2;
  144. if (y != 0) {
  145. l_minus_1 += 2;
  146. x = y;
  147. }
  148. if ((x & 2) != 0) {
  149. l_minus_1 += 1;
  150. }
  151. #endif
  152. uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;
  153. /* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
  154. #if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
  155. uint32_t q;
  156. __asm__("DIVL %[d]"
  157. : "=a" (q), "+d" (u_hi)
  158. : [d] "r" (d), "a" (0)
  159. : "cc");
  160. #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64))
  161. unsigned int remainder;
  162. const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
  163. #else
  164. const uint32_t q = ((uint64_t) u_hi << 32) / d;
  165. #endif
  166. result.m = q + UINT32_C(1);
  167. result.s1 = 1;
  168. result.s2 = (uint8_t) l_minus_1;
  169. }
  170. return result;
  171. }
  172. static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
  173. struct fxdiv_divisor_uint64_t result = { d };
  174. if (d == 1) {
  175. result.m = UINT64_C(1);
  176. result.s1 = 0;
  177. result.s2 = 0;
  178. } else {
  179. #if defined(__OPENCL_VERSION__)
  180. const uint32_t nlz_d = clz(d);
  181. const uint32_t l_minus_1 = 63 - clz(d - 1);
  182. #elif defined(__CUDA_ARCH__)
  183. const uint32_t nlz_d = __clzll((long long) d);
  184. const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
  185. #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
  186. unsigned long l_minus_1;
  187. _BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
  188. unsigned long bsr_d;
  189. _BitScanReverse64(&bsr_d, (unsigned __int64) d);
  190. const uint32_t nlz_d = bsr_d ^ 0x3F;
  191. #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
  192. const uint64_t d_minus_1 = d - 1;
  193. const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
  194. unsigned long l_minus_1;
  195. if ((uint32_t) (d_minus_1 >> 32) == 0) {
  196. _BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
  197. } else {
  198. _BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
  199. l_minus_1 += 32;
  200. }
  201. const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
  202. #elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
  203. uint64_t l_minus_1;
  204. __asm__("BSRQ %[d_minus_1], %[l_minus_1]"
  205. : [l_minus_1] "=r" (l_minus_1)
  206. : [d_minus_1] "r" (d - 1)
  207. : "cc");
  208. #elif defined(__GNUC__)
  209. const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
  210. const uint32_t nlz_d = __builtin_clzll(d);
  211. #else
  212. /* Based on Algorithm 2 from Hacker's delight */
  213. const uint64_t d_minus_1 = d - 1;
  214. const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
  215. uint32_t l_minus_1 = 0;
  216. uint32_t x = (uint32_t) d_minus_1;
  217. uint32_t y = d_minus_1 >> 32;
  218. if (y != 0) {
  219. l_minus_1 += 32;
  220. x = y;
  221. }
  222. y = x >> 16;
  223. if (y != 0) {
  224. l_minus_1 += 16;
  225. x = y;
  226. }
  227. y = x >> 8;
  228. if (y != 0) {
  229. l_minus_1 += 8;
  230. x = y;
  231. }
  232. y = x >> 4;
  233. if (y != 0) {
  234. l_minus_1 += 4;
  235. x = y;
  236. }
  237. y = x >> 2;
  238. if (y != 0) {
  239. l_minus_1 += 2;
  240. x = y;
  241. }
  242. if ((x & 2) != 0) {
  243. l_minus_1 += 1;
  244. }
  245. const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
  246. #endif
  247. uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;
  248. /* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
  249. #if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
  250. uint64_t q;
  251. __asm__("DIVQ %[d]"
  252. : "=a" (q), "+d" (u_hi)
  253. : [d] "r" (d), "a" (UINT64_C(0))
  254. : "cc");
  255. #elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
  256. /* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */
  257. const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
  258. #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64)
  259. unsigned __int64 remainder;
  260. const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
  261. #else
  262. /* Implementation based on code from Hacker's delight */
  263. /* Normalize divisor and shift divident left */
  264. d <<= nlz_d;
  265. u_hi <<= nlz_d;
  266. /* Break divisor up into two 32-bit digits */
  267. const uint64_t d_hi = (uint32_t) (d >> 32);
  268. const uint32_t d_lo = (uint32_t) d;
  269. /* Compute the first quotient digit, q1 */
  270. uint64_t q1 = u_hi / d_hi;
  271. uint64_t r1 = u_hi - q1 * d_hi;
  272. while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
  273. q1 -= 1;
  274. r1 += d_hi;
  275. if ((r1 >> 32) != 0) {
  276. break;
  277. }
  278. }
  279. /* Multiply and subtract. */
  280. u_hi = (u_hi << 32) - q1 * d;
  281. /* Compute the second quotient digit, q0 */
  282. uint64_t q0 = u_hi / d_hi;
  283. uint64_t r0 = u_hi - q0 * d_hi;
  284. while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
  285. q0 -= 1;
  286. r0 += d_hi;
  287. if ((r0 >> 32) != 0) {
  288. break;
  289. }
  290. }
  291. const uint64_t q = (q1 << 32) | (uint32_t) q0;
  292. #endif
  293. result.m = q + UINT64_C(1);
  294. result.s1 = 1;
  295. result.s2 = (uint8_t) l_minus_1;
  296. }
  297. return result;
  298. }
  299. static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
  300. #if SIZE_MAX == UINT32_MAX
  301. const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
  302. #elif SIZE_MAX == UINT64_MAX
  303. const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
  304. #else
  305. #error Unsupported platform
  306. #endif
  307. struct fxdiv_divisor_size_t size_result = {
  308. (size_t) uint_result.value,
  309. (size_t) uint_result.m,
  310. uint_result.s1,
  311. uint_result.s2
  312. };
  313. return size_result;
  314. }
  315. static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  316. const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
  317. return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
  318. }
  319. static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  320. const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
  321. return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
  322. }
  323. static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  324. #if SIZE_MAX == UINT32_MAX
  325. const struct fxdiv_divisor_uint32_t uint32_divisor = {
  326. (uint32_t) divisor.value,
  327. (uint32_t) divisor.m,
  328. divisor.s1,
  329. divisor.s2
  330. };
  331. return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
  332. #elif SIZE_MAX == UINT64_MAX
  333. const struct fxdiv_divisor_uint64_t uint64_divisor = {
  334. (uint64_t) divisor.value,
  335. (uint64_t) divisor.m,
  336. divisor.s1,
  337. divisor.s2
  338. };
  339. return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
  340. #else
  341. #error Unsupported platform
  342. #endif
  343. }
  344. static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  345. const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
  346. return n - quotient * divisor.value;
  347. }
  348. static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  349. const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
  350. return n - quotient * divisor.value;
  351. }
  352. static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  353. const size_t quotient = fxdiv_quotient_size_t(n, divisor);
  354. return n - quotient * divisor.value;
  355. }
  356. static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
  357. const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
  358. return quotient * granularity.value;
  359. }
  360. static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
  361. const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
  362. return quotient * granularity.value;
  363. }
  364. static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
  365. const size_t quotient = fxdiv_quotient_size_t(n, granularity);
  366. return quotient * granularity.value;
  367. }
  368. static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  369. const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
  370. const uint32_t remainder = n - quotient * divisor.value;
  371. struct fxdiv_result_uint32_t result = { quotient, remainder };
  372. return result;
  373. }
  374. static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  375. const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
  376. const uint64_t remainder = n - quotient * divisor.value;
  377. struct fxdiv_result_uint64_t result = { quotient, remainder };
  378. return result;
  379. }
  380. static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  381. const size_t quotient = fxdiv_quotient_size_t(n, divisor);
  382. const size_t remainder = n - quotient * divisor.value;
  383. struct fxdiv_result_size_t result = { quotient, remainder };
  384. return result;
  385. }
  386. #endif /* FXDIV_H */
  387. #else
  388. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  389. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)