psimd.h 45 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #ifndef PSIMD_H
  4. #define PSIMD_H
  5. #if defined(__CUDA_ARCH__)
  6. /* CUDA compiler */
  7. #define PSIMD_INTRINSIC __forceinline__ __device__
  8. #elif defined(__OPENCL_VERSION__)
  9. /* OpenCL compiler */
  10. #define PSIMD_INTRINSIC inline static
  11. #elif defined(__INTEL_COMPILER)
  12. /* Intel compiler, even on Windows */
  13. #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
  14. #elif defined(__GNUC__)
  15. /* GCC-compatible compiler (gcc/clang/icc) */
  16. #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
  17. #elif defined(_MSC_VER)
  18. /* MSVC-compatible compiler (cl/icl/clang-cl) */
  19. #define PSIMD_INTRINSIC __forceinline static
  20. #elif defined(__cplusplus)
  21. /* Generic C++ compiler */
  22. #define PSIMD_INTRINSIC inline static
  23. #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
  24. /* Generic C99 compiler */
  25. #define PSIMD_INTRINSIC inline static
  26. #else
  27. /* Generic C compiler */
  28. #define PSIMD_INTRINSIC static
  29. #endif
  30. #if defined(__GNUC__) || defined(__clang__)
  31. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  32. #include <arm_neon.h>
  33. #endif
  34. #if defined(__SSE2__)
  35. #include <emmintrin.h>
  36. #endif
  37. #if defined(__SSE3__)
  38. #include <pmmintrin.h>
  39. #endif
  40. #if defined(__SSSE3__)
  41. #include <tmmintrin.h>
  42. #endif
  43. #if defined(__SSE4_1__)
  44. #include <smmintrin.h>
  45. #endif
  46. #if defined(__SSE4_2__)
  47. #include <nmmintrin.h>
  48. #endif
  49. #if defined(__AVX__)
  50. #include <immintrin.h>
  51. #endif
  52. #elif defined(_MSC_VER)
  53. #include <intrin.h>
  54. #endif
  55. #if defined(__cplusplus)
  56. #define PSIMD_CXX_SYNTAX
  57. #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
  58. #define PSIMD_C11_SYNTAX
  59. #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
  60. #define PSIMD_C99_SYNTAX
  61. #else
  62. #define PSIMD_C89_SYNTAX
  63. #endif
  64. #if defined(__cplusplus) && (__cplusplus >= 201103L)
  65. #include <cstddef>
  66. #include <cstdint>
  67. #elif !defined(__OPENCL_VERSION__)
  68. #include <stddef.h>
  69. #include <stdint.h>
  70. #endif
  71. #if defined(__GNUC__) || defined(__clang__)
  72. #define PSIMD_HAVE_F64 0
  73. #define PSIMD_HAVE_F32 1
  74. #define PSIMD_HAVE_U8 1
  75. #define PSIMD_HAVE_S8 1
  76. #define PSIMD_HAVE_U16 1
  77. #define PSIMD_HAVE_S16 1
  78. #define PSIMD_HAVE_U32 1
  79. #define PSIMD_HAVE_S32 1
  80. #define PSIMD_HAVE_U64 0
  81. #define PSIMD_HAVE_S64 0
  82. typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1)));
  83. typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1)));
  84. typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2)));
  85. typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
  86. typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4)));
  87. typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
  88. typedef float psimd_f32 __attribute__((vector_size(16), aligned(4)));
  89. typedef struct {
  90. psimd_s8 lo;
  91. psimd_s8 hi;
  92. } psimd_s8x2;
  93. typedef struct {
  94. psimd_u8 lo;
  95. psimd_u8 hi;
  96. } psimd_u8x2;
  97. typedef struct {
  98. psimd_s16 lo;
  99. psimd_s16 hi;
  100. } psimd_s16x2;
  101. typedef struct {
  102. psimd_u16 lo;
  103. psimd_u16 hi;
  104. } psimd_u16x2;
  105. typedef struct {
  106. psimd_s32 lo;
  107. psimd_s32 hi;
  108. } psimd_s32x2;
  109. typedef struct {
  110. psimd_u32 lo;
  111. psimd_u32 hi;
  112. } psimd_u32x2;
  113. typedef struct {
  114. psimd_f32 lo;
  115. psimd_f32 hi;
  116. } psimd_f32x2;
  117. /* Bit casts */
  118. PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
  119. return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
  120. }
  121. PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
  122. return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
  123. }
  124. PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
  125. return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
  126. }
  127. PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
  128. return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
  129. }
  130. PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
  131. return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
  132. }
  133. PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
  134. return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
  135. }
  136. /* Swap */
  137. PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
  138. const psimd_s8 new_a = *b;
  139. const psimd_s8 new_b = *a;
  140. *a = new_a;
  141. *b = new_b;
  142. }
  143. PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
  144. const psimd_u8 new_a = *b;
  145. const psimd_u8 new_b = *a;
  146. *a = new_a;
  147. *b = new_b;
  148. }
  149. PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
  150. const psimd_s16 new_a = *b;
  151. const psimd_s16 new_b = *a;
  152. *a = new_a;
  153. *b = new_b;
  154. }
  155. PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
  156. const psimd_u16 new_a = *b;
  157. const psimd_u16 new_b = *a;
  158. *a = new_a;
  159. *b = new_b;
  160. }
  161. PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
  162. const psimd_s32 new_a = *b;
  163. const psimd_s32 new_b = *a;
  164. *a = new_a;
  165. *b = new_b;
  166. }
  167. PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
  168. const psimd_u32 new_a = *b;
  169. const psimd_u32 new_b = *a;
  170. *a = new_a;
  171. *b = new_b;
  172. }
  173. PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
  174. const psimd_f32 new_a = *b;
  175. const psimd_f32 new_b = *a;
  176. *a = new_a;
  177. *b = new_b;
  178. }
  179. /* Zero-initialization */
  180. PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
  181. return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  182. }
  183. PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
  184. return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  185. }
  186. PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
  187. return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
  188. }
  189. PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
  190. return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
  191. }
  192. PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
  193. return (psimd_s32) { 0, 0, 0, 0 };
  194. }
  195. PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
  196. return (psimd_u32) { 0, 0, 0, 0 };
  197. }
  198. PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
  199. return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
  200. }
  201. /* Initialization to the same constant */
  202. PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
  203. return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
  204. }
  205. PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
  206. return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
  207. }
  208. PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
  209. return (psimd_s16) { c, c, c, c, c, c, c, c };
  210. }
  211. PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
  212. return (psimd_u16) { c, c, c, c, c, c, c, c };
  213. }
  214. PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
  215. return (psimd_s32) { c, c, c, c };
  216. }
  217. PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
  218. return (psimd_u32) { c, c, c, c };
  219. }
  220. PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
  221. return (psimd_f32) { c, c, c, c };
  222. }
  223. /* Load vector */
  224. PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
  225. return *((const psimd_s8*) address);
  226. }
  227. PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
  228. return *((const psimd_u8*) address);
  229. }
  230. PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
  231. return *((const psimd_s16*) address);
  232. }
  233. PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
  234. return *((const psimd_u16*) address);
  235. }
  236. PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
  237. return *((const psimd_s32*) address);
  238. }
  239. PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
  240. return *((const psimd_u32*) address);
  241. }
  242. PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
  243. return *((const psimd_f32*) address);
  244. }
  245. PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
  246. return psimd_splat_s8(*((const int8_t*) address));
  247. }
  248. PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
  249. return psimd_splat_u8(*((const uint8_t*) address));
  250. }
  251. PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
  252. return psimd_splat_s16(*((const int16_t*) address));
  253. }
  254. PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) {
  255. return psimd_splat_u16(*((const uint16_t*) address));
  256. }
  257. PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) {
  258. return psimd_splat_s32(*((const int32_t*) address));
  259. }
  260. PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) {
  261. return psimd_splat_u32(*((const uint32_t*) address));
  262. }
  263. PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) {
  264. return psimd_splat_f32(*((const float*) address));
  265. }
  266. PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) {
  267. return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 };
  268. }
  269. PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) {
  270. return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 };
  271. }
  272. PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
  273. return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
  274. }
  275. PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) {
  276. const int32_t* address_s32 = (const int32_t*) address;
  277. return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 };
  278. }
  279. PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) {
  280. const uint32_t* address_u32 = (const uint32_t*) address;
  281. return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 };
  282. }
  283. PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
  284. const float* address_f32 = (const float*) address;
  285. return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
  286. }
  287. PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) {
  288. const int32_t* address_s32 = (const int32_t*) address;
  289. return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 };
  290. }
  291. PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) {
  292. const uint32_t* address_u32 = (const uint32_t*) address;
  293. return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 };
  294. }
  295. PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
  296. const float* address_f32 = (const float*) address;
  297. return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
  298. }
  299. PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) {
  300. return psimd_load_s32(address);
  301. }
  302. PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) {
  303. return psimd_load_u32(address);
  304. }
  305. PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
  306. return psimd_load_f32(address);
  307. }
  308. PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
  309. const psimd_f32 v0x1x = psimd_load_f32(address);
  310. const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
  311. #if defined(__clang__)
  312. return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
  313. #else
  314. return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
  315. #endif
  316. }
  317. PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
  318. return psimd_load_f32(address);
  319. }
  320. PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
  321. const float* address_f32 = (const float*) address;
  322. return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
  323. }
  324. PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
  325. const psimd_f32 v0x1x = psimd_load_f32(address);
  326. const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
  327. #if defined(__clang__)
  328. return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
  329. #else
  330. return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
  331. #endif
  332. }
  333. PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
  334. return psimd_load_stride2_f32(address);
  335. }
  336. PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
  337. const float* address0_f32 = (const float*) address;
  338. const float* address1_f32 = address0_f32 + stride;
  339. const float* address2_f32 = address1_f32 + stride;
  340. const float* address3_f32 = address2_f32 + stride;
  341. return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
  342. }
  343. PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
  344. return psimd_load1_f32(address);
  345. }
  346. PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
  347. const float* address_f32 = (const float*) address;
  348. return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
  349. }
  350. PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
  351. const float* address0_f32 = (const float*) address;
  352. const float* address1_f32 = address0_f32 + stride;
  353. const float* address2_f32 = address1_f32 + stride;
  354. return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
  355. }
  356. PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
  357. return psimd_load_stride_f32(address, stride);
  358. }
  359. /* Store vector */
  360. PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
  361. *((psimd_s8*) address) = value;
  362. }
  363. PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
  364. *((psimd_u8*) address) = value;
  365. }
  366. PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
  367. *((psimd_s16*) address) = value;
  368. }
  369. PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
  370. *((psimd_u16*) address) = value;
  371. }
  372. PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
  373. *((psimd_s32*) address) = value;
  374. }
  375. PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
  376. *((psimd_u32*) address) = value;
  377. }
  378. PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
  379. *((psimd_f32*) address) = value;
  380. }
  381. PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) {
  382. *((int32_t*) address) = value[0];
  383. }
  384. PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) {
  385. *((uint32_t*) address) = value[0];
  386. }
  387. PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
  388. *((float*) address) = value[0];
  389. }
  390. PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
  391. int32_t* address_s32 = (int32_t*) address;
  392. address_s32[0] = value[0];
  393. address_s32[1] = value[1];
  394. }
  395. PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
  396. uint32_t* address_u32 = (uint32_t*) address;
  397. address_u32[0] = value[0];
  398. address_u32[1] = value[1];
  399. }
  400. PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
  401. float* address_f32 = (float*) address;
  402. address_f32[0] = value[0];
  403. address_f32[1] = value[1];
  404. }
  405. PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
  406. int32_t* address_s32 = (int32_t*) address;
  407. address_s32[0] = value[0];
  408. address_s32[1] = value[1];
  409. address_s32[2] = value[2];
  410. }
  411. PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
  412. uint32_t* address_u32 = (uint32_t*) address;
  413. address_u32[0] = value[0];
  414. address_u32[1] = value[1];
  415. address_u32[2] = value[2];
  416. }
  417. PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
  418. float* address_f32 = (float*) address;
  419. address_f32[0] = value[0];
  420. address_f32[1] = value[1];
  421. address_f32[2] = value[2];
  422. }
  423. PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
  424. psimd_store_s32(address, value);
  425. }
  426. PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
  427. psimd_store_u32(address, value);
  428. }
  429. PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
  430. psimd_store_f32(address, value);
  431. }
  432. PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
  433. float* address0_f32 = (float*) address;
  434. float* address1_f32 = address0_f32 + stride;
  435. float* address2_f32 = address1_f32 + stride;
  436. float* address3_f32 = address2_f32 + stride;
  437. *address0_f32 = value[0];
  438. *address1_f32 = value[1];
  439. *address2_f32 = value[2];
  440. *address3_f32 = value[3];
  441. }
  442. PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
  443. psimd_store1_f32(address, value);
  444. }
  445. PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
  446. float* address_f32 = (float*) address;
  447. address_f32[0] = value[0];
  448. address_f32[stride] = value[1];
  449. }
  450. PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
  451. float* address0_f32 = (float*) address;
  452. float* address1_f32 = address0_f32 + stride;
  453. float* address2_f32 = address1_f32 + stride;
  454. *address0_f32 = value[0];
  455. *address1_f32 = value[1];
  456. *address2_f32 = value[2];
  457. }
  458. /* Vector addition */
  459. PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
  460. return a + b;
  461. }
  462. PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
  463. return a + b;
  464. }
  465. PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
  466. return a + b;
  467. }
  468. PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
  469. return a + b;
  470. }
  471. PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
  472. return a + b;
  473. }
  474. PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
  475. return a + b;
  476. }
  477. PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
  478. #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
  479. return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
  480. #else
  481. return a + b;
  482. #endif
  483. }
  484. /* Vector subtraction */
  485. PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
  486. return a - b;
  487. }
  488. PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
  489. return a - b;
  490. }
  491. PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
  492. return a - b;
  493. }
  494. PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
  495. return a - b;
  496. }
  497. PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
  498. return a - b;
  499. }
  500. PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
  501. return a - b;
  502. }
  503. PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
  504. #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
  505. return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
  506. #else
  507. return a - b;
  508. #endif
  509. }
  510. /* Vector multiplication */
  511. PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
  512. return a * b;
  513. }
  514. PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
  515. return a * b;
  516. }
  517. PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
  518. return a * b;
  519. }
  520. PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
  521. return a * b;
  522. }
  523. PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
  524. return a * b;
  525. }
  526. PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
  527. return a * b;
  528. }
  529. PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
  530. #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
  531. return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
  532. #else
  533. return a * b;
  534. #endif
  535. }
  536. /* Quasi-Fused Multiply-Add */
  537. PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
  538. #if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
  539. return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
  540. #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
  541. return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a);
  542. #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
  543. return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a);
  544. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA
  545. return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
  546. #else
  547. return a + b * c;
  548. #endif
  549. }
  550. PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
  551. return a / b;
  552. }
  553. /* Vector and */
  554. PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
  555. return (psimd_f32) (mask & (psimd_s32) v);
  556. }
  557. /* Vector and-not */
  558. PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
  559. return (psimd_f32) (~mask & (psimd_s32) v);
  560. }
  561. /* Vector blend */
  562. PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
  563. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  564. return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
  565. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  566. return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
  567. #else
  568. return (mask & a) | (~mask & b);
  569. #endif
  570. }
  571. PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
  572. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  573. return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
  574. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  575. return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
  576. #else
  577. return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
  578. #endif
  579. }
  580. PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
  581. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  582. return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
  583. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  584. return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
  585. #else
  586. return (mask & a) | (~mask & b);
  587. #endif
  588. }
  589. PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
  590. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  591. return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
  592. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  593. return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
  594. #else
  595. return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
  596. #endif
  597. }
  598. PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
  599. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  600. return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
  601. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  602. return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
  603. #else
  604. return (mask & a) | (~mask & b);
  605. #endif
  606. }
  607. PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
  608. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  609. return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
  610. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  611. return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
  612. #else
  613. return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
  614. #endif
  615. }
  616. PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
  617. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  618. return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
  619. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  620. return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
  621. #else
  622. return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
  623. #endif
  624. }
  625. /* Vector blend on sign */
  626. PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
  627. return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
  628. }
  629. PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
  630. return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
  631. }
  632. PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
  633. return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
  634. }
  635. PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
  636. return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
  637. }
  638. PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
  639. return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
  640. }
  641. PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
  642. return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
  643. }
  644. PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
  645. const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
  646. return psimd_blend_f32(mask, a, b);
  647. }
  648. /* Vector absolute value */
  649. PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
  650. const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
  651. return (psimd_f32) ((psimd_s32) v & ~mask);
  652. }
  653. /* Vector negation */
  654. PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
  655. const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
  656. return (psimd_f32) ((psimd_s32) v ^ mask);
  657. }
  658. /* Vector maximum */
  659. PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
  660. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  661. return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
  662. #else
  663. return psimd_blend_s8(a > b, a, b);
  664. #endif
  665. }
  666. PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
  667. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  668. return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
  669. #else
  670. return psimd_blend_u8(a > b, a, b);
  671. #endif
  672. }
  673. PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
  674. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  675. return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
  676. #else
  677. return psimd_blend_s16(a > b, a, b);
  678. #endif
  679. }
  680. PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
  681. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  682. return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
  683. #else
  684. return psimd_blend_u16(a > b, a, b);
  685. #endif
  686. }
  687. PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
  688. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  689. return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
  690. #else
  691. return psimd_blend_s32(a > b, a, b);
  692. #endif
  693. }
  694. PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
  695. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  696. return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
  697. #else
  698. return psimd_blend_u32(a > b, a, b);
  699. #endif
  700. }
  701. PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
  702. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  703. return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
  704. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  705. return __builtin_wasm_max_f32x4(a, b);
  706. #else
  707. return psimd_blend_f32(a > b, a, b);
  708. #endif
  709. }
  710. /* Vector minimum */
  711. PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
  712. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  713. return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
  714. #else
  715. return psimd_blend_s8(a < b, a, b);
  716. #endif
  717. }
  718. PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
  719. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  720. return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
  721. #else
  722. return psimd_blend_u8(a < b, a, b);
  723. #endif
  724. }
  725. PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
  726. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  727. return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
  728. #else
  729. return psimd_blend_s16(a < b, a, b);
  730. #endif
  731. }
  732. PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
  733. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  734. return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
  735. #else
  736. return psimd_blend_u16(a < b, a, b);
  737. #endif
  738. }
  739. PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
  740. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  741. return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
  742. #else
  743. return psimd_blend_s32(a < b, a, b);
  744. #endif
  745. }
  746. PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
  747. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  748. return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
  749. #else
  750. return psimd_blend_u32(a < b, a, b);
  751. #endif
  752. }
  753. PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
  754. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  755. return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
  756. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  757. return __builtin_wasm_min_f32x4(a, b);
  758. #else
  759. return psimd_blend_f32(a < b, a, b);
  760. #endif
  761. }
  762. PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
  763. #if defined(__clang__)
  764. return __builtin_convertvector(v, psimd_f32);
  765. #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
  766. return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
  767. #elif defined(__SSE2__)
  768. return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
  769. #else
  770. return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
  771. #endif
  772. }
  773. /* Broadcast vector element */
  774. #if defined(__clang__)
  775. PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
  776. return __builtin_shufflevector(v, v, 0, 0, 0, 0);
  777. }
  778. PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
  779. return __builtin_shufflevector(v, v, 1, 1, 1, 1);
  780. }
  781. PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
  782. return __builtin_shufflevector(v, v, 2, 2, 2, 2);
  783. }
  784. PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
  785. return __builtin_shufflevector(v, v, 3, 3, 3, 3);
  786. }
  787. #else
  788. PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
  789. return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
  790. }
  791. PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
  792. return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
  793. }
  794. PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
  795. return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
  796. }
  797. PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
  798. return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
  799. }
  800. #endif
  801. /* Reversal of vector elements */
  802. #if defined(__clang__)
  803. PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
  804. return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  805. }
  806. PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
  807. return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  808. }
  809. PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
  810. return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
  811. }
  812. PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
  813. return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
  814. }
  815. PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
  816. return __builtin_shufflevector(v, v, 3, 2, 1, 0);
  817. }
  818. PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
  819. return __builtin_shufflevector(v, v, 3, 2, 1, 0);
  820. }
  821. PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
  822. return __builtin_shufflevector(v, v, 3, 2, 1, 0);
  823. }
  824. #else
  825. PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
  826. return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
  827. }
  828. PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
  829. return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
  830. }
  831. PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
  832. return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
  833. }
  834. PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
  835. return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
  836. }
  837. PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
  838. return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
  839. }
  840. PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
  841. return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
  842. }
  843. PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
  844. return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
  845. }
  846. #endif
  847. /* Interleaving of vector elements */
  848. #if defined(__clang__)
  849. PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
  850. return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
  851. }
  852. PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
  853. return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
  854. }
  855. PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
  856. return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
  857. }
  858. PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
  859. return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
  860. }
  861. PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
  862. return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
  863. }
  864. PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
  865. return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
  866. }
  867. PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
  868. return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
  869. }
  870. PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
  871. return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
  872. }
  873. PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
  874. return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
  875. }
  876. PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
  877. return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
  878. }
  879. #else
  880. PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
  881. return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
  882. }
  883. PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
  884. return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
  885. }
  886. PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
  887. return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
  888. }
  889. PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
  890. return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
  891. }
  892. PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
  893. return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
  894. }
  895. PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
  896. return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
  897. }
  898. PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
  899. return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
  900. }
  901. PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
  902. return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
  903. }
  904. PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
  905. return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
  906. }
  907. PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
  908. return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
  909. }
  910. #endif
  911. /* Concatenation of low/high vector elements */
  912. #if defined(__clang__)
  913. PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
  914. return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
  915. }
  916. PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
  917. return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
  918. }
  919. PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
  920. return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
  921. }
  922. PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
  923. return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
  924. }
  925. PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
  926. return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
  927. }
  928. PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
  929. return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
  930. }
  931. PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
  932. return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
  933. }
  934. PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
  935. return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
  936. }
  937. PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
  938. return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
  939. }
  940. PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
  941. return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
  942. }
  943. #else
  944. PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
  945. return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
  946. }
  947. PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
  948. return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
  949. }
  950. PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
  951. return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
  952. }
  953. PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
  954. return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
  955. }
  956. PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
  957. return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
  958. }
  959. PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
  960. return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
  961. }
  962. PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
  963. return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
  964. }
  965. PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
  966. return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
  967. }
  968. PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
  969. return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
  970. }
  971. PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
  972. return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
  973. }
  974. #endif
  975. /* Concatenation of even/odd vector elements */
  976. #if defined(__clang__)
  977. PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
  978. return __builtin_shufflevector(a, b,
  979. 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
  980. }
  981. PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
  982. return __builtin_shufflevector(a, b,
  983. 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
  984. }
  985. PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
  986. return __builtin_shufflevector(a, b,
  987. 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
  988. }
  989. PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
  990. return __builtin_shufflevector(a, b,
  991. 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
  992. }
  993. PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
  994. return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
  995. }
  996. PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
  997. return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
  998. }
  999. PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
  1000. return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
  1001. }
  1002. PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
  1003. return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
  1004. }
  1005. PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
  1006. return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
  1007. }
  1008. PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
  1009. return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
  1010. }
  1011. PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
  1012. return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
  1013. }
  1014. PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
  1015. return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
  1016. }
  1017. PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
  1018. return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
  1019. }
  1020. PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
  1021. return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
  1022. }
  1023. #else
  1024. PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
  1025. return __builtin_shuffle(a, b,
  1026. (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
  1027. }
  1028. PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
  1029. return __builtin_shuffle(a, b,
  1030. (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
  1031. }
  1032. PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
  1033. return __builtin_shuffle(a, b,
  1034. (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
  1035. }
  1036. PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
  1037. return __builtin_shuffle(a, b,
  1038. (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
  1039. }
  1040. PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
  1041. return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
  1042. }
  1043. PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
  1044. return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
  1045. }
  1046. PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
  1047. return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
  1048. }
  1049. PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
  1050. return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
  1051. }
  1052. PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
  1053. return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
  1054. }
  1055. PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
  1056. return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
  1057. }
  1058. PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
  1059. return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
  1060. }
  1061. PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
  1062. return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
  1063. }
  1064. PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
  1065. return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
  1066. }
  1067. PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
  1068. return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
  1069. }
  1070. #endif
  1071. /* Vector reduce */
  1072. #if defined(__clang__)
  1073. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
  1074. const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
  1075. return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
  1076. }
  1077. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
  1078. const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
  1079. return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
  1080. }
  1081. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
  1082. const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
  1083. return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
  1084. }
  1085. PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
  1086. const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
  1087. const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
  1088. return result[0];
  1089. }
  1090. PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
  1091. const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
  1092. const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
  1093. return result[0];
  1094. }
  1095. PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
  1096. const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
  1097. const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
  1098. return result[0];
  1099. }
  1100. #else
  1101. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
  1102. const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
  1103. return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
  1104. }
  1105. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
  1106. const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
  1107. return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
  1108. }
  1109. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
  1110. const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
  1111. return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
  1112. }
  1113. PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
  1114. const psimd_f32 result = psimd_allreduce_sum_f32(v);
  1115. return result[0];
  1116. }
  1117. PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
  1118. const psimd_f32 result = psimd_allreduce_max_f32(v);
  1119. return result[0];
  1120. }
  1121. PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
  1122. const psimd_f32 result = psimd_allreduce_min_f32(v);
  1123. return result[0];
  1124. }
  1125. #endif
  1126. #endif
  1127. #endif /* PSIMD_H */
  1128. #else
  1129. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  1130. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)