crc32_power8.c 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  1. /* crc32 for POWER8 using VSX instructions
  2. * Copyright (C) 2021 IBM Corporation
  3. *
  4. * Author: Rogerio Alves <rogealve@br.ibm.com>
  5. *
  6. * For conditions of distribution and use, see copyright notice in zlib.h
  7. *
  8. * Calculate the checksum of data that is 16 byte aligned and a multiple of
  9. * 16 bytes.
  10. *
  11. * The first step is to reduce it to 1024 bits. We do this in 8 parallel
  12. * chunks in order to mask the latency of the vpmsum instructions. If we
  13. * have more than 32 kB of data to checksum we repeat this step multiple
  14. * times, passing in the previous 1024 bits.
  15. *
  16. * The next step is to reduce the 1024 bits to 64 bits. This step adds
  17. * 32 bits of 0s to the end - this matches what a CRC does. We just
  18. * calculate constants that land the data in this 32 bits.
  19. *
  20. * We then use fixed point Barrett reduction to compute a mod n over GF(2)
  21. * for n = CRC using POWER8 instructions. We use x = 32.
  22. *
  23. * http://en.wikipedia.org/wiki/Barrett_reduction
  24. *
  25. * This code uses gcc vector builtins instead of using assembly directly.
  26. */
  27. #include <altivec.h>
  28. #include "zendian.h"
  29. #include "zbuild.h"
  30. #include "crc32_constants.h"
  31. #include "crc32_braid_tbl.h"
  32. #if defined (__clang__)
  33. #include "fallback_builtins.h"
  34. #endif
  /* Upper bound, in bytes, on the block folded per pass of the main loop in __crc32_vpmsum. */
  35. #define MAX_SIZE 32768
  /* Alignment in bytes that crc32_power8 establishes before calling the vector routine. */
  36. #define VMX_ALIGN 16
  /* Mask selecting the low address/length bits that fall below VMX_ALIGN. */
  37. #define VMX_ALIGN_MASK (VMX_ALIGN-1)
  38. static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
  39. while (len--)
  40. crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
  41. return crc;
  42. }
  43. static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
  44. Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
  45. unsigned int prealign;
  46. unsigned int tail;
  47. unsigned long len = (unsigned long) _len;
  48. if (p == (const unsigned char *) 0x0)
  49. return 0;
  50. crc ^= 0xffffffff;
  51. if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
  52. crc = crc32_align(crc, p, len);
  53. goto out;
  54. }
  55. if ((unsigned long)p & VMX_ALIGN_MASK) {
  56. prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
  57. crc = crc32_align(crc, p, prealign);
  58. len -= prealign;
  59. p += prealign;
  60. }
  61. crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
  62. tail = len & VMX_ALIGN_MASK;
  63. if (tail) {
  64. p += len & ~VMX_ALIGN_MASK;
  65. crc = crc32_align(crc, p, tail);
  66. }
  67. out:
  68. crc ^= 0xffffffff;
  69. return crc;
  70. }
  71. /* When we have a load-store in a single-dispatch group and address overlap
  72. * such that forward is not allowed (load-hit-store) the group must be flushed.
  73. * A group ending NOP prevents the flush.
  74. */
  /* The "memory" clobber also stops the compiler from moving memory accesses across it. */
  75. #define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
  /* Loaded input data is byte-permuted only on big-endian builds. */
  76. #if BYTE_ORDER == BIG_ENDIAN
  77. #define BYTESWAP_DATA
  78. #endif
  /*
   * VEC_PERM(vr, va, vb, vc): when BYTESWAP_DATA is defined, assigns
   * vec_perm(va, vb, vc) to vr; otherwise it expands to nothing, so the
   * statement it appears in becomes an empty statement.
   */
  79. #ifdef BYTESWAP_DATA
  80. #define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
  /*
   * NOTE(review): BYTESWAP_DATA is only defined above for BIG_ENDIAN, so the
   * LITTLE_ENDIAN branch below is selected only if BYTESWAP_DATA is defined
   * from outside this file (e.g. on the compiler command line) - confirm.
   */
  81. #if BYTE_ORDER == LITTLE_ENDIAN
  82. /* Byte reverse permute constant LE. */
  83. static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
  84. #else
  /* Permute control vector used by VEC_PERM on big-endian builds. */
  85. static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
  86. #endif
  87. #else
  88. #define VEC_PERM(vr, va, vb, vc)
  89. #endif
  90. static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
  /*
   * Vector CRC kernel: folds `len` bytes at `p` into `crc` with the
   * vpmsumw/vpmsumd builtins and returns the reduced 32-bit value.
   *
   * crc - starting CRC value; it is xor-ed into the first 16 bytes of data.
   *       No inversion is applied here (crc32_power8 does that).
   * p   - input data, read with vec_ld in 16-byte steps. The caller in this
   *       file (crc32_power8) passes a 16-byte-aligned pointer.
   * len - byte count; the caller in this file passes a multiple of 16.
   *
   * Uses the constant tables vcrc_const, vcrc_short_const and
   * v_Barrett_const, which are declared outside this function.
   * Inputs under 256 bytes take the "short" path; larger inputs are folded
   * in blocks of at most MAX_SIZE bytes, eight 16-byte lanes at a time.
   */
  91. const __vector unsigned long long vzero = {0,0};
  92. const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
  93. const __vector unsigned long long vmask_32bit =
  94. (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
  95. const __vector unsigned long long vmask_64bit =
  96. (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
  97. __vector unsigned long long vcrc;
  98. __vector unsigned long long vconst1, vconst2;
  99. /* vdata0-vdata7 will contain our data (p). */
  100. __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
  101. /* v0-v7 will contain our checksums */
  102. __vector unsigned long long v0 = {0,0};
  103. __vector unsigned long long v1 = {0,0};
  104. __vector unsigned long long v2 = {0,0};
  105. __vector unsigned long long v3 = {0,0};
  106. __vector unsigned long long v4 = {0,0};
  107. __vector unsigned long long v5 = {0,0};
  108. __vector unsigned long long v6 = {0,0};
  109. __vector unsigned long long v7 = {0,0};
  110. /* Vector auxiliary variables. */
  111. __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
  112. unsigned int offset; /* Constant table offset. */
  113. unsigned long i; /* Counter. */
  114. unsigned long chunks;
  115. unsigned long block_size;
  116. int next_block = 0;
  117. /* Round down to a multiple of 128 bytes. The remaining tail is processed at the end. */
  118. unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
  119. vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
  120. /* Short version. */
  /* Fewer than 256 bytes: fold 16 bytes at a time into v0 using vcrc_short_const only. */
  121. if (len < 256) {
  122. /* Calculate where in the constant table we need to start. */
  123. offset = 256 - len;
  124. vconst1 = vec_ld(offset, vcrc_short_const);
  125. vdata0 = vec_ld(0, (__vector unsigned long long*) p);
  126. VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
  127. /* xor initial value */
  128. vdata0 = vec_xor(vdata0, vcrc);
  129. vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
  130. (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
  131. v0 = vec_xor(v0, vdata0);
  /* Remaining 16-byte pieces; each uses the constant at the matching table offset. */
  132. for (i = 16; i < len; i += 16) {
  133. vconst1 = vec_ld(offset + i, vcrc_short_const);
  134. vdata0 = vec_ld(i, (__vector unsigned long long*) p);
  135. VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
  136. vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
  137. (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
  138. v0 = vec_xor(v0, vdata0);
  139. }
  140. } else {
  141. /* Load initial values. */
  142. vdata0 = vec_ld(0, (__vector unsigned long long*) p);
  143. vdata1 = vec_ld(16, (__vector unsigned long long*) p);
  144. VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
  145. VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
  146. vdata2 = vec_ld(32, (__vector unsigned long long*) p);
  147. vdata3 = vec_ld(48, (__vector unsigned long long*) p);
  148. VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
  149. VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
  150. vdata4 = vec_ld(64, (__vector unsigned long long*) p);
  151. vdata5 = vec_ld(80, (__vector unsigned long long*) p);
  152. VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
  153. VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
  154. vdata6 = vec_ld(96, (__vector unsigned long long*) p);
  155. vdata7 = vec_ld(112, (__vector unsigned long long*) p);
  156. VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
  157. VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
  158. /* xor in initial value */
  159. vdata0 = vec_xor(vdata0, vcrc);
  /* Step past the 128 bytes just loaded into vdata0-vdata7. */
  160. p = (char *)p + 128;
  161. do {
  162. /* Checksum in blocks of MAX_SIZE. */
  163. block_size = length;
  164. if (block_size > MAX_SIZE) {
  165. block_size = MAX_SIZE;
  166. }
  167. length = length - block_size;
  168. /*
  169. * Work out the offset into the constants table to start at. Each
  170. * constant is 16 bytes, and it is used against 128 bytes of input
  171. * data - 128 / 16 = 8
  172. */
  173. offset = (MAX_SIZE/8) - (block_size/8);
  174. /* We reduce our final 128 bytes in a separate step */
  175. chunks = (block_size/128)-1;
  176. vconst1 = vec_ld(offset, vcrc_const);
  /* Warm up: first multiply of the eight lanes already held in vdata0-vdata7. */
  177. va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
  178. (__vector unsigned long long)vconst1);
  179. va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
  180. (__vector unsigned long long)vconst1);
  181. va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
  182. (__vector unsigned long long)vconst1);
  183. va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
  184. (__vector unsigned long long)vconst1);
  185. va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
  186. (__vector unsigned long long)vconst1);
  187. va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
  188. (__vector unsigned long long)vconst1);
  189. va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
  190. (__vector unsigned long long)vconst1);
  191. va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
  192. (__vector unsigned long long)vconst1);
  /* chunks is at least 2 inside this branch, so chunks-2 in the loop bound below cannot wrap. */
  193. if (chunks > 1) {
  194. offset += 16;
  195. vconst2 = vec_ld(offset, vcrc_const);
  196. GROUP_ENDING_NOP;
  197. vdata0 = vec_ld(0, (__vector unsigned long long*) p);
  198. VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
  199. vdata1 = vec_ld(16, (__vector unsigned long long*) p);
  200. VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
  201. vdata2 = vec_ld(32, (__vector unsigned long long*) p);
  202. VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
  203. vdata3 = vec_ld(48, (__vector unsigned long long*) p);
  204. VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
  205. vdata4 = vec_ld(64, (__vector unsigned long long*) p);
  206. VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
  207. vdata5 = vec_ld(80, (__vector unsigned long long*) p);
  208. VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
  209. vdata6 = vec_ld(96, (__vector unsigned long long*) p);
  210. VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
  211. vdata7 = vec_ld(112, (__vector unsigned long long*) p);
  212. VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
  213. p = (char *)p + 128;
  214. /*
  215. * main loop. Each iteration calculates the CRC for a 128-byte
  216. * block.
  217. */
  /*
   * Per lane: accumulate the previous product into vN, start the next
   * multiply on the data loaded last iteration, then load the following
   * 16 bytes. Lanes 0-3 multiply by vconst2 and lanes 4-7 by vconst1.
   */
  218. for (i = 0; i < chunks-2; i++) {
  219. vconst1 = vec_ld(offset, vcrc_const);
  220. offset += 16;
  221. GROUP_ENDING_NOP;
  222. v0 = vec_xor(v0, va0);
  223. va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
  224. (__vector unsigned long long)vconst2);
  225. vdata0 = vec_ld(0, (__vector unsigned long long*) p);
  226. VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
  227. GROUP_ENDING_NOP;
  228. v1 = vec_xor(v1, va1);
  229. va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
  230. (__vector unsigned long long)vconst2);
  231. vdata1 = vec_ld(16, (__vector unsigned long long*) p);
  232. VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
  233. GROUP_ENDING_NOP;
  234. v2 = vec_xor(v2, va2);
  235. va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
  236. vdata2, (__vector unsigned long long)vconst2);
  237. vdata2 = vec_ld(32, (__vector unsigned long long*) p);
  238. VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
  239. GROUP_ENDING_NOP;
  240. v3 = vec_xor(v3, va3);
  241. va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
  242. (__vector unsigned long long)vconst2);
  243. vdata3 = vec_ld(48, (__vector unsigned long long*) p);
  244. VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
  /* Reload vconst2 from the advanced offset; lanes 0-3 use it in the next iteration. */
  245. vconst2 = vec_ld(offset, vcrc_const);
  246. GROUP_ENDING_NOP;
  247. v4 = vec_xor(v4, va4);
  248. va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
  249. (__vector unsigned long long)vconst1);
  250. vdata4 = vec_ld(64, (__vector unsigned long long*) p);
  251. VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
  252. GROUP_ENDING_NOP;
  253. v5 = vec_xor(v5, va5);
  254. va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
  255. (__vector unsigned long long)vconst1);
  256. vdata5 = vec_ld(80, (__vector unsigned long long*) p);
  257. VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
  258. GROUP_ENDING_NOP;
  259. v6 = vec_xor(v6, va6);
  260. va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
  261. (__vector unsigned long long)vconst1);
  262. vdata6 = vec_ld(96, (__vector unsigned long long*) p);
  263. VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
  264. GROUP_ENDING_NOP;
  265. v7 = vec_xor(v7, va7);
  266. va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
  267. (__vector unsigned long long)vconst1);
  268. vdata7 = vec_ld(112, (__vector unsigned long long*) p);
  269. VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
  270. p = (char *)p + 128;
  271. }
  272. /* First cool down */
  /* Accumulate the pending products and multiply the last loaded data; no new loads here. */
  273. vconst1 = vec_ld(offset, vcrc_const);
  274. offset += 16;
  275. v0 = vec_xor(v0, va0);
  276. va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
  277. (__vector unsigned long long)vconst1);
  278. GROUP_ENDING_NOP;
  279. v1 = vec_xor(v1, va1);
  280. va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
  281. (__vector unsigned long long)vconst1);
  282. GROUP_ENDING_NOP;
  283. v2 = vec_xor(v2, va2);
  284. va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
  285. (__vector unsigned long long)vconst1);
  286. GROUP_ENDING_NOP;
  287. v3 = vec_xor(v3, va3);
  288. va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
  289. (__vector unsigned long long)vconst1);
  290. GROUP_ENDING_NOP;
  291. v4 = vec_xor(v4, va4);
  292. va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
  293. (__vector unsigned long long)vconst1);
  294. GROUP_ENDING_NOP;
  295. v5 = vec_xor(v5, va5);
  296. va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
  297. (__vector unsigned long long)vconst1);
  298. GROUP_ENDING_NOP;
  299. v6 = vec_xor(v6, va6);
  300. va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
  301. (__vector unsigned long long)vconst1);
  302. GROUP_ENDING_NOP;
  303. v7 = vec_xor(v7, va7);
  304. va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
  305. (__vector unsigned long long)vconst1);
  306. }/* else */
  307. /* Second cool down. */
  308. v0 = vec_xor(v0, va0);
  309. v1 = vec_xor(v1, va1);
  310. v2 = vec_xor(v2, va2);
  311. v3 = vec_xor(v3, va3);
  312. v4 = vec_xor(v4, va4);
  313. v5 = vec_xor(v5, va5);
  314. v6 = vec_xor(v6, va6);
  315. v7 = vec_xor(v7, va7);
  316. /*
  317. * vpmsumd produces a 96 bit result in the least significant bits
  318. * of the register. Since we are bit reflected we have to shift it
  319. * left 32 bits so it occupies the least significant bits in the
  320. * bit reflected domain.
  321. */
  322. v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
  323. (__vector unsigned char)vzero, 4);
  324. v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
  325. (__vector unsigned char)vzero, 4);
  326. v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
  327. (__vector unsigned char)vzero, 4);
  328. v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
  329. (__vector unsigned char)vzero, 4);
  330. v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
  331. (__vector unsigned char)vzero, 4);
  332. v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
  333. (__vector unsigned char)vzero, 4);
  334. v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
  335. (__vector unsigned char)vzero, 4);
  336. v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
  337. (__vector unsigned char)vzero, 4);
  338. /* xor with the last 1024 bits. */
  339. va0 = vec_ld(0, (__vector unsigned long long*) p);
  340. VEC_PERM(va0, va0, va0, vperm_const);
  341. va1 = vec_ld(16, (__vector unsigned long long*) p);
  342. VEC_PERM(va1, va1, va1, vperm_const);
  343. va2 = vec_ld(32, (__vector unsigned long long*) p);
  344. VEC_PERM(va2, va2, va2, vperm_const);
  345. va3 = vec_ld(48, (__vector unsigned long long*) p);
  346. VEC_PERM(va3, va3, va3, vperm_const);
  347. va4 = vec_ld(64, (__vector unsigned long long*) p);
  348. VEC_PERM(va4, va4, va4, vperm_const);
  349. va5 = vec_ld(80, (__vector unsigned long long*) p);
  350. VEC_PERM(va5, va5, va5, vperm_const);
  351. va6 = vec_ld(96, (__vector unsigned long long*) p);
  352. VEC_PERM(va6, va6, va6, vperm_const);
  353. va7 = vec_ld(112, (__vector unsigned long long*) p);
  354. VEC_PERM(va7, va7, va7, vperm_const);
  355. p = (char *)p + 128;
  /* vdata0-vdata7 now hold the running 1024-bit state (folded sums xor last 128 bytes). */
  356. vdata0 = vec_xor(v0, va0);
  357. vdata1 = vec_xor(v1, va1);
  358. vdata2 = vec_xor(v2, va2);
  359. vdata3 = vec_xor(v3, va3);
  360. vdata4 = vec_xor(v4, va4);
  361. vdata5 = vec_xor(v5, va5);
  362. vdata6 = vec_xor(v6, va6);
  363. vdata7 = vec_xor(v7, va7);
  364. /* Check if we have more blocks to process */
  365. next_block = 0;
  366. if (length != 0) {
  367. next_block = 1;
  368. /* zero v0-v7 */
  369. v0 = vec_xor(v0, v0);
  370. v1 = vec_xor(v1, v1);
  371. v2 = vec_xor(v2, v2);
  372. v3 = vec_xor(v3, v3);
  373. v4 = vec_xor(v4, v4);
  374. v5 = vec_xor(v5, v5);
  375. v6 = vec_xor(v6, v6);
  376. v7 = vec_xor(v7, v7);
  377. }
  /*
   * The state carried in vdata0-vdata7 takes the place of the first 128
   * bytes of the next block, so it is counted in that block's length.
   * When no block follows, length is reassigned right after the loop.
   */
  378. length = length + 128;
  379. } while (next_block);
  380. /* Calculate how many bytes we have left. */
  381. length = (len & 127);
  382. /* Calculate where in (short) constant table we need to start. */
  383. offset = 128 - length;
  384. v0 = vec_ld(offset, vcrc_short_const);
  385. v1 = vec_ld(offset + 16, vcrc_short_const);
  386. v2 = vec_ld(offset + 32, vcrc_short_const);
  387. v3 = vec_ld(offset + 48, vcrc_short_const);
  388. v4 = vec_ld(offset + 64, vcrc_short_const);
  389. v5 = vec_ld(offset + 80, vcrc_short_const);
  390. v6 = vec_ld(offset + 96, vcrc_short_const);
  391. v7 = vec_ld(offset + 112, vcrc_short_const);
  392. offset += 128;
  /* Multiply each of the eight state lanes by its short-table constant. */
  393. v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  394. (__vector unsigned int)vdata0, (__vector unsigned int)v0);
  395. v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  396. (__vector unsigned int)vdata1, (__vector unsigned int)v1);
  397. v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  398. (__vector unsigned int)vdata2, (__vector unsigned int)v2);
  399. v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  400. (__vector unsigned int)vdata3, (__vector unsigned int)v3);
  401. v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  402. (__vector unsigned int)vdata4, (__vector unsigned int)v4);
  403. v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  404. (__vector unsigned int)vdata5, (__vector unsigned int)v5);
  405. v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  406. (__vector unsigned int)vdata6, (__vector unsigned int)v6);
  407. v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  408. (__vector unsigned int)vdata7, (__vector unsigned int)v7);
  409. /* Now reduce the tail (0-112 bytes). */
  410. for (i = 0; i < length; i+=16) {
  411. vdata0 = vec_ld(i,(__vector unsigned long long*)p);
  412. VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
  413. va0 = vec_ld(offset + i,vcrc_short_const);
  414. va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
  415. (__vector unsigned int)vdata0, (__vector unsigned int)va0);
  416. v0 = vec_xor(v0, va0);
  417. }
  418. /* xor all parallel chunks together. */
  419. v0 = vec_xor(v0, v1);
  420. v2 = vec_xor(v2, v3);
  421. v4 = vec_xor(v4, v5);
  422. v6 = vec_xor(v6, v7);
  423. v0 = vec_xor(v0, v2);
  424. v4 = vec_xor(v4, v6);
  425. v0 = vec_xor(v0, v4);
  426. }
  /* Both paths arrive here with the folded value in v0. */
  427. /* Barrett Reduction */
  428. vconst1 = vec_ld(0, v_Barrett_const);
  429. vconst2 = vec_ld(16, v_Barrett_const);
  /* Rotate v0 by 8 bytes and xor it with itself, combining its two 64-bit halves. */
  430. v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
  431. (__vector unsigned char)v0, 8);
  432. v0 = vec_xor(v1,v0);
  433. /* shift left one bit */
  434. __vector unsigned char vsht_splat = vec_splat_u8 (1);
  435. v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
  436. v0 = vec_and(v0, vmask_64bit);
  437. /*
  438. * The reflected version of Barrett reduction. Instead of bit
  439. * reflecting our data (which is expensive to do), we bit reflect our
  440. * constants and our algorithm, which means the intermediate data in
  441. * our vector registers goes from 0-63 instead of 63-0. We can reflect
  442. * the algorithm because we don't carry in mod 2 arithmetic.
  443. */
  444. /* bottom 32 bits of a */
  445. v1 = vec_and(v0, vmask_32bit);
  446. /* ma */
  447. v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
  448. (__vector unsigned long long)vconst1);
  449. /* bottom 32bits of ma */
  450. v1 = vec_and(v1, vmask_32bit);
  451. /* qn */
  452. v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
  453. (__vector unsigned long long)vconst2);
  454. /* a - qn, subtraction is xor in GF(2) */
  455. v0 = vec_xor (v0, v1);
  456. /*
  457. * Since we are bit reflected, the result (ie the low 32 bits) is in
  458. * the high 32 bits. We just need to shift it left 4 bytes
  459. * V0 [ 0 1 X 3 ]
  460. * V0 [ 0 X 2 3 ]
  461. */
  462. /* shift result into top 64 bits of */
  463. v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
  464. (__vector unsigned char)vzero, 4);
  /*
   * The element index holding the result depends on byte order. The
   * selected 64-bit element is converted to the unsigned int return type.
   */
  465. #if BYTE_ORDER == BIG_ENDIAN
  466. return v0[0];
  467. #else
  468. return v0[1];
  469. #endif
  470. }