crc32_braid_c.c 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. /* crc32_braid.c -- compute the CRC-32 of a data stream
  2. * Copyright (C) 1995-2022 Mark Adler
  3. * For conditions of distribution and use, see copyright notice in zlib.h
  4. *
  5. * This interleaved implementation of a CRC makes use of pipelined multiple
  6. * arithmetic-logic units, commonly found in modern CPU cores. It is due to
  7. * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
  8. */
  9. #include "zbuild.h"
  10. #include "crc32_braid_p.h"
  11. #include "crc32_braid_tbl.h"
  12. /*
  13. A CRC of a message is computed on N braids of words in the message, where
  14. each word consists of W bytes (4 or 8). If N is 3, for example, then three
  15. running sparse CRCs are calculated respectively on each braid, at these
  16. indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
  17. This is done starting at a word boundary, and continues until as many blocks
  18. of N * W bytes as are available have been processed. The results are combined
  19. into a single CRC at the end. For this code, N must be in the range 1..6 and
  20. W must be 4 or 8. The upper limit on N can be increased if desired by adding
  21. more #if blocks, extending the patterns apparent in the code. In addition,
  22. crc32 tables would need to be regenerated, if the maximum N value is increased.
  23. N and W are chosen empirically by benchmarking the execution time on a given
  24. processor. The choices for N and W below were based on testing on Intel Kaby
  25. Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
  26. Octeon II processors. The Intel, AMD, and ARM processors were all fastest
  27. with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
  28. They were all tested with either gcc or clang, all using the -O3 optimization
  29. level. Your mileage may vary.
  30. */
  31. /* ========================================================================= */
  32. #ifdef W
  33. /*
  34. Return the CRC of the W bytes in the word_t data, taking the
  35. least-significant byte of the word as the first byte of data, without any pre
  36. or post conditioning. This is used to combine the CRCs of each braid.
  37. */
  38. #if BYTE_ORDER == LITTLE_ENDIAN
  39. static uint32_t crc_word(z_word_t data) {
  40. int k;
  41. for (k = 0; k < W; k++)
  42. data = (data >> 8) ^ crc_table[data & 0xff];
  43. return (uint32_t)data;
  44. }
  45. #elif BYTE_ORDER == BIG_ENDIAN
  46. static z_word_t crc_word(z_word_t data) {
  47. int k;
  48. for (k = 0; k < W; k++)
  49. data = (data << 8) ^
  50. crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
  51. return data;
  52. }
  53. #endif /* BYTE_ORDER */
  54. #endif /* W */
  55. /* ========================================================================= */
  56. Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
  57. uint32_t c;
  58. /* Pre-condition the CRC */
  59. c = (~crc) & 0xffffffff;
  60. #ifdef W
  61. /* If provided enough bytes, do a braided CRC calculation. */
  62. if (len >= N * W + W - 1) {
  63. size_t blks;
  64. z_word_t const *words;
  65. int k;
  66. /* Compute the CRC up to a z_word_t boundary. */
  67. while (len && ((uintptr_t)buf & (W - 1)) != 0) {
  68. len--;
  69. DO1;
  70. }
  71. /* Compute the CRC on as many N z_word_t blocks as are available. */
  72. blks = len / (N * W);
  73. len -= blks * N * W;
  74. words = (z_word_t const *)buf;
  75. z_word_t crc0, word0, comb;
  76. #if N > 1
  77. z_word_t crc1, word1;
  78. #if N > 2
  79. z_word_t crc2, word2;
  80. #if N > 3
  81. z_word_t crc3, word3;
  82. #if N > 4
  83. z_word_t crc4, word4;
  84. #if N > 5
  85. z_word_t crc5, word5;
  86. #endif
  87. #endif
  88. #endif
  89. #endif
  90. #endif
  91. /* Initialize the CRC for each braid. */
  92. crc0 = ZSWAPWORD(c);
  93. #if N > 1
  94. crc1 = 0;
  95. #if N > 2
  96. crc2 = 0;
  97. #if N > 3
  98. crc3 = 0;
  99. #if N > 4
  100. crc4 = 0;
  101. #if N > 5
  102. crc5 = 0;
  103. #endif
  104. #endif
  105. #endif
  106. #endif
  107. #endif
  108. /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
  109. while (--blks) {
  110. /* Load the word for each braid into registers. */
  111. word0 = crc0 ^ words[0];
  112. #if N > 1
  113. word1 = crc1 ^ words[1];
  114. #if N > 2
  115. word2 = crc2 ^ words[2];
  116. #if N > 3
  117. word3 = crc3 ^ words[3];
  118. #if N > 4
  119. word4 = crc4 ^ words[4];
  120. #if N > 5
  121. word5 = crc5 ^ words[5];
  122. #endif
  123. #endif
  124. #endif
  125. #endif
  126. #endif
  127. words += N;
  128. /* Compute and update the CRC for each word. The loop should get unrolled. */
  129. crc0 = BRAID_TABLE[0][word0 & 0xff];
  130. #if N > 1
  131. crc1 = BRAID_TABLE[0][word1 & 0xff];
  132. #if N > 2
  133. crc2 = BRAID_TABLE[0][word2 & 0xff];
  134. #if N > 3
  135. crc3 = BRAID_TABLE[0][word3 & 0xff];
  136. #if N > 4
  137. crc4 = BRAID_TABLE[0][word4 & 0xff];
  138. #if N > 5
  139. crc5 = BRAID_TABLE[0][word5 & 0xff];
  140. #endif
  141. #endif
  142. #endif
  143. #endif
  144. #endif
  145. for (k = 1; k < W; k++) {
  146. crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
  147. #if N > 1
  148. crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
  149. #if N > 2
  150. crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
  151. #if N > 3
  152. crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
  153. #if N > 4
  154. crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
  155. #if N > 5
  156. crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
  157. #endif
  158. #endif
  159. #endif
  160. #endif
  161. #endif
  162. }
  163. }
  164. /* Process the last block, combining the CRCs of the N braids at the same time. */
  165. comb = crc_word(crc0 ^ words[0]);
  166. #if N > 1
  167. comb = crc_word(crc1 ^ words[1] ^ comb);
  168. #if N > 2
  169. comb = crc_word(crc2 ^ words[2] ^ comb);
  170. #if N > 3
  171. comb = crc_word(crc3 ^ words[3] ^ comb);
  172. #if N > 4
  173. comb = crc_word(crc4 ^ words[4] ^ comb);
  174. #if N > 5
  175. comb = crc_word(crc5 ^ words[5] ^ comb);
  176. #endif
  177. #endif
  178. #endif
  179. #endif
  180. #endif
  181. words += N;
  182. c = ZSWAPWORD(comb);
  183. /* Update the pointer to the remaining bytes to process. */
  184. buf = (const unsigned char *)words;
  185. }
  186. #endif /* W */
  187. /* Complete the computation of the CRC on any remaining bytes. */
  188. while (len >= 8) {
  189. len -= 8;
  190. DO8;
  191. }
  192. while (len) {
  193. len--;
  194. DO1;
  195. }
  196. /* Return the CRC, post-conditioned. */
  197. return c ^ 0xffffffff;
  198. }