adler32_vmx.c 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. /* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
  2. * Copyright (C) 1995-2011 Mark Adler
  3. * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
  4. * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
  5. * For conditions of distribution and use, see copyright notice in zlib.h
  6. */
  7. #ifdef PPC_VMX
  8. #include <altivec.h>
  9. #include "zbuild.h"
  10. #include "zendian.h"
  11. #include "adler32_p.h"
  12. #define vmx_zero() (vec_splat_u32(0))
  13. static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
  14. unsigned int i;
  15. for (i = 0; i < len; ++i) {
  16. pair[0] += buf[i];
  17. pair[1] += pair[0];
  18. }
  19. }
  20. static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
  21. /* Different taps for the separable components of sums */
  22. const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
  23. const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
  24. const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
  25. const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
  26. /* As silly and inefficient as it seems, creating 1 permutation vector to permute
  27. * a 2 element vector from a single load + a subsequent shift is just barely faster
  28. * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
  29. const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
  30. const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
  31. vector unsigned int adacc, s2acc;
  32. vector unsigned int pair_vec = vec_ld(0, s);
  33. adacc = vec_perm(pair_vec, pair_vec, s0_perm);
  34. #if BYTE_ORDER == LITTLE_ENDIAN
  35. s2acc = vec_sro(pair_vec, shift_vec);
  36. #else
  37. s2acc = vec_slo(pair_vec, shift_vec);
  38. #endif
  39. vector unsigned int zero = vmx_zero();
  40. vector unsigned int s3acc = zero;
  41. vector unsigned int s3acc_0 = zero;
  42. vector unsigned int adacc_prev = adacc;
  43. vector unsigned int adacc_prev_0 = zero;
  44. vector unsigned int s2acc_0 = zero;
  45. vector unsigned int s2acc_1 = zero;
  46. vector unsigned int s2acc_2 = zero;
  47. /* Maintain a running sum of a second half, this might help use break yet another
  48. * data dependency bubble in the sum */
  49. vector unsigned int adacc_0 = zero;
  50. int num_iter = len / 4;
  51. int rem = len & 3;
  52. for (int i = 0; i < num_iter; ++i) {
  53. vector unsigned char d0 = vec_ld(0, buf);
  54. vector unsigned char d1 = vec_ld(16, buf);
  55. vector unsigned char d2 = vec_ld(32, buf);
  56. vector unsigned char d3 = vec_ld(48, buf);
  57. /* The core operation of the loop, basically
  58. * what is being unrolled below */
  59. adacc = vec_sum4s(d0, adacc);
  60. s3acc = vec_add(s3acc, adacc_prev);
  61. s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
  62. s2acc = vec_msum(t0, d0, s2acc);
  63. /* interleave dependent sums in here */
  64. adacc_0 = vec_sum4s(d1, adacc_0);
  65. s2acc_0 = vec_msum(t1, d1, s2acc_0);
  66. adacc = vec_sum4s(d2, adacc);
  67. s2acc_1 = vec_msum(t2, d2, s2acc_1);
  68. s2acc_2 = vec_msum(t3, d3, s2acc_2);
  69. adacc_0 = vec_sum4s(d3, adacc_0);
  70. adacc_prev = adacc;
  71. adacc_prev_0 = adacc_0;
  72. buf += 64;
  73. }
  74. adacc = vec_add(adacc, adacc_0);
  75. s3acc = vec_add(s3acc, s3acc_0);
  76. s3acc = vec_sl(s3acc, vec_splat_u32(6));
  77. if (rem) {
  78. adacc_prev = vec_add(adacc_prev_0, adacc_prev);
  79. adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
  80. while (rem--) {
  81. vector unsigned char d0 = vec_ld(0, buf);
  82. adacc = vec_sum4s(d0, adacc);
  83. s3acc = vec_add(s3acc, adacc_prev);
  84. s2acc = vec_msum(t3, d0, s2acc);
  85. adacc_prev = vec_sl(adacc, vec_splat_u32(4));
  86. buf += 16;
  87. }
  88. }
  89. /* Sum up independent second sums */
  90. s2acc = vec_add(s2acc, s2acc_0);
  91. s2acc_2 = vec_add(s2acc_1, s2acc_2);
  92. s2acc = vec_add(s2acc, s2acc_2);
  93. s2acc = vec_add(s2acc, s3acc);
  94. adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
  95. s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
  96. adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
  97. s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
  98. vec_ste(adacc, 0, s);
  99. vec_ste(s2acc, 0, s+1);
  100. }
  101. Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
  102. uint32_t sum2;
  103. uint32_t pair[16] ALIGNED_(16);
  104. memset(&pair[2], 0, 14);
  105. int n = NMAX;
  106. unsigned int done = 0, i;
  107. /* Split Adler-32 into component sums, it can be supplied by
  108. * the caller sites (e.g. in a PNG file).
  109. */
  110. sum2 = (adler >> 16) & 0xffff;
  111. adler &= 0xffff;
  112. pair[0] = adler;
  113. pair[1] = sum2;
  114. /* in case user likes doing a byte at a time, keep it fast */
  115. if (UNLIKELY(len == 1))
  116. return adler32_len_1(adler, buf, sum2);
  117. /* initial Adler-32 value (deferred check for len == 1 speed) */
  118. if (UNLIKELY(buf == NULL))
  119. return 1L;
  120. /* in case short lengths are provided, keep it somewhat fast */
  121. if (UNLIKELY(len < 16))
  122. return adler32_len_16(adler, buf, len, sum2);
  123. // Align buffer
  124. unsigned int al = 0;
  125. if ((uintptr_t)buf & 0xf) {
  126. al = 16-((uintptr_t)buf & 0xf);
  127. if (al > len) {
  128. al=len;
  129. }
  130. vmx_handle_head_or_tail(pair, buf, al);
  131. done += al;
  132. /* Rather than rebasing, we can reduce the max sums for the
  133. * first round only */
  134. n -= al;
  135. }
  136. for (i = al; i < len; i += n) {
  137. int remaining = (int)(len-i);
  138. n = MIN(remaining, (i == al) ? n : NMAX);
  139. if (n < 16)
  140. break;
  141. vmx_accum32(pair, buf + i, n / 16);
  142. pair[0] %= BASE;
  143. pair[1] %= BASE;
  144. done += (n / 16) * 16;
  145. }
  146. /* Handle the tail elements. */
  147. if (done < len) {
  148. vmx_handle_head_or_tail(pair, (buf + done), len - done);
  149. pair[0] %= BASE;
  150. pair[1] %= BASE;
  151. }
  152. /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
  153. return (pair[1] << 16) | pair[0];
  154. }
  155. #endif