adler32_power8.c 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. /* Adler32 for POWER8 using VSX instructions.
  2. * Copyright (C) 2020 IBM Corporation
  3. * Author: Rogerio Alves <rcardoso@linux.ibm.com>
  4. * For conditions of distribution and use, see copyright notice in zlib.h
  5. *
  6. * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
  7. * instructions.
  8. *
 * If adler32 processes 1 byte at a time, then on the first iteration s1 is
 * s1_0 (_n means iteration n), the initial value of adler - s1_0 is 1 unless
 * the initial adler value is different from 1. So s1_1 = s1_0 + c[1] after
 * the first calculation, s1_2 = s1_1 + c[2] for the next iteration, and so
 * on. Hence, s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
  15. *
 * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_0 + N*c[1] +
 * (N-1)*c[2] + ... + c[N]
  18. *
  19. * In a more general way:
  20. *
  21. * s1_N = s1_0 + sum(i=1 to N)c[i]
 * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
  23. *
 * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
 * can process N bytes at a time we can do this at once.
 *
 * Since VSX instructions operate on 16-byte vectors, we can process
 * 16 bytes at a time; using N = 16 we have:
  29. *
  30. * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
 * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
  32. *
  33. * After the first iteration we calculate the adler32 checksum for 16 bytes.
  34. *
  35. * For more background about adler32 please check the RFC:
  36. * https://www.ietf.org/rfc/rfc1950.txt
  37. */
  38. #ifdef POWER8_VSX
  39. #include <altivec.h>
  40. #include "zbuild.h"
  41. #include "adler32_p.h"
  42. /* Vector across sum unsigned int (saturate). */
  43. static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
  44. __b = vec_sld(__a, __a, 8);
  45. __b = vec_add(__b, __a);
  46. __a = vec_sld(__b, __b, 4);
  47. __a = vec_add(__a, __b);
  48. return __a;
  49. }
  50. Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
  51. uint32_t s1 = adler & 0xffff;
  52. uint32_t s2 = (adler >> 16) & 0xffff;
  53. /* in case user likes doing a byte at a time, keep it fast */
  54. if (UNLIKELY(len == 1))
  55. return adler32_len_1(s1, buf, s2);
  56. /* If buffer is empty or len=0 we need to return adler initial value. */
  57. if (UNLIKELY(buf == NULL))
  58. return 1;
  59. /* This is faster than VSX code for len < 64. */
  60. if (len < 64)
  61. return adler32_len_64(s1, buf, len, s2);
  62. /* Use POWER VSX instructions for len >= 64. */
  63. const vector unsigned int v_zeros = { 0 };
  64. const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
  65. 6, 5, 4, 3, 2, 1};
  66. const vector unsigned char vsh = vec_splat_u8(4);
  67. const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
  68. vector unsigned int vs1 = { 0 };
  69. vector unsigned int vs2 = { 0 };
  70. vector unsigned int vs1_save = { 0 };
  71. vector unsigned int vsum1, vsum2;
  72. vector unsigned char vbuf;
  73. int n;
  74. vs1[0] = s1;
  75. vs2[0] = s2;
  76. /* Do length bigger than NMAX in blocks of NMAX size. */
  77. while (len >= NMAX) {
  78. len -= NMAX;
  79. n = NMAX / 16;
  80. do {
  81. vbuf = vec_xl(0, (unsigned char *) buf);
  82. vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
  83. /* sum(i=1 to 16) buf[i]*(16-i+1). */
  84. vsum2 = vec_msum(vbuf, v_mul, v_zeros);
  85. /* Save vs1. */
  86. vs1_save = vec_add(vs1_save, vs1);
  87. /* Accumulate the sums. */
  88. vs1 = vec_add(vsum1, vs1);
  89. vs2 = vec_add(vsum2, vs2);
  90. buf += 16;
  91. } while (--n);
  92. /* Once each block of NMAX size. */
  93. vs1 = vec_sumsu(vs1, vsum1);
  94. vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
  95. vs2 = vec_add(vs1_save, vs2);
  96. vs2 = vec_sumsu(vs2, vsum2);
  97. /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
  98. vs1[0] = vs1[0] % BASE;
  99. /* vs2[0] = s2_i + 16*s1_save +
  100. sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
  101. vs2[0] = vs2[0] % BASE;
  102. vs1 = vec_and(vs1, vmask);
  103. vs2 = vec_and(vs2, vmask);
  104. vs1_save = v_zeros;
  105. }
  106. /* len is less than NMAX one modulo is needed. */
  107. if (len >= 16) {
  108. while (len >= 16) {
  109. len -= 16;
  110. vbuf = vec_xl(0, (unsigned char *) buf);
  111. vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
  112. /* sum(i=1 to 16) buf[i]*(16-i+1). */
  113. vsum2 = vec_msum(vbuf, v_mul, v_zeros);
  114. /* Save vs1. */
  115. vs1_save = vec_add(vs1_save, vs1);
  116. /* Accumulate the sums. */
  117. vs1 = vec_add(vsum1, vs1);
  118. vs2 = vec_add(vsum2, vs2);
  119. buf += 16;
  120. }
  121. /* Since the size will be always less than NMAX we do this once. */
  122. vs1 = vec_sumsu(vs1, vsum1);
  123. vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
  124. vs2 = vec_add(vs1_save, vs2);
  125. vs2 = vec_sumsu(vs2, vsum2);
  126. }
  127. /* Copy result back to s1, s2 (mod 65521). */
  128. s1 = vs1[0] % BASE;
  129. s2 = vs2[0] % BASE;
  130. /* Process tail (len < 16). */
  131. return adler32_len_16(s1, buf, len, s2);
  132. }
  133. #endif /* POWER8_VSX */