chunkset_neon.c 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. /* chunkset_neon.c -- NEON inline functions to copy small data chunks.
  2. * For conditions of distribution and use, see copyright notice in zlib.h
  3. */
  4. #ifdef ARM_NEON
  5. #include "neon_intrins.h"
  6. #include "zbuild.h"
  7. #include "arch/generic/chunk_permute_table.h"
  8. typedef uint8x16_t chunk_t;
  9. #define CHUNK_SIZE 16
  10. #define HAVE_CHUNKMEMSET_2
  11. #define HAVE_CHUNKMEMSET_4
  12. #define HAVE_CHUNKMEMSET_8
  13. #define HAVE_CHUNK_MAG
  14. static const lut_rem_pair perm_idx_lut[13] = {
  15. {0, 1}, /* 3 */
  16. {0, 0}, /* don't care */
  17. {1 * 32, 1}, /* 5 */
  18. {2 * 32, 4}, /* 6 */
  19. {3 * 32, 2}, /* 7 */
  20. {0 * 32, 0}, /* don't care */
  21. {4 * 32, 7}, /* 9 */
  22. {5 * 32, 6}, /* 10 */
  23. {6 * 32, 5}, /* 11 */
  24. {7 * 32, 4}, /* 12 */
  25. {8 * 32, 3}, /* 13 */
  26. {9 * 32, 2}, /* 14 */
  27. {10 * 32, 1},/* 15 */
  28. };
  29. static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
  30. uint16_t tmp;
  31. memcpy(&tmp, from, sizeof(tmp));
  32. *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
  33. }
  34. static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
  35. uint32_t tmp;
  36. memcpy(&tmp, from, sizeof(tmp));
  37. *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
  38. }
  39. static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
  40. uint64_t tmp;
  41. memcpy(&tmp, from, sizeof(tmp));
  42. *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
  43. }
  /* NEON-suffixed names for the generic CHUNK* identifiers. Presumably
   * chunkset_tpl.h, included further down, emits its functions under these
   * names -- confirm against the template. */
  #define CHUNKMEMSET_SAFE chunkmemset_safe_neon
  #define CHUNKMEMSET      chunkmemset_neon
  #define CHUNKUNROLL      chunkunroll_neon
  #define CHUNKCOPY        chunkcopy_neon
  #define CHUNKSIZE        chunksize_neon
  49. static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
  50. *chunk = vld1q_u8(s);
  51. }
  52. static inline void storechunk(uint8_t *out, chunk_t *chunk) {
  53. vst1q_u8(out, *chunk);
  54. }
  55. static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
  56. lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
  57. *chunk_rem = lut_rem.remval;
  58. /* See note in chunkset_ssse3.c for why this is ok */
  59. __msan_unpoison(buf + dist, 16 - dist);
  60. /* This version of table is only available on aarch64 */
  61. #if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
  62. uint8x16_t ret_vec = vld1q_u8(buf);
  63. uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
  64. return vqtbl1q_u8(ret_vec, perm_vec);
  65. #else
  66. uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
  67. perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
  68. perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
  69. a = vld1_u8(buf);
  70. b = vld1_u8(buf + 8);
  71. ret0 = vtbl1_u8(a, perm_vec0);
  72. uint8x8x2_t ab = {{a, b}};
  73. ret1 = vtbl2_u8(ab, perm_vec1);
  74. return vcombine_u8(ret0, ret1);
  75. #endif
  76. }
  /* Pull in the shared chunk template now that chunk_t, the CHUNK* names and
   * the helper functions above are defined. Presumably it expands into the
   * *_neon functions -- confirm against chunkset_tpl.h. */
  #include "chunkset_tpl.h"
  /* Name for the inflate-fast template; it must be defined before
   * inffast_tpl.h is included below. */
  #define INFLATE_FAST inflate_fast_neon
  #include "inffast_tpl.h"
  80. #endif