compare256_avx2.c 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. /* compare256_avx2.c -- AVX2 version of compare256
  2. * Copyright Mika T. Lindqvist <postmaster@raasu.org>
  3. * For conditions of distribution and use, see copyright notice in zlib.h
  4. */
  5. #include "zbuild.h"
  6. #include "zutil_p.h"
  7. #include "deflate.h"
  8. #include "fallback_builtins.h"
  9. #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
  10. #include <immintrin.h>
  11. #ifdef _MSC_VER
  12. # include <nmmintrin.h>
  13. #endif
  /* Count how many leading bytes of src0 and src1 are equal, looking at no
   * more than 256 bytes.
   *
   * The buffers are examined 32 bytes at a time using unaligned 256-bit loads.
   * Up to 256 bytes may be read from each pointer (fewer if a difference is
   * found earlier).
   *
   * Returns the index of the first differing byte (0..255), or 256 when all
   * 256 bytes are identical.
   */
  static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
      uint32_t offset;

      for (offset = 0; offset < 256; offset += 32) {
          const __m256i lhs = _mm256_loadu_si256((const __m256i *)(src0 + offset));
          const __m256i rhs = _mm256_loadu_si256((const __m256i *)(src1 + offset));

          /* Per-byte equality: equal lanes become 0xFF, unequal lanes 0x00. */
          const __m256i eq_bytes = _mm256_cmpeq_epi8(lhs, rhs);

          /* One bit per byte lane; a set bit means that byte matched. */
          const unsigned eq_bits = (unsigned)_mm256_movemask_epi8(eq_bytes);

          if (eq_bits == 0xFFFFFFFF)
              continue; /* whole chunk identical, move on to the next one */

          /* Flip the bits so mismatches are 1; the lowest set bit is the
           * first differing byte inside this chunk. The value passed to
           * ctz is non-zero because at least one lane differed. */
          return offset + (uint32_t)__builtin_ctz(~eq_bits);
      }

      return 256;
  }
  39. Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
  40. return compare256_avx2_static(src0, src1);
  41. }
  /* First inclusion of match_tpl.h, configured with
   *   LONGEST_MATCH = longest_match_avx2
   *   COMPARE256    = compare256_avx2_static
   * Presumably the template emits a function named by LONGEST_MATCH that calls
   * COMPARE256 -- match_tpl.h is not visible here, confirm against it. */
  42. #define LONGEST_MATCH longest_match_avx2
  43. #define COMPARE256 compare256_avx2_static
  44. #include "match_tpl.h"
  /* Second inclusion, this time with LONGEST_MATCH_SLOW defined and
   * LONGEST_MATCH = longest_match_slow_avx2; COMPARE256 is given the same
   * expansion as before.
   * NOTE(review): LONGEST_MATCH is defined again with a different expansion
   * and no #undef in this file; this looks like it relies on match_tpl.h
   * undefining it at the end of the previous inclusion -- confirm. */
  45. #define LONGEST_MATCH_SLOW
  46. #define LONGEST_MATCH longest_match_slow_avx2
  47. #define COMPARE256 compare256_avx2_static
  48. #include "match_tpl.h"
  49. #endif