x86_functions.h 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. /* x86_functions.h -- x86 implementations for arch-specific functions.
  2. * Copyright (C) 2013 Intel Corporation Jim Kukunas
  3. * For conditions of distribution and use, see copyright notice in zlib.h
  4. */
  5. #ifndef X86_FUNCTIONS_H_
  6. #define X86_FUNCTIONS_H_
  #ifdef X86_SSE2
  /* Prototypes for the SSE2 variants; visible only when X86_SSE2 is defined. */
  uint32_t chunksize_sse2(void);
  uint8_t *chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
  void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
  /* Kept outside the HAVE_BUILTIN_CTZ guard: the native_slide_hash mapping in
   * the DISABLE_RUNTIME_CPU_DETECTION section of this header selects
   * slide_hash_sse2 without testing HAVE_BUILTIN_CTZ, so the prototype must be
   * visible in that configuration as well. */
  void slide_hash_sse2(deflate_state *s);
  #  ifdef HAVE_BUILTIN_CTZ
  /* These prototypes are only exposed when HAVE_BUILTIN_CTZ is defined. */
  uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
  uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
  uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
  #  endif /* HAVE_BUILTIN_CTZ */
  #endif /* X86_SSE2 */
  #if defined(X86_SSSE3)
  /* Prototypes for the SSSE3 variants; visible only when X86_SSSE3 is defined. */
  uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
  uint8_t *chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
  void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
  #endif /* X86_SSSE3 */
  #if defined(X86_SSE42)
  /* Prototype for the SSE4.2 variant; visible only when X86_SSE42 is defined. */
  uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  #endif /* X86_SSE42 */
  #ifdef X86_AVX2
  /* Prototypes for the AVX2 variants; visible only when X86_AVX2 is defined. */
  uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
  uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  uint32_t chunksize_avx2(void);
  uint8_t *chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
  void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
  /* Kept outside the HAVE_BUILTIN_CTZ guard: the native_slide_hash mapping in
   * the DISABLE_RUNTIME_CPU_DETECTION section of this header selects
   * slide_hash_avx2 without testing HAVE_BUILTIN_CTZ, so the prototype must be
   * visible in that configuration as well. */
  void slide_hash_avx2(deflate_state *s);
  #  ifdef HAVE_BUILTIN_CTZ
  /* These prototypes are only exposed when HAVE_BUILTIN_CTZ is defined. */
  uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
  uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
  uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
  #  endif /* HAVE_BUILTIN_CTZ */
  #endif /* X86_AVX2 */
  #if defined(X86_AVX512)
  /* Prototypes for the AVX512 variants; visible only when X86_AVX512 is defined. */
  uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
  uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  #endif /* X86_AVX512 */
  #if defined(X86_AVX512VNNI)
  /* Prototypes for the AVX512-VNNI variants; visible only when X86_AVX512VNNI is defined. */
  uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
  uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  #endif /* X86_AVX512VNNI */
  #if defined(X86_PCLMULQDQ_CRC)
  /* Prototypes for the PCLMULQDQ CRC32 variants; visible only when
   * X86_PCLMULQDQ_CRC is defined. */
  uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
  uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
  void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
  void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
  uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
  #endif /* X86_PCLMULQDQ_CRC */
  #if defined(X86_VPCLMULQDQ_CRC)
  /* Prototypes for the VPCLMULQDQ CRC32 variants; visible only when
   * X86_VPCLMULQDQ_CRC is defined. */
  uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
  uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
  void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
  void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
  uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
  #endif /* X86_VPCLMULQDQ_CRC */
  61. #ifdef DISABLE_RUNTIME_CPU_DETECTION
  // X86 - SSE2: redirect the native_* names to the SSE2 variants.
  // NOTE(review): the __x86_64__ / _M_X64 / X86_NOCHECK_SSE2 alternatives enable
  // this section even when X86_SSE2 is not defined, while the *_sse2 prototypes
  // in this header are only declared under X86_SSE2 - confirm the build always
  // defines X86_SSE2 in those configurations.
  #  if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
  #    undef native_chunksize
  #    define native_chunksize chunksize_sse2
  #    undef native_chunkmemset_safe
  #    define native_chunkmemset_safe chunkmemset_safe_sse2
  #    undef native_slide_hash
  #    define native_slide_hash slide_hash_sse2
  #    undef native_inflate_fast
  #    define native_inflate_fast inflate_fast_sse2
  #    if defined(HAVE_BUILTIN_CTZ)
  #      undef native_longest_match
  #      define native_longest_match longest_match_sse2
  #      undef native_longest_match_slow
  #      define native_longest_match_slow longest_match_slow_sse2
  #      undef native_compare256
  #      define native_compare256 compare256_sse2
  #    endif /* HAVE_BUILTIN_CTZ */
  #  endif /* SSE2 */
  // X86 - SSSE3: replaces any earlier native_adler32, native_chunkmemset_safe
  // and native_inflate_fast definitions with the SSSE3 variants.
  #  if defined(__SSSE3__) && defined(X86_SSSE3)
  #    undef native_inflate_fast
  #    define native_inflate_fast inflate_fast_ssse3
  #    undef native_chunkmemset_safe
  #    define native_chunkmemset_safe chunkmemset_safe_ssse3
  #    undef native_adler32
  #    define native_adler32 adler32_ssse3
  #  endif /* SSSE3 */
  // X86 - SSE4.2: native_adler32_fold_copy becomes the SSE4.2 variant.
  #  if defined(__SSE4_2__) && defined(X86_SSE42)
  #    undef native_adler32_fold_copy
  #    define native_adler32_fold_copy adler32_fold_copy_sse42
  #  endif /* SSE4.2 */
  // X86 - PCLMUL: the native_crc32* names are redirected to the PCLMULQDQ variants.
  #  if defined(__PCLMUL__) && defined(X86_PCLMULQDQ_CRC)
  #    undef native_crc32
  #    define native_crc32 crc32_pclmulqdq
  #    undef native_crc32_fold_reset
  #    define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
  #    undef native_crc32_fold
  #    define native_crc32_fold crc32_fold_pclmulqdq
  #    undef native_crc32_fold_copy
  #    define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
  #    undef native_crc32_fold_final
  #    define native_crc32_fold_final crc32_fold_pclmulqdq_final
  #  endif /* PCLMUL */
  // X86 - AVX2: replaces any earlier native_* definitions listed here with the
  // AVX2 variants.
  #  if defined(__AVX2__) && defined(X86_AVX2)
  #    undef native_adler32_fold_copy
  #    define native_adler32_fold_copy adler32_fold_copy_avx2
  #    undef native_adler32
  #    define native_adler32 adler32_avx2
  #    undef native_chunksize
  #    define native_chunksize chunksize_avx2
  #    undef native_chunkmemset_safe
  #    define native_chunkmemset_safe chunkmemset_safe_avx2
  #    undef native_slide_hash
  #    define native_slide_hash slide_hash_avx2
  #    undef native_inflate_fast
  #    define native_inflate_fast inflate_fast_avx2
  #    if defined(HAVE_BUILTIN_CTZ)
  #      undef native_longest_match
  #      define native_longest_match longest_match_avx2
  #      undef native_longest_match_slow
  #      define native_longest_match_slow longest_match_slow_avx2
  #      undef native_compare256
  #      define native_compare256 compare256_avx2
  #    endif /* HAVE_BUILTIN_CTZ */
  #  endif /* AVX2 */
  // X86 - AVX512 (F, DQ, BW, VL): replaces any earlier native_adler32 and
  // native_adler32_fold_copy definitions with the AVX512 variants.
  #  if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
  #    undef native_adler32
  #    define native_adler32 adler32_avx512
  #    undef native_adler32_fold_copy
  #    define native_adler32_fold_copy adler32_fold_copy_avx512
  // X86 - AVX512 (VNNI): nested, so it is only evaluated when the AVX512 test
  // above succeeded; it then replaces the plain AVX512 adler32 mappings.
  #    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
  #      undef native_adler32
  #      define native_adler32 adler32_avx512_vnni
  #      undef native_adler32_fold_copy
  #      define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
  #    endif /* AVX512 VNNI */
  // X86 - VPCLMULQDQ: also nested inside the AVX512 test. X86_VPCLMULQDQ_CRC is
  // required here because the crc32_*vpclmulqdq prototypes in this header are
  // only declared under that macro; without the check the native_crc32* names
  // could be pointed at undeclared functions.
  #    if defined(X86_VPCLMULQDQ_CRC) && defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
  #      undef native_crc32
  #      define native_crc32 crc32_vpclmulqdq
  #      undef native_crc32_fold
  #      define native_crc32_fold crc32_fold_vpclmulqdq
  #      undef native_crc32_fold_copy
  #      define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
  #      undef native_crc32_fold_final
  #      define native_crc32_fold_final crc32_fold_vpclmulqdq_final
  #      undef native_crc32_fold_reset
  #      define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
  #    endif /* VPCLMULQDQ */
  #  endif /* AVX512 */
  158. #endif
  159. #endif /* X86_FUNCTIONS_H_ */