functable.c 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. /* functable.c -- Choose relevant optimized functions at runtime
  2. * Copyright (C) 2017 Hans Kristian Rosbach
  3. * For conditions of distribution and use, see copyright notice in zlib.h
  4. */
  5. #ifndef DISABLE_RUNTIME_CPU_DETECTION
  6. #include "zbuild.h"
  7. #include "functable.h"
  8. #include "cpu_features.h"
  9. #include "arch_functions.h"
  10. #if defined(_MSC_VER)
  11. # include <intrin.h>
  12. #endif
  /*
   * Publication primitives used when filling in the global `functable`.
   *
   * FUNCTABLE_ASSIGN(VAR, FUNC_NAME)
   *   Copies the pointer stored in member FUNC_NAME of the struct expression
   *   VAR into the same-named member of `functable`. VAR is parenthesized in
   *   the expansion, so any expression of struct type (e.g. `*ptr`) is
   *   accepted; FUNC_NAME must be a bare member name.
   *
   * FUNCTABLE_BARRIER()
   *   Barrier meant to be issued once after the last FUNCTABLE_ASSIGN.
   */
  /* Platform has pointer size atomic store */
  #if defined(__GNUC__) || defined(__clang__)
  /* GCC/Clang: generic atomic store builtin plus a thread fence. */
  #  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
      __atomic_store(&(functable.FUNC_NAME), &((VAR).FUNC_NAME), __ATOMIC_SEQ_CST)
  #  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
  #elif defined(_MSC_VER)
  /* MSVC: interlocked pointer exchange; the previous value is discarded. */
  #  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
      _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)((VAR).FUNC_NAME))
  #  if defined(_M_ARM) || defined(_M_ARM64)
  /* ARM targets additionally issue a data memory barrier instruction. */
  #    define FUNCTABLE_BARRIER() do { \
          _ReadWriteBarrier(); \
          __dmb(0xB); /* _ARM_BARRIER_ISH */ \
          _ReadWriteBarrier(); \
      } while (0)
  #  else
  #    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
  #  endif
  #else
  /* Unknown compiler: plain store through a volatile lvalue, no barrier. */
  #  warning Unable to detect atomic intrinsic support.
  #  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
      *((void * volatile *)&(functable.FUNC_NAME)) = (void *)((VAR).FUNC_NAME)
  #  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
  #endif
  /* No-op target for the force_init slot; takes no arguments and does nothing. */
  static void force_init_empty(void) {
      /* intentionally blank */
  }
  39. static void init_functable(void) {
  40. struct functable_s ft;
  41. struct cpu_features cf;
  42. cpu_check_features(&cf);
  43. // Generic code
  44. ft.force_init = &force_init_empty;
  45. ft.adler32 = &adler32_c;
  46. ft.adler32_fold_copy = &adler32_fold_copy_c;
  47. ft.chunkmemset_safe = &chunkmemset_safe_c;
  48. ft.chunksize = &chunksize_c;
  49. ft.crc32 = &PREFIX(crc32_braid);
  50. ft.crc32_fold = &crc32_fold_c;
  51. ft.crc32_fold_copy = &crc32_fold_copy_c;
  52. ft.crc32_fold_final = &crc32_fold_final_c;
  53. ft.crc32_fold_reset = &crc32_fold_reset_c;
  54. ft.inflate_fast = &inflate_fast_c;
  55. ft.slide_hash = &slide_hash_c;
  56. ft.longest_match = &longest_match_generic;
  57. ft.longest_match_slow = &longest_match_slow_generic;
  58. ft.compare256 = &compare256_generic;
  59. // Select arch-optimized functions
  60. // X86 - SSE2
  61. #ifdef X86_SSE2
  62. # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
  63. if (cf.x86.has_sse2)
  64. # endif
  65. {
  66. ft.chunkmemset_safe = &chunkmemset_safe_sse2;
  67. ft.chunksize = &chunksize_sse2;
  68. ft.inflate_fast = &inflate_fast_sse2;
  69. ft.slide_hash = &slide_hash_sse2;
  70. # ifdef HAVE_BUILTIN_CTZ
  71. ft.compare256 = &compare256_sse2;
  72. ft.longest_match = &longest_match_sse2;
  73. ft.longest_match_slow = &longest_match_slow_sse2;
  74. # endif
  75. }
  76. #endif
  77. // X86 - SSSE3
  78. #ifdef X86_SSSE3
  79. if (cf.x86.has_ssse3) {
  80. ft.adler32 = &adler32_ssse3;
  81. ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
  82. ft.inflate_fast = &inflate_fast_ssse3;
  83. }
  84. #endif
  85. // X86 - SSE4.2
  86. #ifdef X86_SSE42
  87. if (cf.x86.has_sse42) {
  88. ft.adler32_fold_copy = &adler32_fold_copy_sse42;
  89. }
  90. #endif
  91. // X86 - PCLMUL
  92. #ifdef X86_PCLMULQDQ_CRC
  93. if (cf.x86.has_pclmulqdq) {
  94. ft.crc32 = &crc32_pclmulqdq;
  95. ft.crc32_fold = &crc32_fold_pclmulqdq;
  96. ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
  97. ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
  98. ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
  99. }
  100. #endif
  101. // X86 - AVX
  102. #ifdef X86_AVX2
  103. if (cf.x86.has_avx2) {
  104. ft.adler32 = &adler32_avx2;
  105. ft.adler32_fold_copy = &adler32_fold_copy_avx2;
  106. ft.chunkmemset_safe = &chunkmemset_safe_avx2;
  107. ft.chunksize = &chunksize_avx2;
  108. ft.inflate_fast = &inflate_fast_avx2;
  109. ft.slide_hash = &slide_hash_avx2;
  110. # ifdef HAVE_BUILTIN_CTZ
  111. ft.compare256 = &compare256_avx2;
  112. ft.longest_match = &longest_match_avx2;
  113. ft.longest_match_slow = &longest_match_slow_avx2;
  114. # endif
  115. }
  116. #endif
  117. // X86 - AVX512 (F,DQ,BW,Vl)
  118. #ifdef X86_AVX512
  119. if (cf.x86.has_avx512_common) {
  120. ft.adler32 = &adler32_avx512;
  121. ft.adler32_fold_copy = &adler32_fold_copy_avx512;
  122. }
  123. #endif
  124. #ifdef X86_AVX512VNNI
  125. if (cf.x86.has_avx512vnni) {
  126. ft.adler32 = &adler32_avx512_vnni;
  127. ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
  128. }
  129. #endif
  130. // X86 - VPCLMULQDQ
  131. #ifdef X86_VPCLMULQDQ_CRC
  132. if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
  133. ft.crc32 = &crc32_vpclmulqdq;
  134. ft.crc32_fold = &crc32_fold_vpclmulqdq;
  135. ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
  136. ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
  137. ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
  138. }
  139. #endif
  140. // ARM - SIMD
  141. #ifdef ARM_SIMD
  142. # ifndef ARM_NOCHECK_SIMD
  143. if (cf.arm.has_simd)
  144. # endif
  145. {
  146. ft.slide_hash = &slide_hash_armv6;
  147. }
  148. #endif
  149. // ARM - NEON
  150. #ifdef ARM_NEON
  151. # ifndef ARM_NOCHECK_NEON
  152. if (cf.arm.has_neon)
  153. # endif
  154. {
  155. ft.adler32 = &adler32_neon;
  156. ft.chunkmemset_safe = &chunkmemset_safe_neon;
  157. ft.chunksize = &chunksize_neon;
  158. ft.inflate_fast = &inflate_fast_neon;
  159. ft.slide_hash = &slide_hash_neon;
  160. # ifdef HAVE_BUILTIN_CTZLL
  161. ft.compare256 = &compare256_neon;
  162. ft.longest_match = &longest_match_neon;
  163. ft.longest_match_slow = &longest_match_slow_neon;
  164. # endif
  165. }
  166. #endif
  167. // ARM - ACLE
  168. #ifdef ARM_ACLE
  169. if (cf.arm.has_crc32) {
  170. ft.crc32 = &crc32_acle;
  171. }
  172. #endif
  173. // Power - VMX
  174. #ifdef PPC_VMX
  175. if (cf.power.has_altivec) {
  176. ft.adler32 = &adler32_vmx;
  177. ft.slide_hash = &slide_hash_vmx;
  178. }
  179. #endif
  180. // Power8 - VSX
  181. #ifdef POWER8_VSX
  182. if (cf.power.has_arch_2_07) {
  183. ft.adler32 = &adler32_power8;
  184. ft.chunkmemset_safe = &chunkmemset_safe_power8;
  185. ft.chunksize = &chunksize_power8;
  186. ft.inflate_fast = &inflate_fast_power8;
  187. ft.slide_hash = &slide_hash_power8;
  188. }
  189. #endif
  190. #ifdef POWER8_VSX_CRC32
  191. if (cf.power.has_arch_2_07)
  192. ft.crc32 = &crc32_power8;
  193. #endif
  194. // Power9
  195. #ifdef POWER9
  196. if (cf.power.has_arch_3_00) {
  197. ft.compare256 = &compare256_power9;
  198. ft.longest_match = &longest_match_power9;
  199. ft.longest_match_slow = &longest_match_slow_power9;
  200. }
  201. #endif
  202. // RISCV - RVV
  203. #ifdef RISCV_RVV
  204. if (cf.riscv.has_rvv) {
  205. ft.adler32 = &adler32_rvv;
  206. ft.adler32_fold_copy = &adler32_fold_copy_rvv;
  207. ft.chunkmemset_safe = &chunkmemset_safe_rvv;
  208. ft.chunksize = &chunksize_rvv;
  209. ft.compare256 = &compare256_rvv;
  210. ft.inflate_fast = &inflate_fast_rvv;
  211. ft.longest_match = &longest_match_rvv;
  212. ft.longest_match_slow = &longest_match_slow_rvv;
  213. ft.slide_hash = &slide_hash_rvv;
  214. }
  215. #endif
  216. // S390
  217. #ifdef S390_CRC32_VX
  218. if (cf.s390.has_vx)
  219. ft.crc32 = crc32_s390_vx;
  220. #endif
  221. // Assign function pointers individually for atomic operation
  222. FUNCTABLE_ASSIGN(ft, force_init);
  223. FUNCTABLE_ASSIGN(ft, adler32);
  224. FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
  225. FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
  226. FUNCTABLE_ASSIGN(ft, chunksize);
  227. FUNCTABLE_ASSIGN(ft, compare256);
  228. FUNCTABLE_ASSIGN(ft, crc32);
  229. FUNCTABLE_ASSIGN(ft, crc32_fold);
  230. FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
  231. FUNCTABLE_ASSIGN(ft, crc32_fold_final);
  232. FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
  233. FUNCTABLE_ASSIGN(ft, inflate_fast);
  234. FUNCTABLE_ASSIGN(ft, longest_match);
  235. FUNCTABLE_ASSIGN(ft, longest_match_slow);
  236. FUNCTABLE_ASSIGN(ft, slide_hash);
  237. // Memory barrier for weak memory order CPUs
  238. FUNCTABLE_BARRIER();
  239. }
  240. /* stub functions */
  /*
   * Initial functable.force_init entry: runs init_functable() and returns.
   * Nothing is forwarded because the replacement target takes no arguments
   * and returns nothing.
   */
  static void force_init_stub(void) {
      init_functable();
  }
  244. static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
  245. init_functable();
  246. return functable.adler32(adler, buf, len);
  247. }
  248. static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
  249. init_functable();
  250. return functable.adler32_fold_copy(adler, dst, src, len);
  251. }
  252. static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
  253. init_functable();
  254. return functable.chunkmemset_safe(out, dist, len, left);
  255. }
  256. static uint32_t chunksize_stub(void) {
  257. init_functable();
  258. return functable.chunksize();
  259. }
  260. static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
  261. init_functable();
  262. return functable.compare256(src0, src1);
  263. }
  264. static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
  265. init_functable();
  266. return functable.crc32(crc, buf, len);
  267. }
  268. static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
  269. init_functable();
  270. functable.crc32_fold(crc, src, len, init_crc);
  271. }
  272. static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
  273. init_functable();
  274. functable.crc32_fold_copy(crc, dst, src, len);
  275. }
  276. static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
  277. init_functable();
  278. return functable.crc32_fold_final(crc);
  279. }
  280. static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
  281. init_functable();
  282. return functable.crc32_fold_reset(crc);
  283. }
  284. static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
  285. init_functable();
  286. functable.inflate_fast(strm, start);
  287. }
  288. static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
  289. init_functable();
  290. return functable.longest_match(s, cur_match);
  291. }
  292. static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
  293. init_functable();
  294. return functable.longest_match_slow(s, cur_match);
  295. }
  296. static void slide_hash_stub(deflate_state* s) {
  297. init_functable();
  298. functable.slide_hash(s);
  299. }
  300. /* functable init */
  301. Z_INTERNAL struct functable_s functable = {
  302. force_init_stub,
  303. adler32_stub,
  304. adler32_fold_copy_stub,
  305. chunkmemset_safe_stub,
  306. chunksize_stub,
  307. compare256_stub,
  308. crc32_stub,
  309. crc32_fold_stub,
  310. crc32_fold_copy_stub,
  311. crc32_fold_final_stub,
  312. crc32_fold_reset_stub,
  313. inflate_fast_stub,
  314. longest_match_stub,
  315. longest_match_slow_stub,
  316. slide_hash_stub,
  317. };
  318. #endif