chunkset_rvv.c 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. /* chunkset_rvv.c - RVV version of chunkset
  2. * Copyright (C) 2023 SiFive, Inc. All rights reserved.
  3. * Contributed by Alex Chiang <alex.chiang@sifive.com>
  4. * For conditions of distribution and use, see copyright notice in zlib.h
  5. */
  6. #include <riscv_vector.h>
  7. #include "zbuild.h"
  /*
   * glibc on RISC-V can switch to an RVV-optimized memcpy at run time through
   * IFUNC, so a large chunk is preferred in order to move as much memory as
   * possible per copy.
   */
  #define CHUNK_SIZE 32

  /*
   * Announce that this file supplies its own chunkmemset_2/4/8 helpers.
   * NOTE(review): presumably tested by chunkset_tpl.h - confirm there.
   */
  #define HAVE_CHUNKMEMSET_2
  #define HAVE_CHUNKMEMSET_4
  #define HAVE_CHUNKMEMSET_8
  /*
   * Fill one chunk with a repeated elen-bit pattern using RVV stores.
   *
   * elen: element width in bits (16, 32 or 64); it is pasted into the
   *       uintN_t type name and into the RVV intrinsic names.
   *
   * The macro is not hygienic: the expanding scope must provide
   *   from  - pointer to at least elen/8 readable bytes holding the pattern
   *   chunk - pointer to the CHUNK_SIZE-byte destination chunk
   *
   * The pattern is fetched with memcpy rather than by dereferencing a casted
   * pointer, because `from` is a byte pointer with no alignment guarantee for
   * wider types; a direct uintN_t load through it would be undefined
   * behaviour.
   *
   * The inner loop asks vsetvl how many elements it may handle in this pass
   * (vl), broadcasts the pattern into a vector, stores vl elements, and
   * repeats until CHUNK_SIZE / sizeof(uintN_t) elements have been written.
   */
  #define CHUNK_MEMSET_RVV_IMPL(elen)                                        \
  do {                                                                       \
      size_t vl, len = CHUNK_SIZE / sizeof(uint##elen##_t);                  \
      uint##elen##_t val;                                                    \
      uint##elen##_t *chunk_p = (uint##elen##_t *)chunk;                     \
      memcpy(&val, from, sizeof(val));                                       \
      do {                                                                   \
          vl = __riscv_vsetvl_e##elen##m4(len);                              \
          vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl);    \
          __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl);             \
          len -= vl; chunk_p += vl;                                          \
      } while (len > 0);                                                     \
  } while (0)
  /*
   * The RISC-V arch offers no 32-byte datatype, so a chunk is represented as
   * four 64-bit words.
   */
  struct chunk_s {
      uint64_t data[4];
  };
  typedef struct chunk_s chunk_t;
  32. static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
  33. CHUNK_MEMSET_RVV_IMPL(16);
  34. }
  35. static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
  36. CHUNK_MEMSET_RVV_IMPL(32);
  37. }
  38. static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
  39. CHUNK_MEMSET_RVV_IMPL(64);
  40. }
  41. static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
  42. memcpy(chunk->data, (uint8_t *)s, CHUNK_SIZE);
  43. }
  44. static inline void storechunk(uint8_t *out, chunk_t *chunk) {
  45. memcpy(out, chunk->data, CHUNK_SIZE);
  46. }
  /*
   * Map the generic chunk-helper names onto their RVV-suffixed symbols.
   * NOTE(review): presumably these are the names chunkset_tpl.h expands -
   * confirm against that header.
   */
  #define CHUNKSIZE        chunksize_rvv
  #define CHUNKCOPY        chunkcopy_rvv
  #define CHUNKUNROLL      chunkunroll_rvv
  #define CHUNKMEMSET      chunkmemset_rvv
  #define CHUNKMEMSET_SAFE chunkmemset_safe_rvv

  /* This file provides its own CHUNKCOPY implementation (defined below). */
  #define HAVE_CHUNKCOPY
  53. /*
  54. * Assuming that the length is non-zero, and that `from` lags `out` by at least
  55. * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h.
  56. *
  57. * We load/store a single chunk once in the `CHUNKCOPY`.
  58. * However, RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC,
  59. * such that, we prefer copy large memory size once to make good use of the the RVV advance.
  60. *
  61. * To be aligned to the other platforms, we didn't modify `CHUNKCOPY` method a lot,
  62. * but we still copy as much memory as possible for some conditions.
  63. *
  64. * case 1: out - from >= len (no overlap)
  65. * We can use memcpy to copy `len` size once
  66. * because the memory layout would be the same.
  67. *
  68. * case 2: overlap
  69. * We copy N chunks using memcpy at once, aiming to achieve our goal:
  70. * to copy as much memory as possible.
  71. *
  72. * After using a single memcpy to copy N chunks, we have to use series of
  73. * loadchunk and storechunk to ensure the result is correct.
  74. */
  75. static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
  76. Assert(len > 0, "chunkcopy should never have a length 0");
  77. int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
  78. memcpy(out, from, sizeof(chunk_t));
  79. out += align;
  80. from += align;
  81. len -= align;
  82. ptrdiff_t dist = out - from;
  83. if (dist >= len) {
  84. memcpy(out, from, len);
  85. out += len;
  86. from += len;
  87. return out;
  88. }
  89. if (dist >= sizeof(chunk_t)) {
  90. dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
  91. memcpy(out, from, dist);
  92. out += dist;
  93. from += dist;
  94. len -= dist;
  95. }
  96. while (len > 0) {
  97. memcpy(out, from, sizeof(chunk_t));
  98. out += sizeof(chunk_t);
  99. from += sizeof(chunk_t);
  100. len -= sizeof(chunk_t);
  101. }
  102. return out;
  103. }
  104. #include "chunkset_tpl.h"
  105. #define INFLATE_FAST inflate_fast_rvv
  106. #include "inffast_tpl.h"