jcsample-neon.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. /*
  2. * Downsampling (Arm Neon)
  3. *
  4. * Copyright (C) 2020, Arm Limited. All Rights Reserved.
  5. * Copyright (C) 2024-2025, D. R. Commander. All Rights Reserved.
  6. *
  7. * This software is provided 'as-is', without any express or implied
  8. * warranty. In no event will the authors be held liable for any damages
  9. * arising from the use of this software.
  10. *
  11. * Permission is granted to anyone to use this software for any purpose,
  12. * including commercial applications, and to alter it and redistribute it
  13. * freely, subject to the following restrictions:
  14. *
  15. * 1. The origin of this software must not be misrepresented; you must not
  16. * claim that you wrote the original software. If you use this software
  17. * in a product, an acknowledgment in the product documentation would be
  18. * appreciated but is not required.
  19. * 2. Altered source versions must be plainly marked as such, and must not be
  20. * misrepresented as being the original software.
  21. * 3. This notice may not be removed or altered from any source distribution.
  22. */
  23. #define JPEG_INTERNALS
  24. #include "../../src/jinclude.h"
  25. #include "../../src/jpeglib.h"
  26. #include "../../src/jsimd.h"
  27. #include "../../src/jdct.h"
  28. #include "../../src/jsimddct.h"
  29. #include "../jsimd.h"
  30. #include "align.h"
  31. #include "neon-compat.h"
  32. #include <arm_neon.h>
  33. ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
  34. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
  35. 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  36. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
  37. 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
  38. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
  39. 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
  40. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
  41. 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
  42. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
  43. 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
  44. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
  45. 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
  46. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
  47. 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
  48. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
  49. 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
  50. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
  51. 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
  52. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
  53. 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
  54. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
  55. 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
  56. 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
  57. 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
  58. 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
  59. 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
  60. 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
  61. 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
  62. 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
  63. 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
  64. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
  65. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  66. };
  67. /* Downsample pixel values of a single component.
  68. * This version handles the common case of 2:1 horizontal and 1:1 vertical,
  69. * without smoothing.
  70. */
  71. void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
  72. JDIMENSION v_samp_factor,
  73. JDIMENSION width_in_blocks,
  74. JSAMPARRAY input_data, JSAMPARRAY output_data)
  75. {
  76. JSAMPROW inptr, outptr;
  77. /* Load expansion mask to pad remaining elements of last DCT block. */
  78. const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  79. const uint8x16_t expand_mask =
  80. vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  81. /* Load bias pattern (alternating every pixel.) */
  82. /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
  83. const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
  84. unsigned i, outrow;
  85. for (outrow = 0; outrow < v_samp_factor; outrow++) {
  86. outptr = output_data[outrow];
  87. inptr = input_data[outrow];
  88. /* Downsample all but the last DCT block of pixels. */
  89. for (i = 0; i < width_in_blocks - 1; i++) {
  90. uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
  91. /* Add adjacent pixel values, widen to 16-bit, and add bias. */
  92. uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
  93. /* Divide total by 2 and narrow to 8-bit. */
  94. uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
  95. /* Store samples to memory. */
  96. vst1_u8(outptr + i * DCTSIZE, samples_u8);
  97. }
  98. /* Load pixels in last DCT block into a table. */
  99. uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
  100. #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  101. /* Pad the empty elements with the value of the last pixel. */
  102. pixels = vqtbl1q_u8(pixels, expand_mask);
  103. #else
  104. uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
  105. pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
  106. vtbl2_u8(table, vget_high_u8(expand_mask)));
  107. #endif
  108. /* Add adjacent pixel values, widen to 16-bit, and add bias. */
  109. uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
  110. /* Divide total by 2, narrow to 8-bit, and store. */
  111. uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
  112. vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
  113. }
  114. }
  115. /* Downsample pixel values of a single component.
  116. * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
  117. * without smoothing.
  118. */
  119. void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
  120. JDIMENSION v_samp_factor,
  121. JDIMENSION width_in_blocks,
  122. JSAMPARRAY input_data, JSAMPARRAY output_data)
  123. {
  124. JSAMPROW inptr0, inptr1, outptr;
  125. /* Load expansion mask to pad remaining elements of last DCT block. */
  126. const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  127. const uint8x16_t expand_mask =
  128. vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  129. /* Load bias pattern (alternating every pixel.) */
  130. /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
  131. const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
  132. unsigned i, outrow;
  133. for (outrow = 0; outrow < v_samp_factor; outrow++) {
  134. outptr = output_data[outrow];
  135. inptr0 = input_data[outrow];
  136. inptr1 = input_data[outrow + 1];
  137. /* Downsample all but the last DCT block of pixels. */
  138. for (i = 0; i < width_in_blocks - 1; i++) {
  139. uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
  140. uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
  141. /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
  142. uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
  143. /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
  144. */
  145. samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
  146. /* Divide total by 4 and narrow to 8-bit. */
  147. uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
  148. /* Store samples to memory and increment pointers. */
  149. vst1_u8(outptr + i * DCTSIZE, samples_u8);
  150. }
  151. /* Load pixels in last DCT block into a table. */
  152. uint8x16_t pixels_r0 =
  153. vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
  154. uint8x16_t pixels_r1 =
  155. vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
  156. #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  157. /* Pad the empty elements with the value of the last pixel. */
  158. pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
  159. pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
  160. #else
  161. uint8x8x2_t table_r0 =
  162. { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
  163. uint8x8x2_t table_r1 =
  164. { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
  165. pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
  166. vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
  167. pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
  168. vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
  169. #endif
  170. /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
  171. uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
  172. /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
  173. samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
  174. /* Divide total by 4, narrow to 8-bit, and store. */
  175. uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
  176. vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
  177. }
  178. }