jdsample-neon.c 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570
  1. /*
  2. * Upsampling (Arm Neon)
  3. *
  4. * Copyright (C) 2020, Arm Limited. All Rights Reserved.
  5. * Copyright (C) 2020, 2024, D. R. Commander. All Rights Reserved.
  6. *
  7. * This software is provided 'as-is', without any express or implied
  8. * warranty. In no event will the authors be held liable for any damages
  9. * arising from the use of this software.
  10. *
  11. * Permission is granted to anyone to use this software for any purpose,
  12. * including commercial applications, and to alter it and redistribute it
  13. * freely, subject to the following restrictions:
  14. *
  15. * 1. The origin of this software must not be misrepresented; you must not
  16. * claim that you wrote the original software. If you use this software
  17. * in a product, an acknowledgment in the product documentation would be
  18. * appreciated but is not required.
  19. * 2. Altered source versions must be plainly marked as such, and must not be
  20. * misrepresented as being the original software.
  21. * 3. This notice may not be removed or altered from any source distribution.
  22. */
  23. #define JPEG_INTERNALS
  24. #include "../../src/jinclude.h"
  25. #include "../../src/jpeglib.h"
  26. #include "../../src/jsimd.h"
  27. #include "../../src/jdct.h"
  28. #include "../../src/jsimddct.h"
  29. #include "../jsimd.h"
  30. #include "neon-compat.h"
  31. #include <arm_neon.h>
  32. /* The diagram below shows a row of samples produced by h2v1 downsampling.
  33. *
  34. * s0 s1 s2
  35. * +---------+---------+---------+
  36. * | | | |
  37. * | p0 p1 | p2 p3 | p4 p5 |
  38. * | | | |
  39. * +---------+---------+---------+
  40. *
  41. * Samples s0-s2 were created by averaging the original pixel component values
  42. * centered at positions p0-p5 above. To approximate those original pixel
  43. * component values, we proportionally blend the adjacent samples in each row.
  44. *
  45. * An upsampled pixel component value is computed by blending the sample
  46. * containing the pixel center with the nearest neighboring sample, in the
  47. * ratio 3:1. For example:
  48. * p1(upsampled) = 3/4 * s0 + 1/4 * s1
  49. * p2(upsampled) = 3/4 * s1 + 1/4 * s0
  50. * When computing the first and last pixel component values in the row, there
  51. * is no adjacent sample to blend, so:
  52. * p0(upsampled) = s0
  53. * p5(upsampled) = s2
  54. */
  55. void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
  56. JDIMENSION downsampled_width,
  57. JSAMPARRAY input_data,
  58. JSAMPARRAY *output_data_ptr)
  59. {
  60. JSAMPARRAY output_data = *output_data_ptr;
  61. JSAMPROW inptr, outptr;
  62. int inrow;
  63. unsigned colctr;
  64. /* Set up constants. */
  65. const uint16x8_t one_u16 = vdupq_n_u16(1);
  66. const uint8x8_t three_u8 = vdup_n_u8(3);
  67. for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
  68. inptr = input_data[inrow];
  69. outptr = output_data[inrow];
  70. /* First pixel component value in this row of the original image */
  71. *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
  72. /* 3/4 * containing sample + 1/4 * nearest neighboring sample
  73. * For p1: containing sample = s0, nearest neighboring sample = s1
  74. * For p2: containing sample = s1, nearest neighboring sample = s0
  75. */
  76. uint8x16_t s0 = vld1q_u8(inptr);
  77. uint8x16_t s1 = vld1q_u8(inptr + 1);
  78. /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
  79. * denote low half and high half respectively.
  80. */
  81. uint16x8_t s1_add_3s0_l =
  82. vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
  83. uint16x8_t s1_add_3s0_h =
  84. vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
  85. uint16x8_t s0_add_3s1_l =
  86. vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
  87. uint16x8_t s0_add_3s1_h =
  88. vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
  89. /* Add ordered dithering bias to odd pixel values. */
  90. s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
  91. s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
  92. /* The offset is initially 1, because the first pixel component has already
  93. * been stored. However, in subsequent iterations of the SIMD loop, this
  94. * offset is (2 * colctr - 1) to stay within the bounds of the sample
  95. * buffers without having to resort to a slow scalar tail case for the last
  96. * (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
  97. * in jmemmgr.c for more details.
  98. */
  99. unsigned outptr_offset = 1;
  100. uint8x16x2_t output_pixels;
  101. /* We use software pipelining to maximise performance. The code indented
  102. * an extra two spaces begins the next iteration of the loop.
  103. */
  104. for (colctr = 16; colctr < downsampled_width; colctr += 16) {
  105. s0 = vld1q_u8(inptr + colctr - 1);
  106. s1 = vld1q_u8(inptr + colctr);
  107. /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
  108. output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
  109. vrshrn_n_u16(s1_add_3s0_h, 2));
  110. output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
  111. vshrn_n_u16(s0_add_3s1_h, 2));
  112. /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
  113. * denote low half and high half respectively.
  114. */
  115. s1_add_3s0_l =
  116. vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
  117. s1_add_3s0_h =
  118. vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
  119. s0_add_3s1_l =
  120. vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
  121. s0_add_3s1_h =
  122. vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
  123. /* Add ordered dithering bias to odd pixel values. */
  124. s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
  125. s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
  126. /* Store pixel component values to memory. */
  127. vst2q_u8(outptr + outptr_offset, output_pixels);
  128. outptr_offset = 2 * colctr - 1;
  129. }
  130. /* Complete the last iteration of the loop. */
  131. /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
  132. output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
  133. vrshrn_n_u16(s1_add_3s0_h, 2));
  134. output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
  135. vshrn_n_u16(s0_add_3s1_h, 2));
  136. /* Store pixel component values to memory. */
  137. vst2q_u8(outptr + outptr_offset, output_pixels);
  138. /* Last pixel component value in this row of the original image */
  139. outptr[2 * downsampled_width - 1] =
  140. GETJSAMPLE(inptr[downsampled_width - 1]);
  141. }
  142. }
  143. /* The diagram below shows an array of samples produced by h2v2 downsampling.
  144. *
  145. * s0 s1 s2
  146. * +---------+---------+---------+
  147. * | p0 p1 | p2 p3 | p4 p5 |
  148. * sA | | | |
  149. * | p6 p7 | p8 p9 | p10 p11|
  150. * +---------+---------+---------+
  151. * | p12 p13| p14 p15| p16 p17|
  152. * sB | | | |
  153. * | p18 p19| p20 p21| p22 p23|
  154. * +---------+---------+---------+
  155. * | p24 p25| p26 p27| p28 p29|
  156. * sC | | | |
  157. * | p30 p31| p32 p33| p34 p35|
  158. * +---------+---------+---------+
  159. *
  160. * Samples s0A-s2C were created by averaging the original pixel component
  161. * values centered at positions p0-p35 above. To approximate one of those
  162. * original pixel component values, we proportionally blend the sample
  163. * containing the pixel center with the nearest neighboring samples in each
  164. * row, column, and diagonal.
  165. *
  166. * An upsampled pixel component value is computed by first blending the sample
  167. * containing the pixel center with the nearest neighboring samples in the
  168. * same column, in the ratio 3:1, and then blending each column sum with the
  169. * nearest neighboring column sum, in the ratio 3:1. For example:
  170. * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
  171. * 1/4 * (3/4 * s0B + 1/4 * s0A)
  172. * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
  173. * When computing the first and last pixel component values in the row, there
  174. * is no horizontally adjacent sample to blend, so:
  175. * p12(upsampled) = 3/4 * s0B + 1/4 * s0A
  176. * p23(upsampled) = 3/4 * s2B + 1/4 * s2C
  177. * When computing the first and last pixel component values in the column,
  178. * there is no vertically adjacent sample to blend, so:
  179. * p2(upsampled) = 3/4 * s1A + 1/4 * s0A
  180. * p33(upsampled) = 3/4 * s1C + 1/4 * s2C
  181. * When computing the corner pixel component values, there is no adjacent
  182. * sample to blend, so:
  183. * p0(upsampled) = s0A
  184. * p35(upsampled) = s2C
  185. */
  186. void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
  187. JDIMENSION downsampled_width,
  188. JSAMPARRAY input_data,
  189. JSAMPARRAY *output_data_ptr)
  190. {
  191. JSAMPARRAY output_data = *output_data_ptr;
  192. JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  193. int inrow, outrow;
  194. unsigned colctr;
  195. /* Set up constants. */
  196. const uint16x8_t seven_u16 = vdupq_n_u16(7);
  197. const uint8x8_t three_u8 = vdup_n_u8(3);
  198. const uint16x8_t three_u16 = vdupq_n_u16(3);
  199. inrow = outrow = 0;
  200. while (outrow < max_v_samp_factor) {
  201. inptr0 = input_data[inrow - 1];
  202. inptr1 = input_data[inrow];
  203. inptr2 = input_data[inrow + 1];
  204. /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
  205. * respectively.
  206. */
  207. outptr0 = output_data[outrow++];
  208. outptr1 = output_data[outrow++];
  209. /* First pixel component value in this row of the original image */
  210. int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
  211. *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
  212. int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
  213. *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
  214. /* Step 1: Blend samples vertically in columns s0 and s1.
  215. * Leave the divide by 4 until the end, when it can be done for both
  216. * dimensions at once, right-shifting by 4.
  217. */
  218. /* Load and compute s0colsum0 and s0colsum1. */
  219. uint8x16_t s0A = vld1q_u8(inptr0);
  220. uint8x16_t s0B = vld1q_u8(inptr1);
  221. uint8x16_t s0C = vld1q_u8(inptr2);
  222. /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
  223. * denote low half and high half respectively.
  224. */
  225. uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
  226. vget_low_u8(s0B), three_u8);
  227. uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
  228. vget_high_u8(s0B), three_u8);
  229. uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
  230. vget_low_u8(s0B), three_u8);
  231. uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
  232. vget_high_u8(s0B), three_u8);
  233. /* Load and compute s1colsum0 and s1colsum1. */
  234. uint8x16_t s1A = vld1q_u8(inptr0 + 1);
  235. uint8x16_t s1B = vld1q_u8(inptr1 + 1);
  236. uint8x16_t s1C = vld1q_u8(inptr2 + 1);
  237. uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
  238. vget_low_u8(s1B), three_u8);
  239. uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
  240. vget_high_u8(s1B), three_u8);
  241. uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
  242. vget_low_u8(s1B), three_u8);
  243. uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
  244. vget_high_u8(s1B), three_u8);
  245. /* Step 2: Blend the already-blended columns. */
  246. uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
  247. uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
  248. uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
  249. uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
  250. uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
  251. uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
  252. uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
  253. uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
  254. /* Add ordered dithering bias to odd pixel values. */
  255. output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
  256. output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
  257. output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
  258. output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
  259. /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
  260. uint8x16x2_t output_pixels0 = { {
  261. vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
  262. vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
  263. } };
  264. uint8x16x2_t output_pixels1 = { {
  265. vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
  266. vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
  267. } };
  268. /* Store pixel component values to memory.
  269. * The minimum size of the output buffer for each row is 64 bytes => no
  270. * need to worry about buffer overflow here. See "Creation of 2-D sample
  271. * arrays" in jmemmgr.c for more details.
  272. */
  273. vst2q_u8(outptr0 + 1, output_pixels0);
  274. vst2q_u8(outptr1 + 1, output_pixels1);
  275. /* The first pixel of the image shifted our loads and stores by one byte.
  276. * We have to re-align on a 32-byte boundary at some point before the end
  277. * of the row (we do it now on the 32/33 pixel boundary) to stay within the
  278. * bounds of the sample buffers without having to resort to a slow scalar
  279. * tail case for the last (downsampled_width % 16) samples. See "Creation
  280. * of 2-D sample arrays" in jmemmgr.c for more details.
  281. */
  282. for (colctr = 16; colctr < downsampled_width; colctr += 16) {
  283. /* Step 1: Blend samples vertically in columns s0 and s1. */
  284. /* Load and compute s0colsum0 and s0colsum1. */
  285. s0A = vld1q_u8(inptr0 + colctr - 1);
  286. s0B = vld1q_u8(inptr1 + colctr - 1);
  287. s0C = vld1q_u8(inptr2 + colctr - 1);
  288. s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
  289. three_u8);
  290. s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
  291. three_u8);
  292. s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
  293. three_u8);
  294. s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
  295. three_u8);
  296. /* Load and compute s1colsum0 and s1colsum1. */
  297. s1A = vld1q_u8(inptr0 + colctr);
  298. s1B = vld1q_u8(inptr1 + colctr);
  299. s1C = vld1q_u8(inptr2 + colctr);
  300. s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
  301. three_u8);
  302. s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
  303. three_u8);
  304. s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
  305. three_u8);
  306. s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
  307. three_u8);
  308. /* Step 2: Blend the already-blended columns. */
  309. output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
  310. output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
  311. output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
  312. output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
  313. output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
  314. output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
  315. output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
  316. output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
  317. /* Add ordered dithering bias to odd pixel values. */
  318. output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
  319. output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
  320. output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
  321. output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
  322. /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
  323. output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
  324. vshrn_n_u16(output0_p1_h, 4));
  325. output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
  326. vrshrn_n_u16(output0_p2_h, 4));
  327. output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
  328. vshrn_n_u16(output1_p1_h, 4));
  329. output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
  330. vrshrn_n_u16(output1_p2_h, 4));
  331. /* Store pixel component values to memory. */
  332. vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
  333. vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
  334. }
  335. /* Last pixel component value in this row of the original image */
  336. int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
  337. GETJSAMPLE(inptr0[downsampled_width - 1]);
  338. outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
  339. int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
  340. GETJSAMPLE(inptr2[downsampled_width - 1]);
  341. outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
  342. inrow++;
  343. }
  344. }
  345. /* The diagram below shows a column of samples produced by h1v2 downsampling
  346. * (or by losslessly rotating or transposing an h2v1-downsampled image.)
  347. *
  348. * +---------+
  349. * | p0 |
  350. * sA | |
  351. * | p1 |
  352. * +---------+
  353. * | p2 |
  354. * sB | |
  355. * | p3 |
  356. * +---------+
  357. * | p4 |
  358. * sC | |
  359. * | p5 |
  360. * +---------+
  361. *
  362. * Samples sA-sC were created by averaging the original pixel component values
  363. * centered at positions p0-p5 above. To approximate those original pixel
  364. * component values, we proportionally blend the adjacent samples in each
  365. * column.
  366. *
  367. * An upsampled pixel component value is computed by blending the sample
  368. * containing the pixel center with the nearest neighboring sample, in the
  369. * ratio 3:1. For example:
  370. * p1(upsampled) = 3/4 * sA + 1/4 * sB
  371. * p2(upsampled) = 3/4 * sB + 1/4 * sA
  372. * When computing the first and last pixel component values in the column,
  373. * there is no adjacent sample to blend, so:
  374. * p0(upsampled) = sA
  375. * p5(upsampled) = sC
  376. */
  377. void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
  378. JDIMENSION downsampled_width,
  379. JSAMPARRAY input_data,
  380. JSAMPARRAY *output_data_ptr)
  381. {
  382. JSAMPARRAY output_data = *output_data_ptr;
  383. JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  384. int inrow, outrow;
  385. unsigned colctr;
  386. /* Set up constants. */
  387. const uint16x8_t one_u16 = vdupq_n_u16(1);
  388. const uint8x8_t three_u8 = vdup_n_u8(3);
  389. inrow = outrow = 0;
  390. while (outrow < max_v_samp_factor) {
  391. inptr0 = input_data[inrow - 1];
  392. inptr1 = input_data[inrow];
  393. inptr2 = input_data[inrow + 1];
  394. /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
  395. * respectively.
  396. */
  397. outptr0 = output_data[outrow++];
  398. outptr1 = output_data[outrow++];
  399. inrow++;
  400. /* The size of the input and output buffers is always a multiple of 32
  401. * bytes => no need to worry about buffer overflow when reading/writing
  402. * memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
  403. * details.
  404. */
  405. for (colctr = 0; colctr < downsampled_width; colctr += 16) {
  406. /* Load samples. */
  407. uint8x16_t sA = vld1q_u8(inptr0 + colctr);
  408. uint8x16_t sB = vld1q_u8(inptr1 + colctr);
  409. uint8x16_t sC = vld1q_u8(inptr2 + colctr);
  410. /* Blend samples vertically. */
  411. uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
  412. vget_low_u8(sB), three_u8);
  413. uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
  414. vget_high_u8(sB), three_u8);
  415. uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
  416. vget_low_u8(sB), three_u8);
  417. uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
  418. vget_high_u8(sB), three_u8);
  419. /* Add ordered dithering bias to pixel values in even output rows. */
  420. colsum0_l = vaddq_u16(colsum0_l, one_u16);
  421. colsum0_h = vaddq_u16(colsum0_h, one_u16);
  422. /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
  423. uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
  424. vshrn_n_u16(colsum0_h, 2));
  425. uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
  426. vrshrn_n_u16(colsum1_h, 2));
  427. /* Store pixel component values to memory. */
  428. vst1q_u8(outptr0 + colctr, output_pixels0);
  429. vst1q_u8(outptr1 + colctr, output_pixels1);
  430. }
  431. }
  432. }
  433. /* The diagram below shows a row of samples produced by h2v1 downsampling.
  434. *
  435. * s0 s1
  436. * +---------+---------+
  437. * | | |
  438. * | p0 p1 | p2 p3 |
  439. * | | |
  440. * +---------+---------+
  441. *
  442. * Samples s0 and s1 were created by averaging the original pixel component
  443. * values centered at positions p0-p3 above. To approximate those original
  444. * pixel component values, we duplicate the samples horizontally:
  445. * p0(upsampled) = p1(upsampled) = s0
  446. * p2(upsampled) = p3(upsampled) = s1
  447. */
  448. void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
  449. JSAMPARRAY input_data,
  450. JSAMPARRAY *output_data_ptr)
  451. {
  452. JSAMPARRAY output_data = *output_data_ptr;
  453. JSAMPROW inptr, outptr;
  454. int inrow;
  455. unsigned colctr;
  456. for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
  457. inptr = input_data[inrow];
  458. outptr = output_data[inrow];
  459. for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
  460. uint8x16_t samples = vld1q_u8(inptr + colctr);
  461. /* Duplicate the samples. The store operation below interleaves them so
  462. * that adjacent pixel component values take on the same sample value,
  463. * per above.
  464. */
  465. uint8x16x2_t output_pixels = { { samples, samples } };
  466. /* Store pixel component values to memory.
  467. * Due to the way sample buffers are allocated, we don't need to worry
  468. * about tail cases when output_width is not a multiple of 32. See
  469. * "Creation of 2-D sample arrays" in jmemmgr.c for details.
  470. */
  471. vst2q_u8(outptr + 2 * colctr, output_pixels);
  472. }
  473. }
  474. }
  475. /* The diagram below shows an array of samples produced by h2v2 downsampling.
  476. *
  477. * s0 s1
  478. * +---------+---------+
  479. * | p0 p1 | p2 p3 |
  480. * sA | | |
  481. * | p4 p5 | p6 p7 |
  482. * +---------+---------+
  483. * | p8 p9 | p10 p11|
  484. * sB | | |
  485. * | p12 p13| p14 p15|
  486. * +---------+---------+
  487. *
  488. * Samples s0A-s1B were created by averaging the original pixel component
  489. * values centered at positions p0-p15 above. To approximate those original
  490. * pixel component values, we duplicate the samples both horizontally and
  491. * vertically:
  492. * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
  493. * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
  494. * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
  495. * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
  496. */
  497. void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
  498. JSAMPARRAY input_data,
  499. JSAMPARRAY *output_data_ptr)
  500. {
  501. JSAMPARRAY output_data = *output_data_ptr;
  502. JSAMPROW inptr, outptr0, outptr1;
  503. int inrow, outrow;
  504. unsigned colctr;
  505. for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
  506. inptr = input_data[inrow];
  507. outptr0 = output_data[outrow++];
  508. outptr1 = output_data[outrow++];
  509. for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
  510. uint8x16_t samples = vld1q_u8(inptr + colctr);
  511. /* Duplicate the samples. The store operation below interleaves them so
  512. * that adjacent pixel component values take on the same sample value,
  513. * per above.
  514. */
  515. uint8x16x2_t output_pixels = { { samples, samples } };
  516. /* Store pixel component values for both output rows to memory.
  517. * Due to the way sample buffers are allocated, we don't need to worry
  518. * about tail cases when output_width is not a multiple of 32. See
  519. * "Creation of 2-D sample arrays" in jmemmgr.c for details.
  520. */
  521. vst2q_u8(outptr0 + 2 * colctr, output_pixels);
  522. vst2q_u8(outptr1 + 2 * colctr, output_pixels);
  523. }
  524. }
  525. }