jcphuff-neon.c 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623
  1. /*
  2. * Prepare data for progressive Huffman encoding (Arm Neon)
  3. *
  4. * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
  5. * Copyright (C) 2022, Matthieu Darbois. All Rights Reserved.
  6. * Copyright (C) 2022, 2024-2025, D. R. Commander. All Rights Reserved.
  7. *
  8. * This software is provided 'as-is', without any express or implied
  9. * warranty. In no event will the authors be held liable for any damages
  10. * arising from the use of this software.
  11. *
  12. * Permission is granted to anyone to use this software for any purpose,
  13. * including commercial applications, and to alter it and redistribute it
  14. * freely, subject to the following restrictions:
  15. *
  16. * 1. The origin of this software must not be misrepresented; you must not
  17. * claim that you wrote the original software. If you use this software
  18. * in a product, an acknowledgment in the product documentation would be
  19. * appreciated but is not required.
  20. * 2. Altered source versions must be plainly marked as such, and must not be
  21. * misrepresented as being the original software.
  22. * 3. This notice may not be removed or altered from any source distribution.
  23. */
  24. #define JPEG_INTERNALS
  25. #include "../../src/jinclude.h"
  26. #include "../../src/jpeglib.h"
  27. #include "../../src/jsimd.h"
  28. #include "../../src/jdct.h"
  29. #include "../../src/jsimddct.h"
  30. #include "../jsimd.h"
  31. #include "neon-compat.h"
  32. #include <arm_neon.h>
  33. /* Data preparation for encode_mcu_AC_first().
  34. *
  35. * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
  36. * found in jcphuff.c.
  37. */
  38. void jsimd_encode_mcu_AC_first_prepare_neon
  39. (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
  40. UJCOEF *values, size_t *zerobits)
  41. {
  42. UJCOEF *values_ptr = values;
  43. UJCOEF *diff_values_ptr = values + DCTSIZE2;
  44. /* Rows of coefficients to zero (since they haven't been processed) */
  45. int i, rows_to_zero = 8;
  46. for (i = 0; i < Sl / 16; i++) {
  47. int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  48. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
  49. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
  50. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
  51. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
  52. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
  53. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
  54. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
  55. int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
  56. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
  57. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
  58. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
  59. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
  60. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
  61. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
  62. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
  63. /* Isolate sign of coefficients. */
  64. uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
  65. uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
  66. /* Compute absolute value of coefficients and apply point transform Al. */
  67. uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
  68. uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
  69. abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
  70. abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
  71. /* Compute diff values. */
  72. uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
  73. uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
  74. /* Store transformed coefficients and diff values. */
  75. vst1q_u16(values_ptr, abs_coefs1);
  76. vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
  77. vst1q_u16(diff_values_ptr, diff1);
  78. vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
  79. values_ptr += 16;
  80. diff_values_ptr += 16;
  81. jpeg_natural_order_start += 16;
  82. rows_to_zero -= 2;
  83. }
  84. /* Same operation but for remaining partial vector */
  85. int remaining_coefs = Sl % 16;
  86. if (remaining_coefs > 8) {
  87. int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  88. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
  89. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
  90. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
  91. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
  92. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
  93. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
  94. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
  95. int16x8_t coefs2 = vdupq_n_s16(0);
  96. switch (remaining_coefs) {
  97. case 15:
  98. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
  99. FALLTHROUGH /*FALLTHROUGH*/
  100. case 14:
  101. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
  102. FALLTHROUGH /*FALLTHROUGH*/
  103. case 13:
  104. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
  105. FALLTHROUGH /*FALLTHROUGH*/
  106. case 12:
  107. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
  108. FALLTHROUGH /*FALLTHROUGH*/
  109. case 11:
  110. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
  111. FALLTHROUGH /*FALLTHROUGH*/
  112. case 10:
  113. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
  114. FALLTHROUGH /*FALLTHROUGH*/
  115. case 9:
  116. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
  117. FALLTHROUGH /*FALLTHROUGH*/
  118. default:
  119. break;
  120. }
  121. /* Isolate sign of coefficients. */
  122. uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
  123. uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
  124. /* Compute absolute value of coefficients and apply point transform Al. */
  125. uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
  126. uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
  127. abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
  128. abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
  129. /* Compute diff values. */
  130. uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
  131. uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
  132. /* Store transformed coefficients and diff values. */
  133. vst1q_u16(values_ptr, abs_coefs1);
  134. vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
  135. vst1q_u16(diff_values_ptr, diff1);
  136. vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
  137. values_ptr += 16;
  138. diff_values_ptr += 16;
  139. rows_to_zero -= 2;
  140. } else if (remaining_coefs > 0) {
  141. int16x8_t coefs = vdupq_n_s16(0);
  142. switch (remaining_coefs) {
  143. case 8:
  144. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
  145. FALLTHROUGH /*FALLTHROUGH*/
  146. case 7:
  147. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
  148. FALLTHROUGH /*FALLTHROUGH*/
  149. case 6:
  150. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
  151. FALLTHROUGH /*FALLTHROUGH*/
  152. case 5:
  153. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
  154. FALLTHROUGH /*FALLTHROUGH*/
  155. case 4:
  156. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
  157. FALLTHROUGH /*FALLTHROUGH*/
  158. case 3:
  159. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
  160. FALLTHROUGH /*FALLTHROUGH*/
  161. case 2:
  162. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
  163. FALLTHROUGH /*FALLTHROUGH*/
  164. case 1:
  165. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
  166. FALLTHROUGH /*FALLTHROUGH*/
  167. default:
  168. break;
  169. }
  170. /* Isolate sign of coefficients. */
  171. uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
  172. /* Compute absolute value of coefficients and apply point transform Al. */
  173. uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
  174. abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
  175. /* Compute diff values. */
  176. uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
  177. /* Store transformed coefficients and diff values. */
  178. vst1q_u16(values_ptr, abs_coefs);
  179. vst1q_u16(diff_values_ptr, diff);
  180. values_ptr += 8;
  181. diff_values_ptr += 8;
  182. rows_to_zero--;
  183. }
  184. /* Zero remaining memory in the values and diff_values blocks. */
  185. for (i = 0; i < rows_to_zero; i++) {
  186. vst1q_u16(values_ptr, vdupq_n_u16(0));
  187. vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
  188. values_ptr += 8;
  189. diff_values_ptr += 8;
  190. }
  191. /* Construct zerobits bitmap. A set bit means that the corresponding
  192. * coefficient != 0.
  193. */
  194. uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
  195. uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
  196. uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
  197. uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
  198. uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
  199. uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
  200. uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
  201. uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
  202. uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
  203. uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
  204. uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
  205. uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
  206. uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
  207. uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
  208. uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
  209. uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
  210. /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  211. const uint8x8_t bitmap_mask =
  212. vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
  213. row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  214. row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  215. row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  216. row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  217. row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  218. row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  219. row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  220. row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
  221. uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  222. uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  223. uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  224. uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  225. uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  226. uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  227. uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
  228. #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  229. /* Move bitmap to a 64-bit scalar register. */
  230. uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  231. /* Store zerobits bitmap. */
  232. *zerobits = ~bitmap;
  233. #else
  234. /* Move bitmap to two 32-bit scalar registers. */
  235. uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  236. uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  237. /* Store zerobits bitmap. */
  238. zerobits[0] = ~bitmap0;
  239. zerobits[1] = ~bitmap1;
  240. #endif
  241. }
  242. /* Data preparation for encode_mcu_AC_refine().
  243. *
  244. * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
  245. * found in jcphuff.c.
  246. */
  247. int jsimd_encode_mcu_AC_refine_prepare_neon
  248. (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
  249. UJCOEF *absvalues, size_t *bits)
  250. {
  251. /* Temporary storage buffers for data used to compute the signbits bitmap and
  252. * the end-of-block (EOB) position
  253. */
  254. uint8_t coef_sign_bits[64];
  255. uint8_t coef_eq1_bits[64];
  256. UJCOEF *absvalues_ptr = absvalues;
  257. uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  258. uint8_t *eq1_bits_ptr = coef_eq1_bits;
  259. /* Rows of coefficients to zero (since they haven't been processed) */
  260. int i, rows_to_zero = 8;
  261. for (i = 0; i < Sl / 16; i++) {
  262. int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  263. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
  264. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
  265. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
  266. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
  267. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
  268. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
  269. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
  270. int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
  271. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
  272. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
  273. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
  274. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
  275. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
  276. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
  277. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
  278. /* Compute and store data for signbits bitmap. */
  279. uint8x8_t sign_coefs1 =
  280. vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
  281. uint8x8_t sign_coefs2 =
  282. vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
  283. vst1_u8(coef_sign_bits_ptr, sign_coefs1);
  284. vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
  285. /* Compute absolute value of coefficients and apply point transform Al. */
  286. uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
  287. uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
  288. abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
  289. abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
  290. vst1q_u16(absvalues_ptr, abs_coefs1);
  291. vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
  292. /* Test whether transformed coefficient values == 1 (used to find EOB
  293. * position.)
  294. */
  295. uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
  296. uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
  297. vst1_u8(eq1_bits_ptr, coefs_eq11);
  298. vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
  299. absvalues_ptr += 16;
  300. coef_sign_bits_ptr += 16;
  301. eq1_bits_ptr += 16;
  302. jpeg_natural_order_start += 16;
  303. rows_to_zero -= 2;
  304. }
  305. /* Same operation but for remaining partial vector */
  306. int remaining_coefs = Sl % 16;
  307. if (remaining_coefs > 8) {
  308. int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  309. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
  310. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
  311. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
  312. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
  313. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
  314. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
  315. coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
  316. int16x8_t coefs2 = vdupq_n_s16(0);
  317. switch (remaining_coefs) {
  318. case 15:
  319. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
  320. FALLTHROUGH /*FALLTHROUGH*/
  321. case 14:
  322. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
  323. FALLTHROUGH /*FALLTHROUGH*/
  324. case 13:
  325. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
  326. FALLTHROUGH /*FALLTHROUGH*/
  327. case 12:
  328. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
  329. FALLTHROUGH /*FALLTHROUGH*/
  330. case 11:
  331. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
  332. FALLTHROUGH /*FALLTHROUGH*/
  333. case 10:
  334. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
  335. FALLTHROUGH /*FALLTHROUGH*/
  336. case 9:
  337. coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
  338. FALLTHROUGH /*FALLTHROUGH*/
  339. default:
  340. break;
  341. }
  342. /* Compute and store data for signbits bitmap. */
  343. uint8x8_t sign_coefs1 =
  344. vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
  345. uint8x8_t sign_coefs2 =
  346. vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
  347. vst1_u8(coef_sign_bits_ptr, sign_coefs1);
  348. vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
  349. /* Compute absolute value of coefficients and apply point transform Al. */
  350. uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
  351. uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
  352. abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
  353. abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
  354. vst1q_u16(absvalues_ptr, abs_coefs1);
  355. vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
  356. /* Test whether transformed coefficient values == 1 (used to find EOB
  357. * position.)
  358. */
  359. uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
  360. uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
  361. vst1_u8(eq1_bits_ptr, coefs_eq11);
  362. vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
  363. absvalues_ptr += 16;
  364. coef_sign_bits_ptr += 16;
  365. eq1_bits_ptr += 16;
  366. jpeg_natural_order_start += 16;
  367. rows_to_zero -= 2;
  368. } else if (remaining_coefs > 0) {
  369. int16x8_t coefs = vdupq_n_s16(0);
  370. switch (remaining_coefs) {
  371. case 8:
  372. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
  373. FALLTHROUGH /*FALLTHROUGH*/
  374. case 7:
  375. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
  376. FALLTHROUGH /*FALLTHROUGH*/
  377. case 6:
  378. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
  379. FALLTHROUGH /*FALLTHROUGH*/
  380. case 5:
  381. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
  382. FALLTHROUGH /*FALLTHROUGH*/
  383. case 4:
  384. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
  385. FALLTHROUGH /*FALLTHROUGH*/
  386. case 3:
  387. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
  388. FALLTHROUGH /*FALLTHROUGH*/
  389. case 2:
  390. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
  391. FALLTHROUGH /*FALLTHROUGH*/
  392. case 1:
  393. coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
  394. FALLTHROUGH /*FALLTHROUGH*/
  395. default:
  396. break;
  397. }
  398. /* Compute and store data for signbits bitmap. */
  399. uint8x8_t sign_coefs =
  400. vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
  401. vst1_u8(coef_sign_bits_ptr, sign_coefs);
  402. /* Compute absolute value of coefficients and apply point transform Al. */
  403. uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
  404. abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
  405. vst1q_u16(absvalues_ptr, abs_coefs);
  406. /* Test whether transformed coefficient values == 1 (used to find EOB
  407. * position.)
  408. */
  409. uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
  410. vst1_u8(eq1_bits_ptr, coefs_eq1);
  411. absvalues_ptr += 8;
  412. coef_sign_bits_ptr += 8;
  413. eq1_bits_ptr += 8;
  414. rows_to_zero--;
  415. }
  416. /* Zero remaining memory in blocks. */
  417. for (i = 0; i < rows_to_zero; i++) {
  418. vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
  419. vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
  420. vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
  421. absvalues_ptr += 8;
  422. coef_sign_bits_ptr += 8;
  423. eq1_bits_ptr += 8;
  424. }
  425. /* Construct zerobits bitmap. */
  426. uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
  427. uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
  428. uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
  429. uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
  430. uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
  431. uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
  432. uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
  433. uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
  434. uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
  435. uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
  436. uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
  437. uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
  438. uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
  439. uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
  440. uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
  441. uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
  442. /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  443. const uint8x8_t bitmap_mask =
  444. vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
  445. abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  446. abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  447. abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  448. abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  449. abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  450. abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  451. abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  452. abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
  453. uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  454. uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  455. uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  456. uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  457. uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  458. uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  459. uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
  460. #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  461. /* Move bitmap to a 64-bit scalar register. */
  462. uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  463. /* Store zerobits bitmap. */
  464. bits[0] = ~bitmap;
  465. #else
  466. /* Move bitmap to two 32-bit scalar registers. */
  467. uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  468. uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  469. /* Store zerobits bitmap. */
  470. bits[0] = ~bitmap0;
  471. bits[1] = ~bitmap1;
  472. #endif
  473. /* Construct signbits bitmap. */
  474. uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  475. uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  476. uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  477. uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  478. uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  479. uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  480. uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  481. uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
  482. signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  483. signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  484. signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  485. signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  486. signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  487. signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  488. signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  489. signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
  490. bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  491. bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  492. bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  493. bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  494. bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  495. bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  496. bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
  497. #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  498. /* Move bitmap to a 64-bit scalar register. */
  499. bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  500. /* Store signbits bitmap. */
  501. bits[1] = ~bitmap;
  502. #else
  503. /* Move bitmap to two 32-bit scalar registers. */
  504. bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  505. bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  506. /* Store signbits bitmap. */
  507. bits[2] = ~bitmap0;
  508. bits[3] = ~bitmap1;
  509. #endif
  510. /* Construct bitmap to find EOB position (the index of the last coefficient
  511. * equal to 1.)
  512. */
  513. uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  514. uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  515. uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  516. uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  517. uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  518. uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  519. uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  520. uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
  521. row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  522. row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  523. row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  524. row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  525. row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  526. row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  527. row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  528. row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
  529. bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  530. bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  531. bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  532. bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  533. bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  534. bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  535. bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
  536. #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  537. /* Move bitmap to a 64-bit scalar register. */
  538. bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  539. /* Return EOB position. */
  540. if (bitmap == 0) {
  541. /* EOB position is defined to be 0 if all coefficients != 1. */
  542. return 0;
  543. } else {
  544. return 63 - BUILTIN_CLZLL(bitmap);
  545. }
  546. #else
  547. /* Move bitmap to two 32-bit scalar registers. */
  548. bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  549. bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  550. /* Return EOB position. */
  551. if (bitmap0 == 0 && bitmap1 == 0) {
  552. return 0;
  553. } else if (bitmap1 != 0) {
  554. return 63 - BUILTIN_CLZ(bitmap1);
  555. } else {
  556. return 31 - BUILTIN_CLZ(bitmap0);
  557. }
  558. #endif
  559. }