jfdctfst-neon.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. /*
  2. * Fast integer FDCT (Arm Neon)
  3. *
  4. * Copyright (C) 2020, Arm Limited. All Rights Reserved.
  5. * Copyright (C) 2024, D. R. Commander. All Rights Reserved.
  6. *
  7. * This software is provided 'as-is', without any express or implied
  8. * warranty. In no event will the authors be held liable for any damages
  9. * arising from the use of this software.
  10. *
  11. * Permission is granted to anyone to use this software for any purpose,
  12. * including commercial applications, and to alter it and redistribute it
  13. * freely, subject to the following restrictions:
  14. *
  15. * 1. The origin of this software must not be misrepresented; you must not
  16. * claim that you wrote the original software. If you use this software
  17. * in a product, an acknowledgment in the product documentation would be
  18. * appreciated but is not required.
  19. * 2. Altered source versions must be plainly marked as such, and must not be
  20. * misrepresented as being the original software.
  21. * 3. This notice may not be removed or altered from any source distribution.
  22. */
  23. #define JPEG_INTERNALS
  24. #include "../../src/jinclude.h"
  25. #include "../../src/jpeglib.h"
  26. #include "../../src/jsimd.h"
  27. #include "../../src/jdct.h"
  28. #include "../../src/jsimddct.h"
  29. #include "../jsimd.h"
  30. #include "align.h"
  31. #include "neon-compat.h"
  32. #include <arm_neon.h>
  33. /* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
  34. * (Discrete Cosine Transform) on one block of samples. It uses the same
  35. * calculations and produces exactly the same output as IJG's original
  36. * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
  37. *
  38. * Scaled integer constants are used to avoid floating-point arithmetic:
  39. * 0.382683433 = 12544 * 2^-15
  40. * 0.541196100 = 17792 * 2^-15
  41. * 0.707106781 = 23168 * 2^-15
  42. * 0.306562965 = 9984 * 2^-15
  43. *
  44. * See jfdctfst.c for further details of the DCT algorithm. Where possible,
  45. * the variable names and comments here in jsimd_fdct_ifast_neon() match up
  46. * with those in jpeg_fdct_ifast().
  47. */
  48. #define F_0_382 12544
  49. #define F_0_541 17792
  50. #define F_0_707 23168
  51. #define F_0_306 9984
  52. ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
  53. F_0_382, F_0_541, F_0_707, F_0_306
  54. };
  /* Fast integer forward DCT of one block of samples, computed in place.
   *
   * data: block of DCTELEM samples; it is read in full at the start and
   *       overwritten with the eight result vectors at the end.
   *
   * The transform runs as two identical 1-D passes over eight 8-lane vectors,
   * with a transpose in between.  All arithmetic is 16-bit; products with the
   * fractional constants use vqdmulhq_lane_s16() (saturating doubling multiply,
   * high half), which matches the "N * 2^-15" scaling of the constant table.
   */
  void jsimd_fdct_ifast_neon(DCTELEM *data)
  {
    /* Lane positions of the multipliers within jsimd_fdct_ifast_neon_consts. */
    enum { LANE_0_382 = 0, LANE_0_541 = 1, LANE_0_707 = 2, LANE_0_306 = 3 };

    /* DCT conversion constants, shared by both passes. */
    const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);

    /* Fetch the block with two 4-way de-interleaving loads, then un-zip the
     * matching lanes of both halves.  The net effect is a transpose: each
     * resulting vector holds one column of samples, so that all rows can be
     * processed at once.
     */
    int16x8x4_t upper = vld4q_s16(data);
    int16x8x4_t lower = vld4q_s16(data + 4 * DCTSIZE);

    int16x8x2_t pair_04 = vuzpq_s16(upper.val[0], lower.val[0]);
    int16x8x2_t pair_15 = vuzpq_s16(upper.val[1], lower.val[1]);
    int16x8x2_t pair_26 = vuzpq_s16(upper.val[2], lower.val[2]);
    int16x8x2_t pair_37 = vuzpq_s16(upper.val[3], lower.val[3]);

    int16x8_t col0 = pair_04.val[0], col4 = pair_04.val[1];
    int16x8_t col1 = pair_15.val[0], col5 = pair_15.val[1];
    int16x8_t col2 = pair_26.val[0], col6 = pair_26.val[1];
    int16x8_t col3 = pair_37.val[0], col7 = pair_37.val[1];

    /* Pass 1: process rows. */

    /* Sums feed the even part, differences feed the odd part. */
    int16x8_t tmp0 = vaddq_s16(col0, col7);
    int16x8_t tmp1 = vaddq_s16(col1, col6);
    int16x8_t tmp2 = vaddq_s16(col2, col5);
    int16x8_t tmp3 = vaddq_s16(col3, col4);
    int16x8_t tmp4 = vsubq_s16(col3, col4);
    int16x8_t tmp5 = vsubq_s16(col2, col5);
    int16x8_t tmp6 = vsubq_s16(col1, col6);
    int16x8_t tmp7 = vsubq_s16(col0, col7);

    /* Even part */
    int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
    int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
    int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
    int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);

    col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
    col4 = vsubq_s16(tmp10, tmp11);

    int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts,
                                     LANE_0_707);
    col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
    col6 = vsubq_s16(tmp13, z1);

    /* Odd part */
    tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
    tmp11 = vaddq_s16(tmp5, tmp6);
    tmp12 = vaddq_s16(tmp6, tmp7);

    int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts,
                                     LANE_0_382);
    int16x8_t z2 = vaddq_s16(vqdmulhq_lane_s16(tmp10, consts, LANE_0_541), z5);
    /* tmp12 is added once unscaled on top of its 0.306 product. */
    int16x8_t z4 = vaddq_s16(vqdmulhq_lane_s16(tmp12, consts, LANE_0_306),
                             vaddq_s16(tmp12, z5));
    int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, LANE_0_707);

    int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
    int16x8_t z13 = vsubq_s16(tmp7, z3);

    col5 = vaddq_s16(z13, z2);                  /* phase 6 */
    col3 = vsubq_s16(z13, z2);
    col1 = vaddq_s16(z11, z4);
    col7 = vsubq_s16(z11, z4);

    /* Transpose to work on columns in pass 2: 16-bit transposes of adjacent
     * vectors, then 32-bit transposes, then 32-bit zips.
     */
    int16x8x2_t trn_01 = vtrnq_s16(col0, col1);
    int16x8x2_t trn_23 = vtrnq_s16(col2, col3);
    int16x8x2_t trn_45 = vtrnq_s16(col4, col5);
    int16x8x2_t trn_67 = vtrnq_s16(col6, col7);

    int32x4x2_t trn_0145_l = vtrnq_s32(vreinterpretq_s32_s16(trn_01.val[0]),
                                       vreinterpretq_s32_s16(trn_45.val[0]));
    int32x4x2_t trn_0145_h = vtrnq_s32(vreinterpretq_s32_s16(trn_01.val[1]),
                                       vreinterpretq_s32_s16(trn_45.val[1]));
    int32x4x2_t trn_2367_l = vtrnq_s32(vreinterpretq_s32_s16(trn_23.val[0]),
                                       vreinterpretq_s32_s16(trn_67.val[0]));
    int32x4x2_t trn_2367_h = vtrnq_s32(vreinterpretq_s32_s16(trn_23.val[1]),
                                       vreinterpretq_s32_s16(trn_67.val[1]));

    int32x4x2_t zip_04 = vzipq_s32(trn_0145_l.val[0], trn_2367_l.val[0]);
    int32x4x2_t zip_15 = vzipq_s32(trn_0145_h.val[0], trn_2367_h.val[0]);
    int32x4x2_t zip_26 = vzipq_s32(trn_0145_l.val[1], trn_2367_l.val[1]);
    int32x4x2_t zip_37 = vzipq_s32(trn_0145_h.val[1], trn_2367_h.val[1]);

    int16x8_t row0 = vreinterpretq_s16_s32(zip_04.val[0]);
    int16x8_t row4 = vreinterpretq_s16_s32(zip_04.val[1]);
    int16x8_t row1 = vreinterpretq_s16_s32(zip_15.val[0]);
    int16x8_t row5 = vreinterpretq_s16_s32(zip_15.val[1]);
    int16x8_t row2 = vreinterpretq_s16_s32(zip_26.val[0]);
    int16x8_t row6 = vreinterpretq_s16_s32(zip_26.val[1]);
    int16x8_t row3 = vreinterpretq_s16_s32(zip_37.val[0]);
    int16x8_t row7 = vreinterpretq_s16_s32(zip_37.val[1]);

    /* Pass 2: process columns.  Same arithmetic as pass 1, applied to the
     * transposed vectors.
     */
    tmp0 = vaddq_s16(row0, row7);
    tmp1 = vaddq_s16(row1, row6);
    tmp2 = vaddq_s16(row2, row5);
    tmp3 = vaddq_s16(row3, row4);
    tmp4 = vsubq_s16(row3, row4);
    tmp5 = vsubq_s16(row2, row5);
    tmp6 = vsubq_s16(row1, row6);
    tmp7 = vsubq_s16(row0, row7);

    /* Even part */
    tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
    tmp11 = vaddq_s16(tmp1, tmp2);
    tmp12 = vsubq_s16(tmp1, tmp2);
    tmp13 = vsubq_s16(tmp0, tmp3);

    row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
    row4 = vsubq_s16(tmp10, tmp11);

    z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, LANE_0_707);
    row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
    row6 = vsubq_s16(tmp13, z1);

    /* Odd part */
    tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
    tmp11 = vaddq_s16(tmp5, tmp6);
    tmp12 = vaddq_s16(tmp6, tmp7);

    z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, LANE_0_382);
    z2 = vaddq_s16(vqdmulhq_lane_s16(tmp10, consts, LANE_0_541), z5);
    z4 = vaddq_s16(vqdmulhq_lane_s16(tmp12, consts, LANE_0_306),
                   vaddq_s16(tmp12, z5));
    z3 = vqdmulhq_lane_s16(tmp11, consts, LANE_0_707);

    z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
    z13 = vsubq_s16(tmp7, z3);

    row5 = vaddq_s16(z13, z2);                  /* phase 6 */
    row3 = vsubq_s16(z13, z2);
    row1 = vaddq_s16(z11, z4);
    row7 = vsubq_s16(z11, z4);

    /* Write the eight result vectors back over the input block. */
    vst1q_s16(data + 0 * DCTSIZE, row0);
    vst1q_s16(data + 1 * DCTSIZE, row1);
    vst1q_s16(data + 2 * DCTSIZE, row2);
    vst1q_s16(data + 3 * DCTSIZE, row3);
    vst1q_s16(data + 4 * DCTSIZE, row4);
    vst1q_s16(data + 5 * DCTSIZE, row5);
    vst1q_s16(data + 6 * DCTSIZE, row6);
    vst1q_s16(data + 7 * DCTSIZE, row7);
  }