jsimd_neon.S 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200
  1. /*
  2. * Armv7 Neon optimizations for libjpeg-turbo
  3. *
  4. * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
  5. * All Rights Reserved.
  6. * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
  7. * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
  8. * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
  9. * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
  10. * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
  11. *
  12. * This software is provided 'as-is', without any express or implied
  13. * warranty. In no event will the authors be held liable for any damages
  14. * arising from the use of this software.
  15. *
  16. * Permission is granted to anyone to use this software for any purpose,
  17. * including commercial applications, and to alter it and redistribute it
  18. * freely, subject to the following restrictions:
  19. *
  20. * 1. The origin of this software must not be misrepresented; you must not
  21. * claim that you wrote the original software. If you use this software
  22. * in a product, an acknowledgment in the product documentation would be
  23. * appreciated but is not required.
  24. * 2. Altered source versions must be plainly marked as such, and must not be
  25. * misrepresented as being the original software.
  26. * 3. This notice may not be removed or altered from any source distribution.
  27. */
  /*
   * Assembler set-up for the whole file.  On Linux/ELF builds an empty
   * .note.GNU-stack section is emitted first; everything that follows is
   * placed in .text and assembled as ARM-mode (.arm) code in unified syntax
   * with the Neon FPU instruction set enabled.
   */
  28. #if defined(__linux__) && defined(__ELF__)
  29. .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
  30. #endif
  31. .text
  32. .fpu neon
  /*
   * Instructions are accepted for Armv7-A, while .object_arch names armv4 for
   * the architecture recorded in the object file.
   * NOTE(review): .object_arch semantics taken from the GNU as manual, not
   * from anything visible in this file -- confirm before relying on it.
   */
  33. .arch armv7a
  34. .object_arch armv4
  35. .arm
  36. .syntax unified
  37. /*****************************************************************************/
  38. /* Supplementary macro for setting function attributes */
  /*
   * asm_function fname
   *   Emits the entry label for function 'fname' together with its symbol
   *   attributes:
   *     - __APPLE__ : the symbol is '_fname' (leading underscore), declared
   *                   with .private_extern and .globl.
   *     - otherwise : the symbol is 'fname', declared .global; when __ELF__
   *                   is defined it is additionally marked .hidden and typed
   *                   as %function.
   *   The macro only opens the function; it emits no prologue code.
   */
  39. .macro asm_function fname
  40. #ifdef __APPLE__
  41. .private_extern _\fname
  42. .globl _\fname
  43. _\fname:
  44. #else
  45. .global \fname
  46. #ifdef __ELF__
  47. .hidden \fname
  48. .type \fname, %function
  49. #endif
  50. \fname:
  51. #endif
  52. .endm
  /*
   * Bias used by the islow epilogue: it is broadcast with vmov.u8 and added
   * to the narrowed 8-bit results (the "signed->unsigned conversion").
   */
  53. #define CENTERJSAMPLE 128
  54. /*****************************************************************************/
  55. /*
  56. * Perform dequantization and inverse DCT on one block of coefficients.
  57. *
  58. * GLOBAL(void)
  59. * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
  60. * JSAMPARRAY output_buf, JDIMENSION output_col)
  61. */
  /*
   * Fixed-point multipliers: FIX_a_b is the real constant a.b scaled by 2^13
   * and rounded to an integer (e.g. 0.541196100 * 8192 = 4433.48 -> 4433).
   * The same scale is applied to the non-multiplied terms with the
   * "<< 13" / "#13" shifts in the reference macro and in the assembly.
   */
  62. #define FIX_0_298631336 (2446)
  63. #define FIX_0_390180644 (3196)
  64. #define FIX_0_541196100 (4433)
  65. #define FIX_0_765366865 (6270)
  66. #define FIX_0_899976223 (7373)
  67. #define FIX_1_175875602 (9633)
  68. #define FIX_1_501321110 (12299)
  69. #define FIX_1_847759065 (15137)
  70. #define FIX_1_961570560 (16069)
  71. #define FIX_2_053119869 (16819)
  72. #define FIX_2_562915447 (20995)
  73. #define FIX_3_072711026 (25172)
  /*
   * Pre-combined sums/differences of the constants above, so that each one
   * can be applied with a single multiply-accumulate.
   */
  74. #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
  75. #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
  76. #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
  77. #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
  78. #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
  79. #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
  80. #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
  81. #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
  82. /*
  83. * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
  84. * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
   *
   * This is a C statement macro kept for reference; nothing in the assembly
   * visible in this part of the file expands it.
   *
   * Inputs : xrow0..xrow7 - the eight samples of one 1-D transform.
   * Outputs: tmp0..tmp3 (odd part) and tmp10..tmp13 (even part).  These are
   *          NOT declared by the macro and must exist in the expanding scope,
   *          as must MULTIPLY, DCTELEM and JLONG.
   *
   * The temporaries q1..q7 are used in the same order, and for the same
   * intermediate values, as Neon registers q1..q7 in the hand-written passes
   * of jsimd_idct_islow_neon, so the macro can be read side by side with the
   * assembly.  Products are scaled by 2^13; the (row0 +/- row4) terms are
   * brought to the same scale with "<< 13".
   *
   * tmp2 and tmp11 are recovered as half the difference / half the sum of
   * (tmp11 + tmp2) and (tmp11 - tmp2), the two values actually computed.
  85. */
  86. #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
  87. DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  88. JLONG q1, q2, q3, q4, q5, q6, q7; \
  89. JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
  90. \
  91. /* 1-D iDCT input data */ \
  92. row0 = xrow0; \
  93. row1 = xrow1; \
  94. row2 = xrow2; \
  95. row3 = xrow3; \
  96. row4 = xrow4; \
  97. row5 = xrow5; \
  98. row6 = xrow6; \
  99. row7 = xrow7; \
  100. \
  101. q5 = row7 + row3; \
  102. q4 = row5 + row1; \
  103. q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
  104. MULTIPLY(q4, FIX_1_175875602); \
  105. q7 = MULTIPLY(q5, FIX_1_175875602) + \
  106. MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  107. q2 = MULTIPLY(row2, FIX_0_541196100) + \
  108. MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  109. q4 = q6; \
  110. q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
  111. q6 += MULTIPLY(row5, -FIX_2_562915447) + \
  112. MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  113. /* now we can use q1 (reloadable constants have been used up) */ \
  114. q1 = q3 + q2; \
  115. q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
  116. MULTIPLY(row1, -FIX_0_899976223); \
  117. q5 = q7; \
  118. q1 = q1 + q6; \
  119. q7 += MULTIPLY(row7, -FIX_0_899976223) + \
  120. MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  121. \
  122. /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  123. tmp11_plus_tmp2 = q1; \
  124. row1 = 0; \
  125. \
  126. q1 = q1 - q6; \
  127. q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
  128. MULTIPLY(row3, -FIX_2_562915447); \
  129. q1 = q1 - q6; \
  130. q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
  131. MULTIPLY(row6, FIX_0_541196100); \
  132. q3 = q3 - q2; \
  133. \
  134. /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  135. tmp11_minus_tmp2 = q1; \
  136. \
  137. q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
  138. q2 = q1 + q6; \
  139. q1 = q1 - q6; \
  140. \
  141. /* pick up the results */ \
  142. tmp0 = q4; \
  143. tmp1 = q5; \
  144. tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  145. tmp3 = q7; \
  146. tmp10 = q2; \
  147. tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  148. tmp12 = q3; \
  149. tmp13 = q1; \
  150. }
  /*
   * XFIX_* name the Neon scalar (D register lane) in which each constant
   * lives once the table below has been loaded into d0..d2.  They are used
   * as the by-scalar operand of vmull/vmlal/vmlsl, so the order of the .short
   * entries in the table must match these lane assignments exactly.
   */
  151. #define XFIX_0_899976223 d0[0]
  152. #define XFIX_0_541196100 d0[1]
  153. #define XFIX_2_562915447 d0[2]
  154. #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
  155. #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
  156. #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
  157. #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
  158. #define XFIX_1_175875602 d1[3]
  159. #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
  160. #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
  161. #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
  162. #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
  /*
   * Constant table: 12 halfwords (24 bytes), 16-byte aligned because it is
   * read with a :128 alignment hint.  It sits in .text immediately before the
   * function so it can be addressed with 'adr'.
   */
  163. .balign 16
  164. jsimd_idct_islow_neon_consts:
  165. .short FIX_0_899976223 /* d0[0] */
  166. .short FIX_0_541196100 /* d0[1] */
  167. .short FIX_2_562915447 /* d0[2] */
  168. .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
  169. .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
  170. .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
  171. .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
  172. .short FIX_1_175875602 /* d1[3] */
  /*
   * The d2 group starts 16 bytes into the table.  The function keeps a
   * pointer to it and re-reads these four values each time q1 (d2/d3) has
   * been used as a scratch register.
   */
  173. /* reloadable constants */
  174. .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
  175. .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
  176. .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
  177. .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
  /*
   * jsimd_idct_islow_neon
   *
   * Dequantizes one 8x8 block of 16-bit coefficients (coefficient * table
   * entry), applies the 1-D IDCT twice with a transpose in between, then
   * narrows to 8 bits with saturation, adds CENTERJSAMPLE and stores eight
   * 8-byte rows.
   *
   * In (see the .req aliases below):
   *   r0 = DCT_TABLE, r1 = COEF_BLOCK, r2 = OUTPUT_BUF, r3 = OUTPUT_COL
   * Each output row address is a pointer loaded from OUTPUT_BUF plus
   * OUTPUT_COL.
   *
   * Register usage: r0-r2 and ip are overwritten; r4/r5 are saved with
   * push/pop and d8-d15 with vpush/vpop; d0-d7 and d16-d31 are used freely.
   *
   * Local labels:
   *   1: pass 2, normal variant        3: right 4x8 half was sparse
   *   2: common epilogue               4: pass 2, right half entirely zero
   */
  178. asm_function jsimd_idct_islow_neon
  /* Argument registers, named after the prototype parameters. */
  179. DCT_TABLE .req r0
  180. COEF_BLOCK .req r1
  181. OUTPUT_BUF .req r2
  182. OUTPUT_COL .req r3
  /*
   * Scratch aliases for the store epilogue.  They share registers with the
   * arguments above (and TMP4 with the constant pointer in ip).
   */
  183. TMP1 .req r0
  184. TMP2 .req r1
  185. TMP3 .req r2
  186. TMP4 .req ip
  /* ROWnL / ROWnR: columns 0-3 / columns 4-7 of row n (see the table below). */
  187. ROW0L .req d16
  188. ROW0R .req d17
  189. ROW1L .req d18
  190. ROW1R .req d19
  191. ROW2L .req d20
  192. ROW2R .req d21
  193. ROW3L .req d22
  194. ROW3R .req d23
  195. ROW4L .req d24
  196. ROW4R .req d25
  197. ROW5L .req d26
  198. ROW5R .req d27
  199. ROW6L .req d28
  200. ROW6R .req d29
  201. ROW7L .req d30
  202. ROW7R .req d31
  203. /* Load and dequantize coefficients into Neon registers
  204. * with the following allocation:
  205. * 0 1 2 3 | 4 5 6 7
  206. * ---------+--------
  207. * 0 | d16 | d17 ( q8 )
  208. * 1 | d18 | d19 ( q9 )
  209. * 2 | d20 | d21 ( q10 )
  210. * 3 | d22 | d23 ( q11 )
  211. * 4 | d24 | d25 ( q12 )
  212. * 5 | d26 | d27 ( q13 )
  213. * 6 | d28 | d29 ( q14 )
  214. * 7 | d30 | d31 ( q15 )
  215. */
  /*
   * Loads of coefficients and of the dequantization table are interleaved
   * with the multiplies.  Two rows (32 bytes) are fetched per vld1; q0-q3 are
   * recycled for successive table rows.  Only the first three COEF_BLOCK
   * loads write back, so COEF_BLOCK ends up 96 bytes past the block start.
   */
  216. adr ip, jsimd_idct_islow_neon_consts
  217. vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
  218. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  219. vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
  220. vmul.s16 q8, q8, q0
  221. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  222. vmul.s16 q9, q9, q1
  223. vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
  224. vmul.s16 q10, q10, q2
  225. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  226. vmul.s16 q11, q11, q3
  227. vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
  228. vmul.s16 q12, q12, q0
  229. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  230. vmul.s16 q14, q14, q2
  231. vmul.s16 q13, q13, q1
  /*
   * This load transfers 32 bytes into d0-d3, whereas the constant table holds
   * 12 halfwords (24 bytes): d3 receives the 8 bytes that follow the table.
   * No XFIX_* alias refers to d3.  Afterwards ip is advanced by 16 so that it
   * addresses the d2 ("reloadable") group for the later {d2} reloads.
   */
  232. vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
  233. add ip, ip, #16
  234. vmul.s16 q15, q15, q3
  235. vpush {d8 - d15} /* save Neon registers */
  236. /* 1-D IDCT, pass 1, left 4x8 half */
  /*
   * The Neon sequence below follows REF_1D_IDCT on columns 0-3; results are
   * narrowed with a rounding shift of 11.
   *
   * Interleaved with it, the integer instructions (push/ldrd/orr/orrs/pop)
   * inspect columns 4-7 of the raw, not yet dequantized, coefficient block:
   *   - "-96" undoes the three post-increments applied to COEF_BLOCK;
   *   - "2 * (4 + r * 8)" is the byte offset of column 4 in row r;
   *   - each ldrd fetches 8 bytes, i.e. the four 16-bit values of columns 4-7.
   * Rows 1..7 are ORed together in r0 (which no longer needs to hold
   * DCT_TABLE); row 0 is handled separately at the end.
   */
  237. vadd.s16 d4, ROW7L, ROW3L
  238. vadd.s16 d5, ROW5L, ROW1L
  239. vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
  240. vmlal.s16 q6, d5, XFIX_1_175875602
  241. vmull.s16 q7, d4, XFIX_1_175875602
  242. /* Check for the zero coefficients in the right 4x8 half */
  243. push {r4, r5}
  244. vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
  245. vsubl.s16 q3, ROW0L, ROW4L
  246. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
  247. vmull.s16 q2, ROW2L, XFIX_0_541196100
  248. vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
  249. orr r0, r4, r5
  250. vmov q4, q6
  251. vmlsl.s16 q6, ROW5L, XFIX_2_562915447
  252. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
  253. vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
  254. vshl.s32 q3, q3, #13
  255. orr r0, r0, r4
  256. vmlsl.s16 q4, ROW1L, XFIX_0_899976223
  257. orr r0, r0, r5
  /* From here on q1 (d2/d3) is scratch: the d2 constants are gone until reloaded. */
  258. vadd.s32 q1, q3, q2
  259. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
  260. vmov q5, q7
  261. vadd.s32 q1, q1, q6
  262. orr r0, r0, r4
  263. vmlsl.s16 q7, ROW7L, XFIX_0_899976223
  264. orr r0, r0, r5
  265. vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
  266. vrshrn.s32 ROW1L, q1, #11
  267. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
  268. vsub.s32 q1, q1, q6
  269. vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
  270. orr r0, r0, r4
  271. vmlsl.s16 q5, ROW3L, XFIX_2_562915447
  272. orr r0, r0, r5
  273. vsub.s32 q1, q1, q6
  274. vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
  275. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
  276. vmlal.s16 q6, ROW6L, XFIX_0_541196100
  277. vsub.s32 q3, q3, q2
  278. orr r0, r0, r4
  279. vrshrn.s32 ROW6L, q1, #11
  280. orr r0, r0, r5
  281. vadd.s32 q1, q3, q5
  282. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
  283. vsub.s32 q3, q3, q5
  284. vaddl.s16 q5, ROW0L, ROW4L
  285. orr r0, r0, r4
  286. vrshrn.s32 ROW2L, q1, #11
  287. orr r0, r0, r5
  288. vrshrn.s32 ROW5L, q3, #11
  289. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
  290. vshl.s32 q5, q5, #13
  291. vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
  292. orr r0, r0, r4
  293. vadd.s32 q2, q5, q6
  /*
   * Flag-setting OR: Z is set iff columns 4-7 of rows 1..7 are all zero.
   * Nothing up to the 'beq 3f' below modifies the flags again.
   */
  294. orrs r0, r0, r5
  295. vsub.s32 q1, q5, q6
  296. vadd.s32 q6, q2, q7
  297. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
  298. vsub.s32 q2, q2, q7
  299. vadd.s32 q5, q1, q4
  /*
   * r0 := OR of row 0, columns 4-7 (plain orr, flags preserved).  It is only
   * examined on the sparse path, by the 'cmp r0, #0' at label 3.
   */
  300. orr r0, r4, r5
  301. vsub.s32 q3, q1, q4
  302. pop {r4, r5}
  303. vrshrn.s32 ROW7L, q2, #11
  304. vrshrn.s32 ROW3L, q5, #11
  305. vrshrn.s32 ROW0L, q6, #11
  306. vrshrn.s32 ROW4L, q3, #11
  307. beq 3f /* Go to do some special handling for the sparse
  308. right 4x8 half */
  309. /* 1-D IDCT, pass 1, right 4x8 half */
  /*
   * Same computation as the left half, applied to ROWnR.  The vtrn
   * instructions mixed in transpose the already finished left half.
   */
  310. vld1.s16 {d2}, [ip, :64] /* reload constants */
  311. vadd.s16 d10, ROW7R, ROW3R
  312. vadd.s16 d8, ROW5R, ROW1R
  313. /* Transpose left 4x8 half */
  314. vtrn.16 ROW6L, ROW7L
  315. vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
  316. vmlal.s16 q6, d8, XFIX_1_175875602
  317. vtrn.16 ROW2L, ROW3L
  318. vmull.s16 q7, d10, XFIX_1_175875602
  319. vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
  320. vtrn.16 ROW0L, ROW1L
  321. vsubl.s16 q3, ROW0R, ROW4R
  322. vmull.s16 q2, ROW2R, XFIX_0_541196100
  323. vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
  324. vtrn.16 ROW4L, ROW5L
  325. vmov q4, q6
  326. vmlsl.s16 q6, ROW5R, XFIX_2_562915447
  327. vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
  328. vtrn.32 ROW1L, ROW3L
  329. vshl.s32 q3, q3, #13
  330. vmlsl.s16 q4, ROW1R, XFIX_0_899976223
  331. vtrn.32 ROW4L, ROW6L
  332. vadd.s32 q1, q3, q2
  333. vmov q5, q7
  334. vadd.s32 q1, q1, q6
  335. vtrn.32 ROW0L, ROW2L
  336. vmlsl.s16 q7, ROW7R, XFIX_0_899976223
  337. vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
  338. vrshrn.s32 ROW1R, q1, #11
  339. vtrn.32 ROW5L, ROW7L
  340. vsub.s32 q1, q1, q6
  341. vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
  342. vmlsl.s16 q5, ROW3R, XFIX_2_562915447
  343. vsub.s32 q1, q1, q6
  344. vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
  345. vmlal.s16 q6, ROW6R, XFIX_0_541196100
  346. vsub.s32 q3, q3, q2
  347. vrshrn.s32 ROW6R, q1, #11
  348. vadd.s32 q1, q3, q5
  349. vsub.s32 q3, q3, q5
  350. vaddl.s16 q5, ROW0R, ROW4R
  351. vrshrn.s32 ROW2R, q1, #11
  352. vrshrn.s32 ROW5R, q3, #11
  353. vshl.s32 q5, q5, #13
  354. vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
  355. vadd.s32 q2, q5, q6
  356. vsub.s32 q1, q5, q6
  357. vadd.s32 q6, q2, q7
  358. vsub.s32 q2, q2, q7
  359. vadd.s32 q5, q1, q4
  360. vsub.s32 q3, q1, q4
  361. vrshrn.s32 ROW7R, q2, #11
  362. vrshrn.s32 ROW3R, q5, #11
  363. vrshrn.s32 ROW0R, q6, #11
  364. vrshrn.s32 ROW4R, q3, #11
  365. /* Transpose right 4x8 half */
  366. vtrn.16 ROW6R, ROW7R
  367. vtrn.16 ROW2R, ROW3R
  368. vtrn.16 ROW0R, ROW1R
  369. vtrn.16 ROW4R, ROW5R
  370. vtrn.32 ROW1R, ROW3R
  371. vtrn.32 ROW4R, ROW6R
  372. vtrn.32 ROW0R, ROW2R
  373. vtrn.32 ROW5R, ROW7R
  /*
   * Pass 2.  No vswp is issued after the two 4x8 transposes; instead the
   * "A <-> B" annotations mark every operand where register B is used in the
   * role that A would have (and vice versa).  The sums (row7 + row3) and
   * (row5 + row1) of the reference macro are not formed first here: each
   * addend is multiplied and accumulated individually.
   * Results are narrowed with a plain shift by 16; the remaining rounding
   * shift by 2 and the saturation to 8 bits are done at label 2.
   */
  374. 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
  375. vld1.s16 {d2}, [ip, :64] /* reload constants */
  376. vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
  377. vmlal.s16 q6, ROW1L, XFIX_1_175875602
  378. vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
  379. vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
  380. vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
  381. vmlal.s16 q7, ROW3L, XFIX_1_175875602
  382. vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
  383. vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
  384. vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
  385. vmull.s16 q2, ROW2L, XFIX_0_541196100
  386. vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
  387. vmov q4, q6
  388. vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
  389. vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
  390. vshl.s32 q3, q3, #13
  391. vmlsl.s16 q4, ROW1L, XFIX_0_899976223
  392. vadd.s32 q1, q3, q2
  393. vmov q5, q7
  394. vadd.s32 q1, q1, q6
  395. vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
  396. vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
  397. vshrn.s32 ROW1L, q1, #16
  398. vsub.s32 q1, q1, q6
  399. vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
  400. vmlsl.s16 q5, ROW3L, XFIX_2_562915447
  401. vsub.s32 q1, q1, q6
  402. vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
  403. vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
  404. vsub.s32 q3, q3, q2
  405. vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
  406. vadd.s32 q1, q3, q5
  407. vsub.s32 q3, q3, q5
  408. vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
  409. vshrn.s32 ROW2L, q1, #16
  410. vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
  411. vshl.s32 q5, q5, #13
  412. vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
  413. vadd.s32 q2, q5, q6
  414. vsub.s32 q1, q5, q6
  415. vadd.s32 q6, q2, q7
  416. vsub.s32 q2, q2, q7
  417. vadd.s32 q5, q1, q4
  418. vsub.s32 q3, q1, q4
  419. vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
  420. vshrn.s32 ROW3L, q5, #16
  421. vshrn.s32 ROW0L, q6, #16
  422. vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
  423. /* 1-D IDCT, pass 2, right 4x8 half */
  424. vld1.s16 {d2}, [ip, :64] /* reload constants */
  425. vmull.s16 q6, ROW5R, XFIX_1_175875602
  426. vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
  427. vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
  428. vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
  429. vmull.s16 q7, ROW7R, XFIX_1_175875602
  430. vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
  431. vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
  432. vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
  433. vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
  434. vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
  435. vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
  436. vmov q4, q6
  437. vmlsl.s16 q6, ROW5R, XFIX_2_562915447
  438. vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
  439. vshl.s32 q3, q3, #13
  440. vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
  441. vadd.s32 q1, q3, q2
  442. vmov q5, q7
  443. vadd.s32 q1, q1, q6
  444. vmlsl.s16 q7, ROW7R, XFIX_0_899976223
  445. vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
  446. vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
  447. vsub.s32 q1, q1, q6
  448. vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
  449. vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
  450. vsub.s32 q1, q1, q6
  451. vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
  452. vmlal.s16 q6, ROW6R, XFIX_0_541196100
  453. vsub.s32 q3, q3, q2
  454. vshrn.s32 ROW6R, q1, #16
  455. vadd.s32 q1, q3, q5
  456. vsub.s32 q3, q3, q5
  457. vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
  458. vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
  459. vshrn.s32 ROW5R, q3, #16
  460. vshl.s32 q5, q5, #13
  461. vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
  462. vadd.s32 q2, q5, q6
  463. vsub.s32 q1, q5, q6
  464. vadd.s32 q6, q2, q7
  465. vsub.s32 q2, q2, q7
  466. vadd.s32 q5, q1, q4
  467. vsub.s32 q3, q1, q4
  468. vshrn.s32 ROW7R, q2, #16
  469. vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
  470. vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
  471. vshrn.s32 ROW4R, q3, #16
  /*
   * Common epilogue (reached by fall-through and from the sparse pass 2).
   * Each q register is narrowed to 8 bits with a saturating, rounding shift
   * by 2, so q8-q11 end up holding all 64 samples.  The samples are then
   * transposed, CENTERJSAMPLE is added to every byte and the rows are stored.
   * d8-d15 are restored in the middle of the sequence; q0 is reused for the
   * bias only after the constants are no longer needed.
   */
  472. 2: /* Descale to 8-bit and range limit */
  473. vqrshrn.s16 d16, q8, #2
  474. vqrshrn.s16 d17, q9, #2
  475. vqrshrn.s16 d18, q10, #2
  476. vqrshrn.s16 d19, q11, #2
  477. vpop {d8 - d15} /* restore Neon registers */
  478. vqrshrn.s16 d20, q12, #2
  479. /* Transpose the final 8-bit samples and do signed->unsigned conversion */
  480. vtrn.16 q8, q9
  481. vqrshrn.s16 d21, q13, #2
  482. vqrshrn.s16 d22, q14, #2
  483. vmov.u8 q0, #(CENTERJSAMPLE)
  484. vqrshrn.s16 d23, q15, #2
  485. vtrn.8 d16, d17
  486. vtrn.8 d18, d19
  487. vadd.u8 q8, q8, q0
  488. vadd.u8 q9, q9, q0
  489. vtrn.16 q10, q11
  490. /* Store results to the output buffer */
  /*
   * Row pointers are fetched from OUTPUT_BUF two at a time (four for the last
   * group, which also overwrites OUTPUT_BUF itself via TMP3 and ip via TMP4),
   * offset by OUTPUT_COL, and each receives one 8-byte D register.
   */
  491. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  492. add TMP1, TMP1, OUTPUT_COL
  493. add TMP2, TMP2, OUTPUT_COL
  494. vst1.8 {d16}, [TMP1]
  495. vtrn.8 d20, d21
  496. vst1.8 {d17}, [TMP2]
  497. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  498. add TMP1, TMP1, OUTPUT_COL
  499. add TMP2, TMP2, OUTPUT_COL
  500. vst1.8 {d18}, [TMP1]
  501. vadd.u8 q10, q10, q0
  502. vst1.8 {d19}, [TMP2]
  503. ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
  504. add TMP1, TMP1, OUTPUT_COL
  505. add TMP2, TMP2, OUTPUT_COL
  506. add TMP3, TMP3, OUTPUT_COL
  507. add TMP4, TMP4, OUTPUT_COL
  508. vtrn.8 d22, d23
  509. vst1.8 {d20}, [TMP1]
  510. vadd.u8 q11, q11, q0
  511. vst1.8 {d21}, [TMP2]
  512. vst1.8 {d22}, [TMP3]
  513. vst1.8 {d23}, [TMP4]
  514. bx lr
  /*
   * Sparse path: columns 4-7 of rows 1..7 were all zero in the input, so the
   * pass 1 computation for the right half is skipped.  The left half is
   * transposed exactly as on the normal path, and row 0 of the right half is
   * shifted left by 2 (PASS1_BITS) in place of a full pass 1.
   */
  515. 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
  516. /* Transpose left 4x8 half */
  517. vtrn.16 ROW6L, ROW7L
  518. vtrn.16 ROW2L, ROW3L
  519. vtrn.16 ROW0L, ROW1L
  520. vtrn.16 ROW4L, ROW5L
  521. vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
  522. vtrn.32 ROW1L, ROW3L
  523. vtrn.32 ROW4L, ROW6L
  524. vtrn.32 ROW0L, ROW2L
  525. vtrn.32 ROW5L, ROW7L
  /* r0 still holds the OR of row 0, columns 4-7, computed during pass 1. */
  526. cmp r0, #0
  527. beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
  528. pass */
  529. /* Only row 0 is non-zero for the right 4x8 half */
  /*
   * Broadcast the four scaled row-0 values: lane k of ROW0R is replicated
   * into ROWkR and ROW(k+4)R.  ROW0R itself is overwritten last because it is
   * the source of every other vdup.
   */
  530. vdup.s16 ROW1R, ROW0R[1]
  531. vdup.s16 ROW2R, ROW0R[2]
  532. vdup.s16 ROW3R, ROW0R[3]
  533. vdup.s16 ROW4R, ROW0R[0]
  534. vdup.s16 ROW5R, ROW0R[1]
  535. vdup.s16 ROW6R, ROW0R[2]
  536. vdup.s16 ROW7R, ROW0R[3]
  537. vdup.s16 ROW0R, ROW0R[0]
  538. b 1b /* Go to 'normal' second pass */
  /*
   * Sparse pass 2: every ROWnR input is known to be zero, so all terms that
   * would read a ROWnR register are dropped and only ROWnL registers are
   * read; the ROWnR registers are written as outputs only.  The widening
   * "vshll #13" replaces the vsubl/vaddl + vshl pair of the normal variant,
   * and q6 is doubled once so that a single subtraction replaces two.
   */
  539. 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
  540. vld1.s16 {d2}, [ip, :64] /* reload constants */
  541. vmull.s16 q6, ROW1L, XFIX_1_175875602
  542. vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
  543. vmull.s16 q7, ROW3L, XFIX_1_175875602
  544. vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
  545. vmull.s16 q2, ROW2L, XFIX_0_541196100
  546. vshll.s16 q3, ROW0L, #13
  547. vmov q4, q6
  548. vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
  549. vmlsl.s16 q4, ROW1L, XFIX_0_899976223
  550. vadd.s32 q1, q3, q2
  551. vmov q5, q7
  552. vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
  553. vadd.s32 q1, q1, q6
  554. vadd.s32 q6, q6, q6
  555. vmlsl.s16 q5, ROW3L, XFIX_2_562915447
  556. vshrn.s32 ROW1L, q1, #16
  557. vsub.s32 q1, q1, q6
  558. vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
  559. vsub.s32 q3, q3, q2
  560. vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
  561. vadd.s32 q1, q3, q5
  562. vsub.s32 q3, q3, q5
  563. vshll.s16 q5, ROW0L, #13
  564. vshrn.s32 ROW2L, q1, #16
  565. vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
  566. vadd.s32 q2, q5, q6
  567. vsub.s32 q1, q5, q6
  568. vadd.s32 q6, q2, q7
  569. vsub.s32 q2, q2, q7
  570. vadd.s32 q5, q1, q4
  571. vsub.s32 q3, q1, q4
  572. vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
  573. vshrn.s32 ROW3L, q5, #16
  574. vshrn.s32 ROW0L, q6, #16
  575. vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
  576. /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
  577. vld1.s16 {d2}, [ip, :64] /* reload constants */
  578. vmull.s16 q6, ROW5L, XFIX_1_175875602
  579. vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
  580. vmull.s16 q7, ROW7L, XFIX_1_175875602
  581. vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
  582. vmull.s16 q2, ROW6L, XFIX_0_541196100
  583. vshll.s16 q3, ROW4L, #13
  584. vmov q4, q6
  585. vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
  586. vmlsl.s16 q4, ROW5L, XFIX_0_899976223
  587. vadd.s32 q1, q3, q2
  588. vmov q5, q7
  589. vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
  590. vadd.s32 q1, q1, q6
  591. vadd.s32 q6, q6, q6
  592. vmlsl.s16 q5, ROW7L, XFIX_2_562915447
  593. vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
  594. vsub.s32 q1, q1, q6
  595. vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
  596. vsub.s32 q3, q3, q2
  597. vshrn.s32 ROW6R, q1, #16
  598. vadd.s32 q1, q3, q5
  599. vsub.s32 q3, q3, q5
  600. vshll.s16 q5, ROW4L, #13
  601. vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
  602. vshrn.s32 ROW5R, q3, #16
  603. vadd.s32 q2, q5, q6
  604. vsub.s32 q1, q5, q6
  605. vadd.s32 q6, q2, q7
  606. vsub.s32 q2, q2, q7
  607. vadd.s32 q5, q1, q4
  608. vsub.s32 q3, q1, q4
  609. vshrn.s32 ROW7R, q2, #16
  610. vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
  611. vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
  612. vshrn.s32 ROW4R, q3, #16
  613. b 2b /* Go to epilogue */
  /* Release the register aliases; the next function declares its own set. */
  614. .unreq DCT_TABLE
  615. .unreq COEF_BLOCK
  616. .unreq OUTPUT_BUF
  617. .unreq OUTPUT_COL
  618. .unreq TMP1
  619. .unreq TMP2
  620. .unreq TMP3
  621. .unreq TMP4
  622. .unreq ROW0L
  623. .unreq ROW0R
  624. .unreq ROW1L
  625. .unreq ROW1R
  626. .unreq ROW2L
  627. .unreq ROW2R
  628. .unreq ROW3L
  629. .unreq ROW3R
  630. .unreq ROW4L
  631. .unreq ROW4R
  632. .unreq ROW5L
  633. .unreq ROW5R
  634. .unreq ROW6L
  635. .unreq ROW6R
  636. .unreq ROW7L
  637. .unreq ROW7R
  638. /*****************************************************************************/
  639. /*
  640. * jsimd_idct_ifast_neon
  641. *
  642. * This function contains a fast, not so accurate integer implementation of
  643. * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
  644. * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
  645. * function from jidctfst.c
  646. *
  647. * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
  648. * But in the Arm Neon case some extra additions are required because the
  649. * VQDMULH instruction can't handle constants larger than 1. So expressions
  650. * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
  651. * which introduces an extra addition. Overall, there are 6 extra additions
  652. * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
  653. */
  /*
   * Lane aliases for the four ifast multipliers; the function loads the table
   * below into d0 and passes these lanes as the scalar operand of VQDMULH.
   */
  654. #define XFIX_1_082392200 d0[0]
  655. #define XFIX_1_414213562 d0[1]
  656. #define XFIX_1_847759065 d0[2]
  657. #define XFIX_2_613125930 d0[3]
  /*
   * Each entry is written as (n * 128 - k * 128), where n / 256 approximates
   * the multiplier and k is 256 (i.e. 1.0) or, for 2.613125930, 512 (i.e. 2.0).
   * Only the remaining fraction is stored, scaled by 128 * 256 = 2^15, so it
   * fits a signed halfword.  The integer part that was removed is put back by
   * separate vadd instructions in the function body (x, or x + x for the last
   * constant).
   * NOTE(review): the 2^15 scale matches the VQDMULH definition in the Arm
   * architecture manual (doubling multiply, high half) -- confirm there.
   */
  658. .balign 16
  659. jsimd_idct_ifast_neon_consts:
  660. .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
  661. .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
  662. .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
  663. .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
  664. asm_function jsimd_idct_ifast_neon
  665. DCT_TABLE .req r0
  666. COEF_BLOCK .req r1
  667. OUTPUT_BUF .req r2
  668. OUTPUT_COL .req r3
  669. TMP1 .req r0
  670. TMP2 .req r1
  671. TMP3 .req r2
  672. TMP4 .req ip
  673. /* Load and dequantize coefficients into Neon registers
  674. * with the following allocation:
  675. * 0 1 2 3 | 4 5 6 7
  676. * ---------+--------
  677. * 0 | d16 | d17 ( q8 )
  678. * 1 | d18 | d19 ( q9 )
  679. * 2 | d20 | d21 ( q10 )
  680. * 3 | d22 | d23 ( q11 )
  681. * 4 | d24 | d25 ( q12 )
  682. * 5 | d26 | d27 ( q13 )
  683. * 6 | d28 | d29 ( q14 )
  684. * 7 | d30 | d31 ( q15 )
  685. */
  686. adr ip, jsimd_idct_ifast_neon_consts
  687. vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
  688. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  689. vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
  690. vmul.s16 q8, q8, q0
  691. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  692. vmul.s16 q9, q9, q1
  693. vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
  694. vmul.s16 q10, q10, q2
  695. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  696. vmul.s16 q11, q11, q3
  697. vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
  698. vmul.s16 q12, q12, q0
  699. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  700. vmul.s16 q14, q14, q2
  701. vmul.s16 q13, q13, q1
  702. vld1.16 {d0}, [ip, :64] /* load constants */
  703. vmul.s16 q15, q15, q3
  704. vpush {d8 - d13} /* save Neon registers */
  705. /* 1-D IDCT, pass 1 */
  706. vsub.s16 q2, q10, q14
  707. vadd.s16 q14, q10, q14
  708. vsub.s16 q1, q11, q13
  709. vadd.s16 q13, q11, q13
  710. vsub.s16 q5, q9, q15
  711. vadd.s16 q15, q9, q15
  712. vqdmulh.s16 q4, q2, XFIX_1_414213562
  713. vqdmulh.s16 q6, q1, XFIX_2_613125930
  714. vadd.s16 q3, q1, q1
  715. vsub.s16 q1, q5, q1
  716. vadd.s16 q10, q2, q4
  717. vqdmulh.s16 q4, q1, XFIX_1_847759065
  718. vsub.s16 q2, q15, q13
  719. vadd.s16 q3, q3, q6
  720. vqdmulh.s16 q6, q2, XFIX_1_414213562
  721. vadd.s16 q1, q1, q4
  722. vqdmulh.s16 q4, q5, XFIX_1_082392200
  723. vsub.s16 q10, q10, q14
  724. vadd.s16 q2, q2, q6
  725. vsub.s16 q6, q8, q12
  726. vadd.s16 q12, q8, q12
  727. vadd.s16 q9, q5, q4
  728. vadd.s16 q5, q6, q10
  729. vsub.s16 q10, q6, q10
  730. vadd.s16 q6, q15, q13
  731. vadd.s16 q8, q12, q14
  732. vsub.s16 q3, q6, q3
  733. vsub.s16 q12, q12, q14
  734. vsub.s16 q3, q3, q1
  735. vsub.s16 q1, q9, q1
  736. vadd.s16 q2, q3, q2
  737. vsub.s16 q15, q8, q6
  738. vadd.s16 q1, q1, q2
  739. vadd.s16 q8, q8, q6
  740. vadd.s16 q14, q5, q3
  741. vsub.s16 q9, q5, q3
  742. vsub.s16 q13, q10, q2
  743. vadd.s16 q10, q10, q2
  744. /* Transpose */
  745. vtrn.16 q8, q9
  746. vsub.s16 q11, q12, q1
  747. vtrn.16 q14, q15
  748. vadd.s16 q12, q12, q1
  749. vtrn.16 q10, q11
  750. vtrn.16 q12, q13
  751. vtrn.32 q9, q11
  752. vtrn.32 q12, q14
  753. vtrn.32 q8, q10
  754. vtrn.32 q13, q15
  755. vswp d28, d21
  756. vswp d26, d19
  757. /* 1-D IDCT, pass 2 */
  758. vsub.s16 q2, q10, q14
  759. vswp d30, d23
  760. vadd.s16 q14, q10, q14
  761. vswp d24, d17
  762. vsub.s16 q1, q11, q13
  763. vadd.s16 q13, q11, q13
  764. vsub.s16 q5, q9, q15
  765. vadd.s16 q15, q9, q15
  766. vqdmulh.s16 q4, q2, XFIX_1_414213562
  767. vqdmulh.s16 q6, q1, XFIX_2_613125930
  768. vadd.s16 q3, q1, q1
  769. vsub.s16 q1, q5, q1
  770. vadd.s16 q10, q2, q4
  771. vqdmulh.s16 q4, q1, XFIX_1_847759065
  772. vsub.s16 q2, q15, q13
  773. vadd.s16 q3, q3, q6
  774. vqdmulh.s16 q6, q2, XFIX_1_414213562
  775. vadd.s16 q1, q1, q4
  776. vqdmulh.s16 q4, q5, XFIX_1_082392200
  777. vsub.s16 q10, q10, q14
  778. vadd.s16 q2, q2, q6
  779. vsub.s16 q6, q8, q12
  780. vadd.s16 q12, q8, q12
  781. vadd.s16 q9, q5, q4
  782. vadd.s16 q5, q6, q10
  783. vsub.s16 q10, q6, q10
  784. vadd.s16 q6, q15, q13
  785. vadd.s16 q8, q12, q14
  786. vsub.s16 q3, q6, q3
  787. vsub.s16 q12, q12, q14
  788. vsub.s16 q3, q3, q1
  789. vsub.s16 q1, q9, q1
  790. vadd.s16 q2, q3, q2
  791. vsub.s16 q15, q8, q6
  792. vadd.s16 q1, q1, q2
  793. vadd.s16 q8, q8, q6
  794. vadd.s16 q14, q5, q3
  795. vsub.s16 q9, q5, q3
  796. vsub.s16 q13, q10, q2
  797. vpop {d8 - d13} /* restore Neon registers */
  798. vadd.s16 q10, q10, q2
  799. vsub.s16 q11, q12, q1
  800. vadd.s16 q12, q12, q1
  801. /* Descale to 8-bit and range limit */
  802. vmov.u8 q0, #0x80
  803. vqshrn.s16 d16, q8, #5
  804. vqshrn.s16 d17, q9, #5
  805. vqshrn.s16 d18, q10, #5
  806. vqshrn.s16 d19, q11, #5
  807. vqshrn.s16 d20, q12, #5
  808. vqshrn.s16 d21, q13, #5
  809. vqshrn.s16 d22, q14, #5
  810. vqshrn.s16 d23, q15, #5
  811. vadd.u8 q8, q8, q0
  812. vadd.u8 q9, q9, q0
  813. vadd.u8 q10, q10, q0
  814. vadd.u8 q11, q11, q0
  815. /* Transpose the final 8-bit samples */
  816. vtrn.16 q8, q9
  817. vtrn.16 q10, q11
  818. vtrn.32 q8, q10
  819. vtrn.32 q9, q11
  820. vtrn.8 d16, d17
  821. vtrn.8 d18, d19
  822. /* Store results to the output buffer */
  823. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  824. add TMP1, TMP1, OUTPUT_COL
  825. add TMP2, TMP2, OUTPUT_COL
  826. vst1.8 {d16}, [TMP1]
  827. vst1.8 {d17}, [TMP2]
  828. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  829. add TMP1, TMP1, OUTPUT_COL
  830. add TMP2, TMP2, OUTPUT_COL
  831. vst1.8 {d18}, [TMP1]
  832. vtrn.8 d20, d21
  833. vst1.8 {d19}, [TMP2]
  834. ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
  835. add TMP1, TMP1, OUTPUT_COL
  836. add TMP2, TMP2, OUTPUT_COL
  837. add TMP3, TMP3, OUTPUT_COL
  838. add TMP4, TMP4, OUTPUT_COL
  839. vst1.8 {d20}, [TMP1]
  840. vtrn.8 d22, d23
  841. vst1.8 {d21}, [TMP2]
  842. vst1.8 {d22}, [TMP3]
  843. vst1.8 {d23}, [TMP4]
  844. bx lr
  845. .unreq DCT_TABLE
  846. .unreq COEF_BLOCK
  847. .unreq OUTPUT_BUF
  848. .unreq OUTPUT_COL
  849. .unreq TMP1
  850. .unreq TMP2
  851. .unreq TMP3
  852. .unreq TMP4
  853. /*****************************************************************************/
  854. /*
  855. * jsimd_extrgb_ycc_convert_neon
  856. * jsimd_extbgr_ycc_convert_neon
  857. * jsimd_extrgbx_ycc_convert_neon
  858. * jsimd_extbgrx_ycc_convert_neon
  859. * jsimd_extxbgr_ycc_convert_neon
  860. * jsimd_extxrgb_ycc_convert_neon
  861. *
  862. * Colorspace conversion RGB -> YCbCr
  863. */
  864. .macro do_store size
  865. .if \size == 8
  866. vst1.8 {d20}, [Y]!
  867. vst1.8 {d21}, [U]!
  868. vst1.8 {d22}, [V]!
  869. .elseif \size == 4
  870. vst1.8 {d20[0]}, [Y]!
  871. vst1.8 {d20[1]}, [Y]!
  872. vst1.8 {d20[2]}, [Y]!
  873. vst1.8 {d20[3]}, [Y]!
  874. vst1.8 {d21[0]}, [U]!
  875. vst1.8 {d21[1]}, [U]!
  876. vst1.8 {d21[2]}, [U]!
  877. vst1.8 {d21[3]}, [U]!
  878. vst1.8 {d22[0]}, [V]!
  879. vst1.8 {d22[1]}, [V]!
  880. vst1.8 {d22[2]}, [V]!
  881. vst1.8 {d22[3]}, [V]!
  882. .elseif \size == 2
  883. vst1.8 {d20[4]}, [Y]!
  884. vst1.8 {d20[5]}, [Y]!
  885. vst1.8 {d21[4]}, [U]!
  886. vst1.8 {d21[5]}, [U]!
  887. vst1.8 {d22[4]}, [V]!
  888. vst1.8 {d22[5]}, [V]!
  889. .elseif \size == 1
  890. vst1.8 {d20[6]}, [Y]!
  891. vst1.8 {d21[6]}, [U]!
  892. vst1.8 {d22[6]}, [V]!
  893. .else
  894. .error unsupported macroblock size
  895. .endif
  896. .endm
  897. .macro do_load bpp, size
  898. .if \bpp == 24
  899. .if \size == 8
  900. vld3.8 {d10, d11, d12}, [RGB]!
  901. pld [RGB, #128]
  902. .elseif \size == 4
  903. vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
  904. vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
  905. vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
  906. vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
  907. .elseif \size == 2
  908. vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
  909. vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
  910. .elseif \size == 1
  911. vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
  912. .else
  913. .error unsupported macroblock size
  914. .endif
  915. .elseif \bpp == 32
  916. .if \size == 8
  917. vld4.8 {d10, d11, d12, d13}, [RGB]!
  918. pld [RGB, #128]
  919. .elseif \size == 4
  920. vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
  921. vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
  922. vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
  923. vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
  924. .elseif \size == 2
  925. vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
  926. vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
  927. .elseif \size == 1
  928. vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
  929. .else
  930. .error unsupported macroblock size
  931. .endif
  932. .else
  933. .error unsupported bpp
  934. .endif
  935. .endm
  936. .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
  937. /*
  938. * 2-stage pipelined RGB->YCbCr conversion
  939. */
  940. .macro do_rgb_to_yuv_stage1
  941. vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
  942. vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
  943. vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
  944. vmull.u16 q7, d4, d0[0]
  945. vmlal.u16 q7, d6, d0[1]
  946. vmlal.u16 q7, d8, d0[2]
  947. vmull.u16 q8, d5, d0[0]
  948. vmlal.u16 q8, d7, d0[1]
  949. vmlal.u16 q8, d9, d0[2]
  950. vrev64.32 q9, q1
  951. vrev64.32 q13, q1
  952. vmlsl.u16 q9, d4, d0[3]
  953. vmlsl.u16 q9, d6, d1[0]
  954. vmlal.u16 q9, d8, d1[1]
  955. vmlsl.u16 q13, d5, d0[3]
  956. vmlsl.u16 q13, d7, d1[0]
  957. vmlal.u16 q13, d9, d1[1]
  958. vrev64.32 q14, q1
  959. vrev64.32 q15, q1
  960. vmlal.u16 q14, d4, d1[1]
  961. vmlsl.u16 q14, d6, d1[2]
  962. vmlsl.u16 q14, d8, d1[3]
  963. vmlal.u16 q15, d5, d1[1]
  964. vmlsl.u16 q15, d7, d1[2]
  965. vmlsl.u16 q15, d9, d1[3]
  966. .endm
  967. .macro do_rgb_to_yuv_stage2
  968. vrshrn.u32 d20, q7, #16
  969. vrshrn.u32 d21, q8, #16
  970. vshrn.u32 d22, q9, #16
  971. vshrn.u32 d23, q13, #16
  972. vshrn.u32 d24, q14, #16
  973. vshrn.u32 d25, q15, #16
  974. vmovn.u16 d20, q10 /* d20 = y */
  975. vmovn.u16 d21, q11 /* d21 = u */
  976. vmovn.u16 d22, q12 /* d22 = v */
  977. .endm
  978. .macro do_rgb_to_yuv
  979. do_rgb_to_yuv_stage1
  980. do_rgb_to_yuv_stage2
  981. .endm
  982. .macro do_rgb_to_yuv_stage2_store_load_stage1
  983. vrshrn.u32 d20, q7, #16
  984. vrshrn.u32 d21, q8, #16
  985. vshrn.u32 d22, q9, #16
  986. vrev64.32 q9, q1
  987. vshrn.u32 d23, q13, #16
  988. vrev64.32 q13, q1
  989. vshrn.u32 d24, q14, #16
  990. vshrn.u32 d25, q15, #16
  991. do_load \bpp, 8
  992. vmovn.u16 d20, q10 /* d20 = y */
  993. vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
  994. vmovn.u16 d21, q11 /* d21 = u */
  995. vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
  996. vmovn.u16 d22, q12 /* d22 = v */
  997. vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
  998. vmull.u16 q7, d4, d0[0]
  999. vmlal.u16 q7, d6, d0[1]
  1000. vmlal.u16 q7, d8, d0[2]
  1001. vst1.8 {d20}, [Y]!
  1002. vmull.u16 q8, d5, d0[0]
  1003. vmlal.u16 q8, d7, d0[1]
  1004. vmlal.u16 q8, d9, d0[2]
  1005. vmlsl.u16 q9, d4, d0[3]
  1006. vmlsl.u16 q9, d6, d1[0]
  1007. vmlal.u16 q9, d8, d1[1]
  1008. vst1.8 {d21}, [U]!
  1009. vmlsl.u16 q13, d5, d0[3]
  1010. vmlsl.u16 q13, d7, d1[0]
  1011. vmlal.u16 q13, d9, d1[1]
  1012. vrev64.32 q14, q1
  1013. vrev64.32 q15, q1
  1014. vmlal.u16 q14, d4, d1[1]
  1015. vmlsl.u16 q14, d6, d1[2]
  1016. vmlsl.u16 q14, d8, d1[3]
  1017. vst1.8 {d22}, [V]!
  1018. vmlal.u16 q15, d5, d1[1]
  1019. vmlsl.u16 q15, d7, d1[2]
  1020. vmlsl.u16 q15, d9, d1[3]
  1021. .endm
  1022. .balign 16
  1023. jsimd_\colorid\()_ycc_neon_consts:
  1024. .short 19595, 38470, 7471, 11059
  1025. .short 21709, 32768, 27439, 5329
  1026. .short 32767, 128, 32767, 128
  1027. .short 32767, 128, 32767, 128
  1028. asm_function jsimd_\colorid\()_ycc_convert_neon
  1029. OUTPUT_WIDTH .req r0
  1030. INPUT_BUF .req r1
  1031. OUTPUT_BUF .req r2
  1032. OUTPUT_ROW .req r3
  1033. NUM_ROWS .req r4
  1034. OUTPUT_BUF0 .req r5
  1035. OUTPUT_BUF1 .req r6
  1036. OUTPUT_BUF2 .req OUTPUT_BUF
  1037. RGB .req r7
  1038. Y .req r8
  1039. U .req r9
  1040. V .req r10
  1041. N .req ip
  1042. /* Load constants to d0, d1, d2, d3 */
  1043. adr ip, jsimd_\colorid\()_ycc_neon_consts
  1044. vld1.16 {d0, d1, d2, d3}, [ip, :128]
  1045. /* Save Arm registers and handle input arguments */
  1046. push {r4, r5, r6, r7, r8, r9, r10, lr}
  1047. ldr NUM_ROWS, [sp, #(4 * 8)]
  1048. ldr OUTPUT_BUF0, [OUTPUT_BUF]
  1049. ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
  1050. ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
  1051. .unreq OUTPUT_BUF
  1052. /* Save Neon registers */
  1053. vpush {d8 - d15}
  1054. /* Outer loop over scanlines */
  1055. cmp NUM_ROWS, #1
  1056. blt 9f
  1057. 0:
  1058. ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
  1059. ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
  1060. mov N, OUTPUT_WIDTH
  1061. ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
  1062. add OUTPUT_ROW, OUTPUT_ROW, #1
  1063. ldr RGB, [INPUT_BUF], #4
  1064. /* Inner loop over pixels */
  1065. subs N, N, #8
  1066. blt 3f
  1067. do_load \bpp, 8
  1068. do_rgb_to_yuv_stage1
  1069. subs N, N, #8
  1070. blt 2f
  1071. 1:
  1072. do_rgb_to_yuv_stage2_store_load_stage1
  1073. subs N, N, #8
  1074. bge 1b
  1075. 2:
  1076. do_rgb_to_yuv_stage2
  1077. do_store 8
  1078. tst N, #7
  1079. beq 8f
  1080. 3:
  1081. tst N, #4
  1082. beq 3f
  1083. do_load \bpp, 4
  1084. 3:
  1085. tst N, #2
  1086. beq 4f
  1087. do_load \bpp, 2
  1088. 4:
  1089. tst N, #1
  1090. beq 5f
  1091. do_load \bpp, 1
  1092. 5:
  1093. do_rgb_to_yuv
  1094. tst N, #4
  1095. beq 6f
  1096. do_store 4
  1097. 6:
  1098. tst N, #2
  1099. beq 7f
  1100. do_store 2
  1101. 7:
  1102. tst N, #1
  1103. beq 8f
  1104. do_store 1
  1105. 8:
  1106. subs NUM_ROWS, NUM_ROWS, #1
  1107. bgt 0b
  1108. 9:
  1109. /* Restore all registers and return */
  1110. vpop {d8 - d15}
  1111. pop {r4, r5, r6, r7, r8, r9, r10, pc}
  1112. .unreq OUTPUT_WIDTH
  1113. .unreq OUTPUT_ROW
  1114. .unreq INPUT_BUF
  1115. .unreq NUM_ROWS
  1116. .unreq OUTPUT_BUF0
  1117. .unreq OUTPUT_BUF1
  1118. .unreq OUTPUT_BUF2
  1119. .unreq RGB
  1120. .unreq Y
  1121. .unreq U
  1122. .unreq V
  1123. .unreq N
  1124. .purgem do_rgb_to_yuv
  1125. .purgem do_rgb_to_yuv_stage1
  1126. .purgem do_rgb_to_yuv_stage2
  1127. .purgem do_rgb_to_yuv_stage2_store_load_stage1
  1128. .endm
  1129. /*--------------------------------- id ----- bpp R G B */
  1130. generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
  1131. generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
  1132. generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
  1133. generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
  1134. generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
  1135. generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
  1136. .purgem do_load
  1137. .purgem do_store