jidctfst-mmi.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. /*
  2. * Loongson MMI optimizations for libjpeg-turbo
  3. *
  4. * Copyright (C) 2014-2015, 2018-2019, D. R. Commander. All Rights Reserved.
  5. * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  6. * All Rights Reserved.
  7. * Authors: LiuQingfa <liuqingfa-hf@loongson.cn>
  8. *
  9. * Based on the x86 SIMD extension for IJG JPEG library
  10. * Copyright (C) 1999-2006, MIYASAKA Masaru.
  11. *
  12. * This software is provided 'as-is', without any express or implied
  13. * warranty. In no event will the authors be held liable for any damages
  14. * arising from the use of this software.
  15. *
  16. * Permission is granted to anyone to use this software for any purpose,
  17. * including commercial applications, and to alter it and redistribute it
  18. * freely, subject to the following restrictions:
  19. *
  20. * 1. The origin of this software must not be misrepresented; you must not
  21. * claim that you wrote the original software. If you use this software
  22. * in a product, an acknowledgment in the product documentation would be
  23. * appreciated but is not required.
  24. * 2. Altered source versions must be plainly marked as such, and must not be
  25. * misrepresented as being the original software.
  26. * 3. This notice may not be removed or altered from any source distribution.
  27. */
  28. /* FAST INTEGER INVERSE DCT */
  29. #include "jsimd_mmi.h"
  /* Fixed-point setup.  Each FIX_* constant below is the quoted real value
   * multiplied by 2^CONST_BITS (256) and rounded to the nearest integer. */
  30. #define CONST_BITS 8
  /* PASS1_BITS is part of the final right shift applied in pass 2
   * (PASS1_BITS + 3). */
  31. #define PASS1_BITS 2
  32. #define FIX_1_082 ((short)277) /* FIX(1.082392200) */
  33. #define FIX_1_414 ((short)362) /* FIX(1.414213562) */
  34. #define FIX_1_847 ((short)473) /* FIX(1.847759065) */
  35. #define FIX_2_613 ((short)669) /* FIX(2.613125930) */
  /* Despite its name, FIX_1_613 is 669 - 768 = -99, i.e. roughly
   * FIX(-0.386874070).  The constant table stores its negation (+99), and
   * DO_IDCT_COMMON() subtracts z10 three times to make up the remaining -3. */
  36. #define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(3) */
  /* Operands are shifted left by PRE_MULTIPLY_SCALE_BITS before each
   * _mm_mulhi_pi16(), and the constants are shifted left by CONST_SHIFT (6).
   * Assuming _mm_mulhi_pi16() keeps the high 16 bits of each 16x16 product
   * (TODO confirm against the intrinsic header), the net effect is
   * (x * FIX_c) >> CONST_BITS. */
  37. #define PRE_MULTIPLY_SCALE_BITS 2
  38. #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
  /* Indices of the entries in const_value[].  The enumerator order must stay
   * in sync with the initializer order of that array; the PW_* and
   * PB_CENTERJSAMP macros pass these indices to get_const_value(). */
  39. enum const_index {
  40. index_PW_F1082,
  41. index_PW_F1414,
  42. index_PW_F1847,
  43. index_PW_MF1613,
  44. index_PB_CENTERJSAMP
  45. };
  /* Table of 64-bit packed constants, one entry per enum const_index value and
   * in the same order.  The multiplier constants are pre-shifted left by
   * CONST_SHIFT.  The index_PW_MF1613 slot holds the negation of FIX_1_613
   * (a positive value, since FIX_1_613 itself is negative).
   * NOTE(review): _uint64_set1_pi16()/_uint64_set1_pi8() are defined outside
   * this file; presumably they replicate the value into every 16-bit / 8-bit
   * lane of the 64-bit word -- confirm. */
  46. static uint64_t const_value[] = {
  47. _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
  48. _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
  49. _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
  50. _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
  51. _uint64_set1_pi8(CENTERJSAMPLE)
  52. };
  /* Shorthand accessors for the packed constants.  Each expands to
   * get_const_value() with the matching enum const_index value.
   * NOTE(review): get_const_value() is defined outside this file; presumably it
   * reads const_value[index] as an __m64 -- confirm. */
  53. #define PW_F1414 get_const_value(index_PW_F1414)
  54. #define PW_F1847 get_const_value(index_PW_F1847)
  55. #define PW_MF1613 get_const_value(index_PW_MF1613)
  56. #define PW_F1082 get_const_value(index_PW_F1082)
  57. #define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
  /* Evaluate to nonzero when the first 32 (test_m32_zero) or 64
   * (test_m64_zero) bits of the object named by the argument are all zero.
   * The argument must be an lvalue, because its address is taken and the
   * storage is re-read through an integer pointer.
   * NOTE(review): this is type punning through a pointer cast; whether it is
   * safe depends on how __m32/__m64 are defined and on the aliasing flags used
   * for the build -- confirm. */
  58. #define test_m32_zero(mm32) (!(*(uint32_t *)&mm32))
  59. #define test_m64_zero(mm64) (!(*(uint64_t *)&mm64))
  /* Odd-part butterfly and final output stage shared by both IDCT passes.
   *
   * Reads (from the expanding scope):  z10, z11, z12, z13 (odd-part sums and
   *   differences) and tmp0..tmp3 (even-part results).
   * Writes: out0..out7.
   * Clobbers: tmp4..tmp7, tmp10..tmp12 and z5.
   *
   * All arithmetic is on packed 16-bit lanes.  Every multiplication by a
   * constant is done by shifting the operand left by PRE_MULTIPLY_SCALE_BITS
   * and then calling _mm_mulhi_pi16() with a constant pre-shifted by
   * CONST_SHIFT. */
  60. #define DO_IDCT_COMMON() { \
  61. tmp7 = _mm_add_pi16(z11, z13); \
  62. \
  63. tmp11 = _mm_sub_pi16(z11, z13); \
  64. tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
  65. tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
  66. \
  67. tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
  68. tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
  69. \
  70. /* To avoid overflow... \
  71. * \
  72. * (Original) \
  73. * tmp12 = -2.613125930 * z10 + z5; \
  74. * \
  75. * (This implementation) \
  76. * tmp12 = (0.386874070 - 3) * z10 + z5; \
  77. * = 0.386874070 * z10 - z10 - z10 - z10 + z5; \
  78. */ \
  79. \
  80. z5 = _mm_add_pi16(tmp10, tmp12); \
  81. z5 = _mm_mulhi_pi16(z5, PW_F1847); \
  82. \
  83. tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
  84. tmp10 = _mm_sub_pi16(tmp10, z5); \
  85. tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
  86. tmp12 = _mm_sub_pi16(tmp12, z10); \
  87. tmp12 = _mm_sub_pi16(tmp12, z10); \
  88. tmp12 = _mm_sub_pi16(tmp12, z10); \
  89. tmp12 = _mm_add_pi16(tmp12, z5); \
  90. \
  91. /* Final output stage */ \
  92. \
  93. tmp6 = _mm_sub_pi16(tmp12, tmp7); \
  94. tmp5 = _mm_sub_pi16(tmp11, tmp6); \
  95. tmp4 = _mm_add_pi16(tmp10, tmp5); \
  96. \
  97. out0 = _mm_add_pi16(tmp0, tmp7); \
  98. out7 = _mm_sub_pi16(tmp0, tmp7); \
  99. out1 = _mm_add_pi16(tmp1, tmp6); \
  100. out6 = _mm_sub_pi16(tmp1, tmp6); \
  101. \
  102. out2 = _mm_add_pi16(tmp2, tmp5); \
  103. out5 = _mm_sub_pi16(tmp2, tmp5); \
  104. out4 = _mm_add_pi16(tmp3, tmp4); \
  105. out3 = _mm_sub_pi16(tmp3, tmp4); \
  106. }
  /* Pass 1 of the IDCT for one group of four columns.
   *
   * Uses from the expanding scope: inptr (coefficients), quantptr
   * (dequantization multipliers), wsptr (workspace destination), and the
   * tmp / z / out variables consumed by DO_IDCT_COMMON().
   *
   * Steps:
   *   1. Cheap pre-check: OR together the first 32 bits of coefficient rows 1
   *      and 2.  Only if that is zero, load all eight 64-bit row segments and
   *      OR rows 1..7 together.  If every AC term in these four columns is
   *      zero, the dequantized DC values are broadcast into four workspace
   *      rows and control jumps to the label nextcolumn<iter>.
   *   2. Otherwise dequantize (coefficient * multiplier), run the even part,
   *      the odd part and DO_IDCT_COMMON(), transpose the result and store it
   *      as four workspace rows of eight values each.
   *
   * `iter` is only pasted into the goto label name, so every expansion must
   * be followed by a matching `nextcolumn<iter>:` label in the caller.
   * NOTE(review): each 64-bit load/store is treated as four 16-bit lanes; this
   * assumes JCOEF and the multiplier type are 16 bits wide -- confirm. */
  107. #define DO_IDCT_PASS1(iter) { \
  108. __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
  109. __m64 quant0l, quant1l, quant2l, quant3l; \
  110. __m64 quant4l, quant5l, quant6l, quant7l; \
  111. __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
  112. __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
  113. __m32 col0a, col1a, mm0; \
  114. \
  115. col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
  116. col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
  117. mm0 = _mm_or_si32(col0a, col1a); \
  118. \
  119. if (test_m32_zero(mm0)) { \
  120. __m64 mm1, mm2; \
  121. \
  122. col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
  123. col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
  124. col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
  125. col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
  126. col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
  127. col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
  128. col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
  129. col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
  130. \
  131. mm1 = _mm_or_si64(col1l, col3l); \
  132. mm2 = _mm_or_si64(col2l, col4l); \
  133. mm1 = _mm_or_si64(mm1, col5l); \
  134. mm2 = _mm_or_si64(mm2, col6l); \
  135. mm1 = _mm_or_si64(mm1, col7l); \
  136. mm1 = _mm_or_si64(mm1, mm2); \
  137. \
  138. if (test_m64_zero(mm1)) { \
  139. __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
  140. \
  141. /* AC terms all zero */ \
  142. \
  143. quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
  144. \
  145. dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \
  146. \
  147. dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
  148. dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
  149. \
  150. row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
  151. row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
  152. row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
  153. row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
  154. \
  155. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
  156. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
  157. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
  158. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
  159. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
  160. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
  161. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
  162. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
  163. \
  164. goto nextcolumn##iter; \
  165. } \
  166. } \
  167. \
  168. /* Even part */ \
  169. \
  170. col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
  171. col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
  172. col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
  173. col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
  174. \
  175. quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
  176. quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
  177. quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
  178. quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
  179. \
  180. tmp0 = _mm_mullo_pi16(col0l, quant0l); \
  181. tmp1 = _mm_mullo_pi16(col2l, quant2l); \
  182. tmp2 = _mm_mullo_pi16(col4l, quant4l); \
  183. tmp3 = _mm_mullo_pi16(col6l, quant6l); \
  184. \
  185. tmp10 = _mm_add_pi16(tmp0, tmp2); \
  186. tmp11 = _mm_sub_pi16(tmp0, tmp2); \
  187. tmp13 = _mm_add_pi16(tmp1, tmp3); \
  188. \
  189. tmp12 = _mm_sub_pi16(tmp1, tmp3); \
  190. tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
  191. tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
  192. tmp12 = _mm_sub_pi16(tmp12, tmp13); \
  193. \
  194. tmp0 = _mm_add_pi16(tmp10, tmp13); \
  195. tmp3 = _mm_sub_pi16(tmp10, tmp13); \
  196. tmp1 = _mm_add_pi16(tmp11, tmp12); \
  197. tmp2 = _mm_sub_pi16(tmp11, tmp12); \
  198. \
  199. /* Odd part */ \
  200. \
  201. col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
  202. col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
  203. col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
  204. col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
  205. \
  206. quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
  207. quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
  208. quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
  209. quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
  210. \
  211. tmp4 = _mm_mullo_pi16(col1l, quant1l); \
  212. tmp5 = _mm_mullo_pi16(col3l, quant3l); \
  213. tmp6 = _mm_mullo_pi16(col5l, quant5l); \
  214. tmp7 = _mm_mullo_pi16(col7l, quant7l); \
  215. \
  216. z13 = _mm_add_pi16(tmp6, tmp5); \
  217. z10 = _mm_sub_pi16(tmp6, tmp5); \
  218. z11 = _mm_add_pi16(tmp4, tmp7); \
  219. z12 = _mm_sub_pi16(tmp4, tmp7); \
  220. \
  221. DO_IDCT_COMMON() \
  222. \
  223. /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
  224. /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
  225. /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
  226. /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
  227. \
  228. /* Transpose coefficients */ \
  229. \
  230. row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
  231. row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
  232. row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
  233. row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
  234. \
  235. row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
  236. row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
  237. row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
  238. row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
  239. \
  240. row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
  241. row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
  242. row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
  243. row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
  244. \
  245. row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
  246. row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
  247. row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
  248. row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
  249. \
  250. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
  251. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
  252. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
  253. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
  254. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
  255. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
  256. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
  257. _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
  258. }
  /* Pass 2 of the IDCT for one group of four workspace columns.
   *
   * Uses from the expanding scope: wsptr (workspace source), output_buf and
   * output_col (destination), and the tmp / z / out variables consumed by
   * DO_IDCT_COMMON().
   *
   * Steps:
   *   1. Load four 16-bit values from each of the eight workspace rows.
   *   2. Run the even part, the odd part and DO_IDCT_COMMON().
   *   3. Arithmetic-shift every result right by PASS1_BITS + 3.
   *   4. Pack pairs of results to bytes with _mm_packs_pi16() and add
   *      PB_CENTERJSAMP to every byte.
   *      NOTE(review): presumably the pack saturates to the signed 8-bit
   *      range, so that the byte-wise add of CENTERJSAMPLE yields the final
   *      unsigned sample -- confirm against the intrinsic header.
   *   5. Transpose and store eight bytes to each of the four output rows
   *      output_buf[ctr + 0..3], starting at byte offset output_col.
   *
   * `ctr` is the index of the first of the four output rows written. */
  259. #define DO_IDCT_PASS2(ctr) { \
  260. __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
  261. __m64 col0123a, col0123b, col0123c, col0123d; \
  262. __m64 col01l, col01h, col23l, col23h; \
  263. __m64 col0, col1, col2, col3; \
  264. __m64 row06, row17, row24, row35; \
  265. \
  266. row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
  267. row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
  268. row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
  269. row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
  270. row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
  271. row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
  272. row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
  273. row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
  274. \
  275. /* Even part */ \
  276. \
  277. tmp10 = _mm_add_pi16(row0l, row4l); \
  278. tmp11 = _mm_sub_pi16(row0l, row4l); \
  279. tmp13 = _mm_add_pi16(row2l, row6l); \
  280. \
  281. tmp12 = _mm_sub_pi16(row2l, row6l); \
  282. tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
  283. tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
  284. tmp12 = _mm_sub_pi16(tmp12, tmp13); \
  285. \
  286. tmp0 = _mm_add_pi16(tmp10, tmp13); \
  287. tmp3 = _mm_sub_pi16(tmp10, tmp13); \
  288. tmp1 = _mm_add_pi16(tmp11, tmp12); \
  289. tmp2 = _mm_sub_pi16(tmp11, tmp12); \
  290. \
  291. /* Odd part */ \
  292. \
  293. z13 = _mm_add_pi16(row5l, row3l); \
  294. z10 = _mm_sub_pi16(row5l, row3l); \
  295. z11 = _mm_add_pi16(row1l, row7l); \
  296. z12 = _mm_sub_pi16(row1l, row7l); \
  297. \
  298. DO_IDCT_COMMON() \
  299. \
  300. /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
  301. /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
  302. /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
  303. /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
  304. \
  305. out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
  306. out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
  307. out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
  308. out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
  309. out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
  310. out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
  311. out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
  312. out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
  313. \
  314. row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
  315. row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
  316. row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
  317. row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
  318. \
  319. row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
  320. row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
  321. row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
  322. row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
  323. \
  324. /* Transpose coefficients */ \
  325. \
  326. col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
  327. col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
  328. col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
  329. col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
  330. \
  331. col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
  332. col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
  333. col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
  334. col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
  335. \
  336. col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
  337. col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
  338. col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
  339. col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
  340. \
  341. _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
  342. _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
  343. _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
  344. _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
  345. }
  /*
   * Dequantize one block of DCTSIZE2 coefficients, apply the fast integer
   * inverse DCT, and store the result as 8 rows of 8 bytes.
   *
   * dct_table:  multiplier table; cast to ISLOW_MULT_TYPE * and multiplied
   *             element-wise with the coefficients.
   * coef_block: input coefficients (read only).
   * output_buf: array of output row pointers; rows 0..7 are written.
   * output_col: byte offset within each output row at which the 8 samples
   *             are stored.
   *
   * The tmp / z / out locals are not used directly here; they are the working
   * registers referenced by the DO_IDCT_* macros expanded below.
   */
  346. void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
  347. JSAMPARRAY output_buf, JDIMENSION output_col)
  348. {
  349. __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  350. __m64 tmp10, tmp11, tmp12, tmp13;
  351. __m64 out0, out1, out2, out3, out4, out5, out6, out7;
  352. __m64 z5, z10, z11, z12, z13;
  353. JCOEFPTR inptr;
  354. ISLOW_MULT_TYPE *quantptr;
  355. JCOEF *wsptr;
  356. JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
  357. /* Pass 1: process columns. */
  358. inptr = coef_block;
  359. quantptr = (ISLOW_MULT_TYPE *)dct_table;
  360. wsptr = workspace;
  /* Columns 0..3.  The nextcolumn1 label is the target of the all-AC-zero
   * shortcut inside the macro and must directly follow the expansion. */
  361. DO_IDCT_PASS1(1)
  362. nextcolumn1:
  /* Step to columns 4..7.  Pass 1 stores its output transposed, so the
   * workspace pointer advances by four whole rows. */
  363. inptr += 4;
  364. quantptr += 4;
  365. wsptr += DCTSIZE * 4;
  366. DO_IDCT_PASS1(2)
  367. nextcolumn2:
  368. /* Pass 2: process rows. */
  369. wsptr = workspace;
  /* The first expansion reads workspace columns 0..3 and writes output rows
   * 0..3; the second reads columns 4..7 and writes output rows 4..7. */
  370. DO_IDCT_PASS2(0)
  371. wsptr += 4;
  372. DO_IDCT_PASS2(4)
  373. }