; jidctflt-sse2.asm
;
; Floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see jidctflt.c for more details.
  %include "jsimdext.inc"
  %include "jdct.inc"

; --------------------------------------------------------------------------
; Helper macros used for the 4x4 float transposes in pass 1.
;
; Both take two XMM operands and overwrite %1.  With the four floats of %1
; written as (0 1 2 3) and those of %2 as (4 5 6 7):
;
;   UNPCKLPS2  leaves %1 = (0 1 4 5)   -- low halves of both operands
;   UNPCKHPS2  leaves %1 = (2 3 6 7)   -- high halves of both operands
;
; The shufps immediate is four 2-bit element selectors; the two low
; selectors pick from %1 and the two high selectors pick from %2:
;   0x44 = 01_00_01_00b  -> elements 0,1 of %1 then 0,1 of %2
;   0xEE = 11_10_11_10b  -> elements 2,3 of %1 then 2,3 of %2

%macro UNPCKLPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

%macro UNPCKHPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro
  ; --------------------------------------------------------------------------
  ; Constant table for jsimd_idct_float_sse2.
  ;
  ; Every PD_* entry is one single-precision value repeated four times so
  ; that it fills a whole xmmword and can be used directly as a packed
  ; mulps/addps memory operand.  PB_CENTERJSAMP is sixteen bytes, one per
  ; packed output sample.  The code reaches these labels through
  ; GOTOFF(ebx, ...).
  ;
  ;   PD_1_414        (~sqrt(2)) scales the (in2 - in6) and (z11 - z13) terms
  ;   PD_1_847        scales (z10 + z12) to form z5
  ;   PD_1_082        scales z12
  ;   PD_M2_613       scales z10 (negative constant)
  ;   PD_RNDINT_MAGIC added to each pass-2 result; the code then keeps only
  ;                   the low word of every dword as the rounded, descaled
  ;                   integer sample
  ;   PB_CENTERJSAMP  added to the packed signed bytes to re-centre them

    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd  100663296.0  ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

    ALIGNZ      32

; --------------------------------------------------------------------------
  ; Code section.  Everything below is assembled as 32-bit x86 code.
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;

; Argument accessors.  (b) is the unaligned frame base, i.e. the address of
; the slot holding the caller's saved ebp; the return address sits at
; (b) + 4 and the arguments follow it.
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Locals, all addressed relative to the re-aligned ebp set up in the
; function prologue:
;   [original_ebp]  slot in which the prologue stores the unaligned frame base
;   wk(0..WK_NUM-1) xmmword scratch slots directly below ebp
;   workspace       DCTSIZE2 FAST_FLOATs directly below wk(0)
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM         2
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]
  ; --------------------------------------------------------------------------
  ; jsimd_idct_float_sse2 -- dequantize one coefficient block and apply a
  ; floating-point inverse DCT, writing centred byte samples to the output
  ; rows.
  ;
  ; Arguments (on the caller's stack, read through the frame base):
  ;   dct_table   multiplier table; read with mulps as packed floats
  ;   coef_block  input coefficients; read four at a time with movq and
  ;               sign-extended from words to dwords before conversion
  ;   output_buf  array of row pointers (JSAMPROW)
  ;   output_col  offset added to every row pointer before storing
  ;
  ; Register usage:
  ;   saved/restored : ebp, ebx, esi, edi
  ;   overwritten    : eax, ecx, edx, xmm0-xmm7, flags
  ;   ebx holds the GOT address (GET_GOT) for the whole routine, except
  ;   between PUSHPIC/POPPIC in pass 2 where it is borrowed as a row pointer.
  ;
  ; Stack frame:
  ;   esp is rounded down to a SIZEOF_XMMWORD boundary so that wk() and
  ;   workspace can be accessed with movaps.  (This presumes that
  ;   DCTSIZE2 * SIZEOF_FAST_FLOAT is a multiple of SIZEOF_XMMWORD; both
  ;   are defined outside this file.)
  ;
  ; Structure:
  ;   Pass 1 (.columnloop) handles four input columns per iteration
  ;     (DCTSIZE/4 iterations): dequantize, 1-D IDCT, transpose, store floats
  ;     into workspace.  Unless NO_ZERO_COLUMN_TEST_FLOAT_SSE is defined, a
  ;     group of four columns whose AC terms are all zero takes a shortcut
  ;     that just replicates the dequantized DC values.
  ;   Pass 2 (.rowloop) handles four output rows per iteration: 1-D IDCT on
  ;     the workspace, round/descale to integers, pack to bytes, add
  ;     PB_CENTERJSAMP, transpose and store eight samples per row.
  ;
  ; The DWBLOCK/MMBLOCK/XMMBLOCK addressing macros and the GET_GOT, GOTOFF,
  ; PUSHPIC, POPPIC, ALIGNX helpers are defined outside this file; the
  ; element layouts quoted in the comments below come from the original
  ; register-content annotations.
  ; --------------------------------------------------------------------------

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    ; Prologue: remember the unaligned frame base, realign the stack, and
    ; park the frame base at [ebp] so the epilogue can undo the alignment.
    push        ebp
    mov         eax, esp                     ; eax = original ebp
                                             ; (unaligned frame base)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; becomes [original_ebp]
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [workspace]             ; reserve wk[] and workspace[]
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.

    ; eax is expected to still hold the frame base loaded in the prologue,
    ; which is why the reload below is commented out.
    ; NOTE(review): this relies on GET_GOT not modifying eax -- confirm
    ; against jsimdext.inc.
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
    mov         ecx, DCTSIZE/4                   ; ctr
    ALIGNX      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Cheap early-out: if the first dword of row 1 or row 2 is non-zero
    ; there is AC energy, so go straight to the full transform.
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; Full test: OR together rows 1..7 of the four columns in this group.
    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5
    ; Saturating pack maps every non-zero word to a non-zero byte, so the
    ; low dword now summarizes all four words.
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero

    ; Only the DC row matters: dequantize it and replicate each column's
    ; value across the eight workspace entries for that column.
    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
    ALIGNX      16, 7
%endif
.columnDCT:

    ; -- Even part

    ; Load rows 0/2/4/6, sign-extend words to dwords (duplicate each word
    ; with punpcklwd, then arithmetic shift right), convert to float and
    ; dequantize.
    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    ; tmp2/tmp3 are spilled to wk[] to free registers for the odd part.
    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    ; Swap: pull tmp2/tmp3 back out of wk[] and reuse the same two slots
    ; for the half-transposed rows 6/7 until phase 2 needs them.
    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    ; Advance input and multiplier pointers by four columns, and the
    ; workspace pointer by the four transposed rows just written.
    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         ecx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block

    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    ; eax was clobbered by the zero-column test, so reload the frame base;
    ; it is then replaced by output_col for the rest of the pass.
    mov         eax, [original_ebp]
    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]
    mov         ecx, DCTSIZE/4                     ; ctr
    ALIGNX      16, 7
.rowloop:

    ; -- Even part

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; Float -> integer without a convert instruction: after adding
    ; PD_RNDINT_MAGIC the low word of each dword holds the rounded result
    ; (already divided by 8); "**" below marks the unwanted high word.
    ; xmm3 becomes a mask that keeps only the low word of every dword.
    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1  ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1  ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1  ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1  ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    ; Interleave two results per register: mask one to its low words,
    ; shift the other into the high words, then OR them together.
    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    ; The magic constant and the word mask are rebuilt here because the
    ; registers that held them (xmm1, xmm3) were reused just above.
    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2  ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2  ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2  ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2  ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]

    ; Pack words to bytes with signed saturation, then add the centring
    ; offset to every byte (paddb wraps modulo 256).
    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword,
    ; where movq can store them.
    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

    ; ebx is borrowed as a second row pointer while the four rows are
    ; stored; no GOTOFF reference may appear until POPPIC.
    PUSHPIC     ebx                     ; save GOT address

    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

    POPPIC      ebx                     ; restore GOT address

    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         edi, byte 4*SIZEOF_JSAMPROW
    dec         ecx                            ; ctr
    jnz         near .rowloop

    ; Epilogue: restore callee-saved registers, then undo the stack
    ; realignment through the frame base parked at [ebp] by the prologue.
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32