jidctflt-sse2.asm 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. ;
  2. ; Floating-point IDCT (64-bit SSE & SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
  6. ; Copyright (C) 2018, Matthias Räncker.
  7. ; Copyright (C) 2023, Aliaksiej Kandracienka.
  8. ;
  9. ; Based on the x86 SIMD extension for IJG JPEG library
  10. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  11. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  12. ;
  13. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  14. ;
  15. ; This file contains a floating-point implementation of the inverse DCT
  16. ; (Discrete Cosine Transform). The following code is based directly on
  17. ; the IJG's original jidctflt.c; see jidctflt.c for more details.
  18. %include "jsimdext.inc"
  19. %include "jdct.inc"
  20. ; --------------------------------------------------------------------------
  21. %macro UNPCKLPS2 2 ; in: %1=(0 1 2 3), %2=(4 5 6 7) -> out: %1=(0 1 4 5), i.e. low halves of %1 and %2
  22. shufps %1, %2, (1 << 6) | (0 << 4) | (1 << 2) | 0 ; selector = 0x44: lanes 0,1 taken from %1, then lanes 0,1 from %2
  23. %endmacro
  24. %macro UNPCKHPS2 2 ; in: %1=(0 1 2 3), %2=(4 5 6 7) -> out: %1=(2 3 6 7), i.e. high halves of %1 and %2
  25. shufps %1, %2, (3 << 6) | (2 << 4) | (3 << 2) | 2 ; selector = 0xEE: lanes 2,3 taken from %1, then lanes 2,3 from %2
  26. %endmacro
  27. ; --------------------------------------------------------------------------
  28. SECTION SEG_CONST
  29. ALIGNZ 32
  30. GLOBAL_DATA(jconst_idct_float_sse2)
  31. EXTN(jconst_idct_float_sse2):
  32. PD_1_414 times 4 dd 1.414213562373095048801689 ; 4 packed floats; multiplier used for tmp12 (even part) and tmp11 (odd part) in both passes
  33. PD_1_847 times 4 dd 1.847759065022573512256366 ; 4 packed floats; multiplier applied to (z10 + z12) to form z5
  34. PD_1_082 times 4 dd 1.082392200292393968799446 ; 4 packed floats; multiplier applied to z12
  35. PD_M2_613 times 4 dd -2.613125929752753055713286 ; 4 packed floats; (negative) multiplier applied to z10
  36. PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) = 3*2^25; added in pass 2 so the low mantissa bits hold the value/8 rounded to an integer
  37. PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; 16 bytes of CENTERJSAMPLE; added to the packed byte samples in pass 2
  38. ALIGNZ 32
  39. ; --------------------------------------------------------------------------
  40. SECTION SEG_TEXT
  41. BITS 64
  42. ; --- jsimd_idct_float_sse2 ---
  43. ; Perform dequantization and inverse DCT on one block of coefficients.
  44. ; Pass 1 handles four columns per loop iteration and stores float results, transposed, in a stack workspace; pass 2 handles four rows per iteration, rounds (value/8), packs to bytes, adds CENTERJSAMPLE and stores 8 samples per output row.
  45. ; GLOBAL(void)
  46. ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
  47. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  48. ; Argument registers as used below (COLLECT_ARGS is a macro defined outside this file; presumably it places the arguments here):
  49. ; r10 = void *dct_table
  50. ; r11 = JCOEFPTR coef_block
  51. ; r12 = JSAMPARRAY output_buf
  52. ; r13d = JDIMENSION output_col
  53. %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
  54. ; xmmword wk[WK_NUM] -- scratch slots addressed downward from r15 (wk(WK_NUM-1) is the slot just below r15)
  55. %define WK_NUM 2
  56. %define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
  57. ; FAST_FLOAT workspace[DCTSIZE2] -- located directly below wk(0)
  58. align 32
  59. GLOBAL_FUNCTION(jsimd_idct_float_sse2)
  60. EXTN(jsimd_idct_float_sse2):
  61. ENDBR64
  62. push rbp
  63. mov rbp, rsp ; rbp = frame pointer; the epilogue uses it to locate the saved r15
  64. push r15 ; saved at [rbp-8]
  65. and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
  66. ; Allocate stack space for wk array. r15 is used to access it.
  67. mov r15, rsp ; r15 = aligned top of the wk[] area
  68. lea rsp, [workspace] ; rsp = lowest address of the wk[] + workspace[] area
  69. COLLECT_ARGS 4
  70. push rbx ; rbx is used as a row pointer in pass 2; popped before return
  71. ; ---- Pass 1: process columns from input, store into work array.
  72. mov rdx, r10 ; quantptr
  73. mov rsi, r11 ; inptr
  74. lea rdi, [workspace] ; FAST_FLOAT *wsptr
  75. mov rcx, DCTSIZE/4 ; ctr (each iteration handles 4 columns)
  76. .columnloop:
  77. %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
  78. mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; quick reject: OR a dword of row 1 with a dword of row 2 (DWBLOCK is defined outside this file)
  79. or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  80. jnz near .columnDCT ; nonzero AC term found: do the full column DCT
  81. movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; 64-bit loads of rows 1..7 for the current 4 columns
  82. movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  83. movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
  84. movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
  85. movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
  86. movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
  87. movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
  88. por xmm1, xmm2 ; OR all seven AC rows together into xmm1
  89. por xmm3, xmm4
  90. por xmm5, xmm6
  91. por xmm1, xmm3
  92. por xmm5, xmm7
  93. por xmm1, xmm5
  94. packsswb xmm1, xmm1 ; narrow words to bytes with signed saturation (a nonzero word stays nonzero)
  95. movd eax, xmm1 ; eax = low 4 bytes, one per column; the 32-bit write also clears the upper half of rax
  96. test rax, rax
  97. jnz short .columnDCT ; any AC term nonzero in these 4 columns: full DCT
  98. ; -- AC terms all zero
  99. movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
  100. punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
  101. psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
  102. cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
  103. mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize the DC terms with the row-0 multipliers
  104. movaps xmm1, xmm0
  105. movaps xmm2, xmm0
  106. movaps xmm3, xmm0
  107. shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
  108. shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
  109. shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
  110. shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
  111. movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 ; each broadcast DC value is written twice (8 floats per column)
  112. movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
  113. movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
  114. movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
  115. movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
  116. movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
  117. movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
  118. movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
  119. jmp near .nextcolumn
  120. %endif
  121. .columnDCT:
  122. ; -- Even part
  123. movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
  124. movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  125. movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
  126. movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
  127. punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) (word duplicated so psrad below sign-extends it)
  128. punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
  129. psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
  130. psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
  131. cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
  132. cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
  133. punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
  134. punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
  135. psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
  136. psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
  137. cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
  138. cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
  139. mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in0/in2/in4/in6 with the matching multiplier rows
  140. mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  141. mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  142. mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  143. movaps xmm4, xmm0
  144. movaps xmm5, xmm1
  145. subps xmm0, xmm2 ; xmm0=tmp11
  146. subps xmm1, xmm3 ; xmm1=in2-in6 (dequantized)
  147. addps xmm4, xmm2 ; xmm4=tmp10
  148. addps xmm5, xmm3 ; xmm5=tmp13
  149. mulps xmm1, [rel PD_1_414] ; xmm1=(in2-in6)*1.414213562
  150. subps xmm1, xmm5 ; xmm1=tmp12
  151. movaps xmm6, xmm4
  152. movaps xmm7, xmm0
  153. subps xmm4, xmm5 ; xmm4=tmp3
  154. subps xmm0, xmm1 ; xmm0=tmp2
  155. addps xmm6, xmm5 ; xmm6=tmp0
  156. addps xmm7, xmm1 ; xmm7=tmp1
  157. movaps XMMWORD [wk(1)], xmm4 ; tmp3 (spilled; reloaded in the final output stage)
  158. movaps XMMWORD [wk(0)], xmm0 ; tmp2 (spilled; reloaded in the final output stage)
  159. ; -- Odd part
  160. movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
  161. movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
  162. movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
  163. movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
  164. punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
  165. punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
  166. psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
  167. psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
  168. cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
  169. cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
  170. punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
  171. punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
  172. psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
  173. psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
  174. cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
  175. cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
  176. mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in1/in3/in5/in7 with the matching multiplier rows
  177. mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  178. mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  179. mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  180. movaps xmm4, xmm2
  181. movaps xmm0, xmm5
  182. addps xmm2, xmm1 ; xmm2=z11
  183. addps xmm5, xmm3 ; xmm5=z13
  184. subps xmm4, xmm1 ; xmm4=z12
  185. subps xmm0, xmm3 ; xmm0=z10
  186. movaps xmm1, xmm2
  187. subps xmm2, xmm5 ; xmm2=z11-z13
  188. addps xmm1, xmm5 ; xmm1=tmp7
  189. mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
  190. movaps xmm3, xmm0
  191. addps xmm0, xmm4 ; xmm0=z10+z12
  192. mulps xmm0, [rel PD_1_847] ; xmm0=z5
  193. mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
  194. mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
  195. addps xmm3, xmm0 ; xmm3=tmp12
  196. subps xmm4, xmm0 ; xmm4=tmp10
  197. ; -- Final output stage
  198. subps xmm3, xmm1 ; xmm3=tmp6
  199. movaps xmm5, xmm6
  200. movaps xmm0, xmm7
  201. addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
  202. addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
  203. subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
  204. subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
  205. subps xmm2, xmm3 ; xmm2=tmp5
  206. movaps xmm1, xmm6 ; transpose coefficients(phase 1)
  207. unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
  208. unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
  209. movaps xmm3, xmm0 ; transpose coefficients(phase 1)
  210. unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
  211. unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
  212. movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
  213. movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
  214. movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
  215. movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
  216. addps xmm4, xmm2 ; xmm4=tmp4
  217. movaps xmm0, xmm7
  218. movaps xmm3, xmm5
  219. addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
  220. addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
  221. subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
  222. subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
  223. movaps xmm2, xmm7 ; transpose coefficients(phase 1)
  224. unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
  225. unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
  226. movaps xmm4, xmm5 ; transpose coefficients(phase 1)
  227. unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
  228. unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
  229. movaps xmm3, xmm6 ; transpose coefficients(phase 2)
  230. UNPCKLPS2 xmm6, xmm7 ; xmm6=(00 10 20 30)
  231. UNPCKHPS2 xmm3, xmm7 ; xmm3=(01 11 21 31)
  232. movaps xmm0, xmm1 ; transpose coefficients(phase 2)
  233. UNPCKLPS2 xmm1, xmm2 ; xmm1=(02 12 22 32)
  234. UNPCKHPS2 xmm0, xmm2 ; xmm0=(03 13 23 33)
  235. movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
  236. movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
  237. movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 ; store the transposed results for input rows 0..3 of these 4 columns
  238. movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
  239. movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
  240. movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
  241. movaps xmm6, xmm5 ; transpose coefficients(phase 2)
  242. UNPCKLPS2 xmm5, xmm7 ; xmm5=(40 50 60 70)
  243. UNPCKHPS2 xmm6, xmm7 ; xmm6=(41 51 61 71)
  244. movaps xmm3, xmm4 ; transpose coefficients(phase 2)
  245. UNPCKLPS2 xmm4, xmm2 ; xmm4=(42 52 62 72)
  246. UNPCKHPS2 xmm3, xmm2 ; xmm3=(43 53 63 73)
  247. movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 ; store the transposed results for input rows 4..7 of these 4 columns
  248. movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
  249. movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
  250. movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
  251. .nextcolumn:
  252. add rsi, byte 4*SIZEOF_JCOEF ; coef_block (advance to the next 4 columns)
  253. add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr (advance to the next 4 columns)
  254. add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr (advance by 4*DCTSIZE floats)
  255. dec rcx ; ctr
  256. jnz near .columnloop
  257. ; -- Prefetch the next coefficient block (four prefetches at 32-byte strides; rsi has already advanced 4 coefficients per column-loop iteration)
  258. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
  259. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
  260. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
  261. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
  262. ; ---- Pass 2: process rows from work array, store into output array.
  263. lea rsi, [workspace] ; FAST_FLOAT *wsptr
  264. mov rdi, r12 ; (JSAMPROW *) output_buf
  265. mov eax, r13d ; rax = output_col (zero-extended); column offset for the stores below
  266. mov rcx, DCTSIZE/4 ; ctr (each iteration produces 4 output rows)
  267. .rowloop:
  268. ; -- Even part
  269. movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
  270. movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
  271. movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
  272. movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
  273. movaps xmm4, xmm0
  274. movaps xmm5, xmm1
  275. subps xmm0, xmm2 ; xmm0=tmp11
  276. subps xmm1, xmm3 ; xmm1=(block 2 - block 6)
  277. addps xmm4, xmm2 ; xmm4=tmp10
  278. addps xmm5, xmm3 ; xmm5=tmp13
  279. mulps xmm1, [rel PD_1_414] ; xmm1=(block 2 - block 6)*1.414213562
  280. subps xmm1, xmm5 ; xmm1=tmp12
  281. movaps xmm6, xmm4
  282. movaps xmm7, xmm0
  283. subps xmm4, xmm5 ; xmm4=tmp3
  284. subps xmm0, xmm1 ; xmm0=tmp2
  285. addps xmm6, xmm5 ; xmm6=tmp0
  286. addps xmm7, xmm1 ; xmm7=tmp1
  287. movaps XMMWORD [wk(1)], xmm4 ; tmp3 (spilled; reloaded after data0/1/6/7 are packed)
  288. movaps XMMWORD [wk(0)], xmm0 ; tmp2 (spilled; reloaded after data0/1/6/7 are packed)
  289. ; -- Odd part
  290. movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
  291. movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
  292. movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
  293. movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
  294. movaps xmm4, xmm2
  295. movaps xmm0, xmm5
  296. addps xmm2, xmm1 ; xmm2=z11
  297. addps xmm5, xmm3 ; xmm5=z13
  298. subps xmm4, xmm1 ; xmm4=z12
  299. subps xmm0, xmm3 ; xmm0=z10
  300. movaps xmm1, xmm2
  301. subps xmm2, xmm5 ; xmm2=z11-z13
  302. addps xmm1, xmm5 ; xmm1=tmp7
  303. mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
  304. movaps xmm3, xmm0
  305. addps xmm0, xmm4 ; xmm0=z10+z12
  306. mulps xmm0, [rel PD_1_847] ; xmm0=z5
  307. mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
  308. mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
  309. addps xmm3, xmm0 ; xmm3=tmp12
  310. subps xmm4, xmm0 ; xmm4=tmp10
  311. ; -- Final output stage
  312. subps xmm3, xmm1 ; xmm3=tmp6
  313. movaps xmm5, xmm6
  314. movaps xmm0, xmm7
  315. addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
  316. addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
  317. subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
  318. subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
  319. subps xmm2, xmm3 ; xmm2=tmp5
  320. movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=PD_RNDINT_MAGIC (3*2^25; float spacing there is 8, so adding it leaves the value/8, rounded, in the low mantissa bits)
  321. pcmpeqd xmm3, xmm3 ; xmm3=all ones
  322. psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
  323. addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
  324. addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
  325. addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
  326. addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
  327. pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
  328. pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
  329. pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
  330. pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
  331. por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
  332. por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
  333. movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
  334. movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
  335. addps xmm4, xmm2 ; xmm4=tmp4
  336. movaps xmm7, xmm1
  337. movaps xmm5, xmm3
  338. addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
  339. addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
  340. subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
  341. subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
  342. movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=PD_RNDINT_MAGIC (same rounding constant as above)
  343. pcmpeqd xmm4, xmm4 ; xmm4=all ones
  344. psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
  345. addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
  346. addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
  347. addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
  348. addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
  349. pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
  350. pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
  351. pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
  352. pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
  353. por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
  354. por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
  355. movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=PB_CENTERJSAMP (16 bytes of CENTERJSAMPLE)
  356. packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
  357. packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
  358. paddb xmm6, xmm2 ; add CENTERJSAMPLE to every (signed-saturated) byte
  359. paddb xmm1, xmm2 ; add CENTERJSAMPLE to every (signed-saturated) byte
  360. movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
  361. punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
  362. punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
  363. movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
  364. punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
  365. punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
  366. pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
  367. pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
  368. mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; row pointer 0 of this group (rdxp/rbxp and JSAMPROW are defined outside this file)
  369. mov rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] ; row pointer 2 of this group
  370. movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 ; store 8 samples of row 0 at column output_col
  371. movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 ; store 8 samples of row 2 at column output_col
  372. mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; row pointer 1 of this group
  373. mov rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] ; row pointer 3 of this group
  374. movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 ; store 8 samples of row 1 at column output_col
  375. movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 ; store 8 samples of row 3 at column output_col
  376. add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr (advance by 4 floats for the next 4 output rows)
  377. add rdi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 row pointers
  378. dec rcx ; ctr
  379. jnz near .rowloop
  380. pop rbx ; restore rbx saved in the prologue
  381. UNCOLLECT_ARGS 4
  382. lea rsp, [rbp-8] ; rsp -> saved r15; discards the aligned wk[]/workspace[] area
  383. pop r15
  384. pop rbp
  385. ret
  386. ; For some reason, the OS X linker does not honor the request to align the
  387. ; segment unless we do this.
  388. align 32