jcsample-sse2.asm 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. ;
  2. ; Downsampling (64-bit SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
  6. ; Copyright (C) 2018, Matthias Räncker.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  13. %include "jsimdext.inc"
  14. ; --------------------------------------------------------------------------
  15. SECTION SEG_TEXT
  16. BITS 64
  ;
  ; Downsample pixel values of a single component.
  ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
  ; without smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor,
  ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  ;                            JSAMPARRAY output_data);
  ;
  ; r10d = JDIMENSION image_width
  ; r11d = int max_v_samp_factor
  ; r12d = JDIMENSION v_samp_factor
  ; r13d = JDIMENSION width_in_blocks
  ; r14 = JSAMPARRAY input_data
  ; r15 = JSAMPARRAY output_data
  ;
  ; (Register assignment as relied upon after COLLECT_ARGS 6; that macro is
  ; defined in jsimdext.inc and is not visible in this file.)
  ;
  ; Processing:
  ;   1. output_cols = width_in_blocks * 8; nothing is done if it is zero.
  ;   2. If image_width < 2 * output_cols, each of the first max_v_samp_factor
  ;      input rows is padded on the right, up to 2 * output_cols samples, with
  ;      copies of its last real sample.
  ;   3. For each of v_samp_factor rows, every horizontal pair of input samples
  ;      is replaced by (a + b + bias) >> 1, with bias alternating 0, 1 across
  ;      output columns.
  ;
  ; Every 32-bit argument is read through its 32-bit register name only: the
  ; upper half of a register that carries a 32-bit argument is not defined by
  ; the calling convention, so it must not leak into a loop counter.
  ;
  ; Rows are accessed with movdqa, so every row pointer must be 16-byte aligned.
  ; NOTE(review): when output_cols is an odd multiple of 8, the last store of a
  ; row still writes a full XMMWORD (upper half zero) -- confirm the row
  ; buffers are allocated with that much slack.
  ;
  ; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6, xmm7, flags.
  ; NOTE(review): presumably COLLECT_ARGS/UNCOLLECT_ARGS preserve whatever the
  ; target ABI treats as callee-saved among these -- verify in jsimdext.inc.

      align       32
      GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

  EXTN(jsimd_h2v1_downsample_sse2):
      ENDBR64
      push        rbp
      mov         rbp, rsp
      COLLECT_ARGS 6

      mov         ecx, r13d
      shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
      jz          near .return

      mov         edx, r10d               ; rdx = image_width

      ; -- expand_right_edge

      push        rcx                     ; keep output_cols for the second phase
      shl         rcx, 1                  ; output_cols * 2
      sub         rcx, rdx                ; rcx = number of samples to append
      jle         short .expand_end       ; rows are already wide enough

      mov         eax, r11d               ; row counter = max_v_samp_factor
      test        eax, eax
      jle         short .expand_end

      cld                                 ; rep stosb must walk upward
      mov         rsi, r14                ; input_data
  .expandloop:
      push        rax                     ; al is reused for the fill value
      push        rcx                     ; rep stosb consumes rcx

      mov         rdip, JSAMPROW [rsi]
      add         rdi, rdx                ; rdi -> first sample past the image
      mov         al, JSAMPLE [rdi-1]     ; last real sample of this row

      rep stosb                           ; replicate it rcx times

      pop         rcx
      pop         rax

      add         rsi, byte SIZEOF_JSAMPROW
      dec         rax
      jg          short .expandloop

  .expand_end:
      pop         rcx                     ; output_cols

      ; -- h2v1_downsample

      mov         eax, r12d               ; rowctr
      test        eax, eax
      jle         near .return

      mov         edx, 0x00010000         ; bias pattern
      movd        xmm7, edx
      pcmpeqw     xmm6, xmm6
      pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
      psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

      mov         rsi, r14                ; input_data
      mov         rdi, r15                ; output_data
  .rowloop:
      ; rcx, rdi and rsi are reused as per-row cursors; the three pushes are
      ; matched by the three pops at the end of the row.
      push        rcx
      push        rdi
      push        rsi

      mov         rsip, JSAMPROW [rsi]    ; inptr
      mov         rdip, JSAMPROW [rdi]    ; outptr

      cmp         rcx, byte SIZEOF_XMMWORD
      jae         short .columnloop

  .columnloop_r8:
      ; Fewer than SIZEOF_XMMWORD output columns remain.  Only one XMMWORD of
      ; input is read, the second half is treated as zero, and rcx is forced
      ; to SIZEOF_XMMWORD so that the subtraction below ends the row.
      movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
      pxor        xmm1, xmm1
      mov         rcx, SIZEOF_XMMWORD
      jmp         short .downsample

  .columnloop:
      movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
      movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

  .downsample:
      ; Split each 16-bit lane into its low byte (pand with xmm6) and its high
      ; byte (psrlw), i.e. the two neighbouring samples, then add them.
      movdqa      xmm2, xmm0
      movdqa      xmm3, xmm1

      pand        xmm0, xmm6
      psrlw       xmm2, BYTE_BIT
      pand        xmm1, xmm6
      psrlw       xmm3, BYTE_BIT

      paddw       xmm0, xmm2
      paddw       xmm1, xmm3
      paddw       xmm0, xmm7              ; + bias
      paddw       xmm1, xmm7
      psrlw       xmm0, 1                 ; / 2
      psrlw       xmm1, 1

      packuswb    xmm0, xmm1              ; 2 x 8 words -> 16 output bytes

      movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

      sub         rcx, byte SIZEOF_XMMWORD    ; outcol
      add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
      add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
      cmp         rcx, byte SIZEOF_XMMWORD
      jae         short .columnloop
      test        rcx, rcx
      jnz         short .columnloop_r8

      pop         rsi
      pop         rdi
      pop         rcx

      add         rsi, byte SIZEOF_JSAMPROW   ; input_data
      add         rdi, byte SIZEOF_JSAMPROW   ; output_data
      dec         rax                         ; rowctr
      jg          near .rowloop

  .return:
      UNCOLLECT_ARGS 6
      pop         rbp
      ret
  129. ; --------------------------------------------------------------------------
  ;
  ; Downsample pixel values of a single component.
  ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
  ; without smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor,
  ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  ;                            JSAMPARRAY output_data);
  ;
  ; r10d = JDIMENSION image_width
  ; r11d = int max_v_samp_factor
  ; r12d = JDIMENSION v_samp_factor
  ; r13d = JDIMENSION width_in_blocks
  ; r14 = JSAMPARRAY input_data
  ; r15 = JSAMPARRAY output_data
  ;
  ; (Register assignment as relied upon after COLLECT_ARGS 6; that macro is
  ; defined in jsimdext.inc and is not visible in this file.)
  ;
  ; Processing:
  ;   1. output_cols = width_in_blocks * 8; nothing is done if it is zero.
  ;   2. If image_width < 2 * output_cols, each of the first max_v_samp_factor
  ;      input rows is padded on the right, up to 2 * output_cols samples, with
  ;      copies of its last real sample.
  ;   3. For each of v_samp_factor output rows, two consecutive input rows are
  ;      read and every 2x2 group of input samples is replaced by
  ;      (a + b + c + d + bias) >> 2, with bias alternating 1, 2 across output
  ;      columns.
  ;
  ; Every 32-bit argument is read and tested through its 32-bit register name
  ; only: the upper half of a register that carries a 32-bit argument is not
  ; defined by the calling convention, so it must not leak into a loop counter.
  ;
  ; Rows are accessed with movdqa, so every row pointer must be 16-byte aligned.
  ; NOTE(review): when output_cols is an odd multiple of 8, the last store of a
  ; row still writes a full XMMWORD (upper half zero) -- confirm the row
  ; buffers are allocated with that much slack.
  ;
  ; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, flags.
  ; NOTE(review): presumably COLLECT_ARGS/UNCOLLECT_ARGS preserve whatever the
  ; target ABI treats as callee-saved among these -- verify in jsimdext.inc.

      align       32
      GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

  EXTN(jsimd_h2v2_downsample_sse2):
      ENDBR64
      push        rbp
      mov         rbp, rsp
      COLLECT_ARGS 6

      mov         ecx, r13d
      shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
      jz          near .return

      mov         edx, r10d               ; rdx = image_width

      ; -- expand_right_edge

      push        rcx                     ; keep output_cols for the second phase
      shl         rcx, 1                  ; output_cols * 2
      sub         rcx, rdx                ; rcx = number of samples to append
      jle         short .expand_end       ; rows are already wide enough

      mov         eax, r11d               ; row counter = max_v_samp_factor
      test        eax, eax
      jle         short .expand_end

      cld                                 ; rep stosb must walk upward
      mov         rsi, r14                ; input_data
  .expandloop:
      push        rax                     ; al is reused for the fill value
      push        rcx                     ; rep stosb consumes rcx

      mov         rdip, JSAMPROW [rsi]
      add         rdi, rdx                ; rdi -> first sample past the image
      mov         al, JSAMPLE [rdi-1]     ; last real sample of this row

      rep stosb                           ; replicate it rcx times

      pop         rcx
      pop         rax

      add         rsi, byte SIZEOF_JSAMPROW
      dec         rax
      jg          short .expandloop

  .expand_end:
      pop         rcx                     ; output_cols

      ; -- h2v2_downsample

      mov         eax, r12d               ; rowctr
      test        eax, eax                ; 32-bit test, same guard as h2v1
      jle         near .return

      mov         edx, 0x00020001         ; bias pattern
      movd        xmm7, edx
      pcmpeqw     xmm6, xmm6
      pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
      psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

      mov         rsi, r14                ; input_data
      mov         rdi, r15                ; output_data
  .rowloop:
      ; rcx, rdi and rsi are reused as per-row cursors; the three pushes are
      ; matched by the three pops at the end of the row.  rdx (image_width) is
      ; no longer needed and becomes the first input row pointer.
      push        rcx
      push        rdi
      push        rsi

      mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
      mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
      mov         rdip, JSAMPROW [rdi]                    ; outptr

      cmp         rcx, byte SIZEOF_XMMWORD
      jae         short .columnloop

  .columnloop_r8:
      ; Fewer than SIZEOF_XMMWORD output columns remain.  Only one XMMWORD per
      ; input row is read, the second half is treated as zero, and rcx is
      ; forced to SIZEOF_XMMWORD so that the subtraction below ends the row.
      movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
      movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
      pxor        xmm2, xmm2
      pxor        xmm3, xmm3
      mov         rcx, SIZEOF_XMMWORD
      jmp         short .downsample

  .columnloop:
      movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
      movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
      movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
      movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

  .downsample:
      ; First XMMWORD of both rows: split each 16-bit lane into its low byte
      ; (pand with xmm6) and its high byte (psrlw), then add the two.
      movdqa      xmm4, xmm0
      movdqa      xmm5, xmm1
      pand        xmm0, xmm6
      psrlw       xmm4, BYTE_BIT
      pand        xmm1, xmm6
      psrlw       xmm5, BYTE_BIT
      paddw       xmm0, xmm4
      paddw       xmm1, xmm5

      ; Second XMMWORD of both rows, same treatment.
      movdqa      xmm4, xmm2
      movdqa      xmm5, xmm3
      pand        xmm2, xmm6
      psrlw       xmm4, BYTE_BIT
      pand        xmm3, xmm6
      psrlw       xmm5, BYTE_BIT
      paddw       xmm2, xmm4
      paddw       xmm3, xmm5

      paddw       xmm0, xmm1              ; row 0 pair sum + row 1 pair sum
      paddw       xmm2, xmm3
      paddw       xmm0, xmm7              ; + bias
      paddw       xmm2, xmm7
      psrlw       xmm0, 2                 ; / 4
      psrlw       xmm2, 2

      packuswb    xmm0, xmm2              ; 2 x 8 words -> 16 output bytes

      movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

      sub         rcx, byte SIZEOF_XMMWORD    ; outcol
      add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
      add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
      add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
      cmp         rcx, byte SIZEOF_XMMWORD
      jae         near .columnloop
      test        rcx, rcx
      jnz         near .columnloop_r8

      pop         rsi
      pop         rdi
      pop         rcx

      add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
      add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
      dec         rax                          ; rowctr
      jg          near .rowloop

  .return:
      UNCOLLECT_ARGS 6
      pop         rbp
      ret
  258. ; For some reason, the OS X linker does not honor the request to align the
  259. ; segment unless we do this.
  260. align 32