; jcsample-avx2.asm
;
; Downsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  14. %include "jsimdext.inc"
  15. ; --------------------------------------------------------------------------
  16. SECTION SEG_TEXT
  17. BITS 64
  18. ;
  19. ; Downsample pixel values of a single component.
  20. ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
  21. ; without smoothing.
  22. ;
  23. ; GLOBAL(void)
  24. ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
  25. ; JDIMENSION v_samp_factor,
  26. ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  27. ; JSAMPARRAY output_data);
  28. ;
  29. ; r10d = JDIMENSION image_width
  30. ; r11 = int max_v_samp_factor
  31. ; r12d = JDIMENSION v_samp_factor
  32. ; r13d = JDIMENSION width_in_blocks
  33. ; r14 = JSAMPARRAY input_data
  34. ; r15 = JSAMPARRAY output_data
  35. align 32
  36. GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
  37. EXTN(jsimd_h2v1_downsample_avx2):
  38. ENDBR64
  39. push rbp
  40. mov rbp, rsp
  41. COLLECT_ARGS 6
  42. mov ecx, r13d
  43. shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
  44. jz near .return
  45. mov edx, r10d
  46. ; -- expand_right_edge
  47. push rcx
  48. shl rcx, 1 ; output_cols * 2
  49. sub rcx, rdx
  50. jle short .expand_end
  51. mov rax, r11
  52. test rax, rax
  53. jle short .expand_end
  54. cld
  55. mov rsi, r14 ; input_data
  56. .expandloop:
  57. push rax
  58. push rcx
  59. mov rdip, JSAMPROW [rsi]
  60. add rdi, rdx
  61. mov al, JSAMPLE [rdi-1]
  62. rep stosb
  63. pop rcx
  64. pop rax
  65. add rsi, byte SIZEOF_JSAMPROW
  66. dec rax
  67. jg short .expandloop
  68. .expand_end:
  69. pop rcx ; output_cols
  70. ; -- h2v1_downsample
  71. mov eax, r12d ; rowctr
  72. test eax, eax
  73. jle near .return
  74. mov rdx, 0x00010000 ; bias pattern
  75. vmovd xmm7, edx
  76. vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
  77. vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
  78. vpcmpeqw ymm6, ymm6, ymm6
  79. vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
  80. mov rsi, r14 ; input_data
  81. mov rdi, r15 ; output_data
  82. .rowloop:
  83. push rcx
  84. push rdi
  85. push rsi
  86. mov rsip, JSAMPROW [rsi] ; inptr
  87. mov rdip, JSAMPROW [rdi] ; outptr
  88. cmp rcx, byte SIZEOF_YMMWORD
  89. jae short .columnloop
  90. .columnloop_r24:
  91. ; rcx can possibly be 8, 16, 24
  92. cmp rcx, 24
  93. jne .columnloop_r16
  94. vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  95. vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
  96. mov rcx, SIZEOF_YMMWORD
  97. jmp short .downsample
  98. .columnloop_r16:
  99. cmp rcx, 16
  100. jne .columnloop_r8
  101. vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  102. vpxor ymm1, ymm1, ymm1
  103. mov rcx, SIZEOF_YMMWORD
  104. jmp short .downsample
  105. .columnloop_r8:
  106. vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
  107. vpxor ymm1, ymm1, ymm1
  108. mov rcx, SIZEOF_YMMWORD
  109. jmp short .downsample
  110. .columnloop:
  111. vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  112. vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
  113. .downsample:
  114. vpsrlw ymm2, ymm0, BYTE_BIT
  115. vpand ymm0, ymm0, ymm6
  116. vpsrlw ymm3, ymm1, BYTE_BIT
  117. vpand ymm1, ymm1, ymm6
  118. vpaddw ymm0, ymm0, ymm2
  119. vpaddw ymm1, ymm1, ymm3
  120. vpaddw ymm0, ymm0, ymm7
  121. vpaddw ymm1, ymm1, ymm7
  122. vpsrlw ymm0, ymm0, 1
  123. vpsrlw ymm1, ymm1, 1
  124. vpackuswb ymm0, ymm0, ymm1
  125. vpermq ymm0, ymm0, 0xd8
  126. vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
  127. sub rcx, byte SIZEOF_YMMWORD ; outcol
  128. add rsi, byte 2*SIZEOF_YMMWORD ; inptr
  129. add rdi, byte 1*SIZEOF_YMMWORD ; outptr
  130. cmp rcx, byte SIZEOF_YMMWORD
  131. jae short .columnloop
  132. test rcx, rcx
  133. jnz near .columnloop_r24
  134. pop rsi
  135. pop rdi
  136. pop rcx
  137. add rsi, byte SIZEOF_JSAMPROW ; input_data
  138. add rdi, byte SIZEOF_JSAMPROW ; output_data
  139. dec rax ; rowctr
  140. jg near .rowloop
  141. .return:
  142. vzeroupper
  143. UNCOLLECT_ARGS 6
  144. pop rbp
  145. ret
  146. ; --------------------------------------------------------------------------
  147. ;
  148. ; Downsample pixel values of a single component.
  149. ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
  150. ; without smoothing.
  151. ;
  152. ; GLOBAL(void)
  153. ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
  154. ; JDIMENSION v_samp_factor,
  155. ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  156. ; JSAMPARRAY output_data);
  157. ;
  158. ; r10d = JDIMENSION image_width
  159. ; r11 = int max_v_samp_factor
  160. ; r12d = JDIMENSION v_samp_factor
  161. ; r13d = JDIMENSION width_in_blocks
  162. ; r14 = JSAMPARRAY input_data
  163. ; r15 = JSAMPARRAY output_data
  164. align 32
  165. GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
  166. EXTN(jsimd_h2v2_downsample_avx2):
  167. ENDBR64
  168. push rbp
  169. mov rbp, rsp
  170. COLLECT_ARGS 6
  171. mov ecx, r13d
  172. shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
  173. jz near .return
  174. mov edx, r10d
  175. ; -- expand_right_edge
  176. push rcx
  177. shl rcx, 1 ; output_cols * 2
  178. sub rcx, rdx
  179. jle short .expand_end
  180. mov rax, r11
  181. test rax, rax
  182. jle short .expand_end
  183. cld
  184. mov rsi, r14 ; input_data
  185. .expandloop:
  186. push rax
  187. push rcx
  188. mov rdip, JSAMPROW [rsi]
  189. add rdi, rdx
  190. mov al, JSAMPLE [rdi-1]
  191. rep stosb
  192. pop rcx
  193. pop rax
  194. add rsi, byte SIZEOF_JSAMPROW
  195. dec rax
  196. jg short .expandloop
  197. .expand_end:
  198. pop rcx ; output_cols
  199. ; -- h2v2_downsample
  200. mov eax, r12d ; rowctr
  201. test rax, rax
  202. jle near .return
  203. mov rdx, 0x00020001 ; bias pattern
  204. vmovd xmm7, edx
  205. vpcmpeqw ymm6, ymm6, ymm6
  206. vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
  207. vperm2i128 ymm7, ymm7, ymm7, 0
  208. vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
  209. mov rsi, r14 ; input_data
  210. mov rdi, r15 ; output_data
  211. .rowloop:
  212. push rcx
  213. push rdi
  214. push rsi
  215. mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
  216. mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
  217. mov rdip, JSAMPROW [rdi] ; outptr
  218. cmp rcx, byte SIZEOF_YMMWORD
  219. jae short .columnloop
  220. .columnloop_r24:
  221. cmp rcx, 24
  222. jne .columnloop_r16
  223. vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
  224. vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  225. vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
  226. vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
  227. mov rcx, SIZEOF_YMMWORD
  228. jmp short .downsample
  229. .columnloop_r16:
  230. cmp rcx, 16
  231. jne .columnloop_r8
  232. vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
  233. vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  234. vpxor ymm2, ymm2, ymm2
  235. vpxor ymm3, ymm3, ymm3
  236. mov rcx, SIZEOF_YMMWORD
  237. jmp short .downsample
  238. .columnloop_r8:
  239. vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
  240. vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
  241. vpxor ymm2, ymm2, ymm2
  242. vpxor ymm3, ymm3, ymm3
  243. mov rcx, SIZEOF_YMMWORD
  244. jmp short .downsample
  245. .columnloop:
  246. vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
  247. vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  248. vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
  249. vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
  250. .downsample:
  251. vpand ymm4, ymm0, ymm6
  252. vpsrlw ymm0, ymm0, BYTE_BIT
  253. vpand ymm5, ymm1, ymm6
  254. vpsrlw ymm1, ymm1, BYTE_BIT
  255. vpaddw ymm0, ymm0, ymm4
  256. vpaddw ymm1, ymm1, ymm5
  257. vpand ymm4, ymm2, ymm6
  258. vpsrlw ymm2, ymm2, BYTE_BIT
  259. vpand ymm5, ymm3, ymm6
  260. vpsrlw ymm3, ymm3, BYTE_BIT
  261. vpaddw ymm2, ymm2, ymm4
  262. vpaddw ymm3, ymm3, ymm5
  263. vpaddw ymm0, ymm0, ymm1
  264. vpaddw ymm2, ymm2, ymm3
  265. vpaddw ymm0, ymm0, ymm7
  266. vpaddw ymm2, ymm2, ymm7
  267. vpsrlw ymm0, ymm0, 2
  268. vpsrlw ymm2, ymm2, 2
  269. vpackuswb ymm0, ymm0, ymm2
  270. vpermq ymm0, ymm0, 0xd8
  271. vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
  272. sub rcx, byte SIZEOF_YMMWORD ; outcol
  273. add rdx, byte 2*SIZEOF_YMMWORD ; inptr0
  274. add rsi, byte 2*SIZEOF_YMMWORD ; inptr1
  275. add rdi, byte 1*SIZEOF_YMMWORD ; outptr
  276. cmp rcx, byte SIZEOF_YMMWORD
  277. jae near .columnloop
  278. test rcx, rcx
  279. jnz near .columnloop_r24
  280. pop rsi
  281. pop rdi
  282. pop rcx
  283. add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
  284. add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
  285. dec rax ; rowctr
  286. jg near .rowloop
  287. .return:
  288. vzeroupper
  289. UNCOLLECT_ARGS 6
  290. pop rbp
  291. ret
  292. ; For some reason, the OS X linker does not honor the request to align the
  293. ; segment unless we do this.
  294. align 32