jcsample-avx2.asm 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. ;
  2. ; Downsampling (AVX2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2015, Intel Corporation.
  6. ; Copyright (C) 2016, 2024, D. R. Commander.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  13. %include "jsimdext.inc"
  14. ; --------------------------------------------------------------------------
  15. SECTION SEG_TEXT ; everything below is emitted into this section (SEG_TEXT is not defined in this file; presumably it comes from jsimdext.inc)
  16. BITS 32 ; assemble as 32-bit code; the functions below read their arguments from the stack through ebp
  ;
  ; jsimd_h2v1_downsample_avx2
  ;
  ; Downsample the samples of one component 2:1 horizontally and 1:1
  ; vertically, without smoothing.  Each output sample is
  ; (even + odd + bias) >> 1 for one pair of horizontally adjacent input
  ; samples; the bias alternates 0, 1, 0, 1, ... along the output row.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor,
  ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  ;                            JSAMPARRAY output_data);
  ;
  ; Arguments are read from the stack through ebp (offsets defined below).
  ; Saved and restored:  ebp, esi, edi
  ; Overwritten:         eax, ecx, edx, flags, ymm0-ymm3, ymm6, ymm7
  ;                      (vzeroupper is executed on every exit path)
  ;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        ebp
    mov         ebp, esp
    ; ebx is never touched; ecx and edx are used as scratch and are not saved.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * DCTSIZE)
    jz          near .done              ; zero output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]    ; edx = image_width

    ; ---- Pad the right edge -------------------------------------------------
    ; If 2 * output_cols exceeds image_width, fill the missing columns of each
    ; of the max_v_samp_factor input rows with that row's last real sample.

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; ecx = input columns the main loop reads
    sub         ecx, edx                ; ecx = columns missing past image_width
    jle         short .pad_done

    mov         eax, INT [max_v_samp(ebp)]          ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; make rep stosb advance edi upward
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current row pointer
    ALIGNX      16, 7
.pad_row:
    push        eax                     ; al is about to be reused as the fill byte
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = rightmost real sample of this row
    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols

    ; ---- 2:1 horizontal averaging --------------------------------------------

    mov         eax, JDIMENSION [v_samp(ebp)]       ; eax = row counter
    test        eax, eax
    jle         near .done

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointers
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointers

    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}: low-byte mask per word

    mov         edx, 0x00010000         ; two words: 0 (low), 1 (high)
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7 = {0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7 = {xmm7, xmm7}: rounding bias per word
    ALIGNX      16, 7
.next_row:
    push        ecx                     ; the column loop consumes these three
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = input sample pointer
    mov         edi, JSAMPROW [edi]     ; edi = output sample pointer

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk
    ALIGNX      16, 7

    ; Partial chunk: per the original note, ecx can only be 8, 16 or 24 here.
    ; Load just the input that exists for that many output columns, zero the
    ; rest, then run the common averaging code exactly once.
    ; NOTE(review): a whole YMMWORD is still stored to the output row in this
    ; case, so the row must have room for it -- presumably guaranteed by how
    ; the caller allocates its buffers; confirm.
.tail_24:
    cmp         ecx, 24
    jne         .tail_16
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]   ; xmm load clears the upper half of ymm1
    mov         ecx, SIZEOF_YMMWORD     ; makes the loop terminate after this pass
    jmp         short .average

.tail_16:
    cmp         ecx, 16
    jne         .tail_8
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.tail_8:
    vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]    ; xmm load clears the upper half of ymm0
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average
    ALIGNX      16, 7

.full_chunk:
    ; Two YMMWORDs of input produce one YMMWORD of output.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    vpsrlw      ymm2, ymm0, BYTE_BIT    ; ymm2/ymm3 = high byte of each word (odd columns)
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm0, ymm0, ymm6        ; ymm0/ymm1 = low byte of each word (even columns)
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2        ; even + odd
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, packed per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; swap the middle qwords to restore column order

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         esi, byte 2*SIZEOF_YMMWORD          ; input pointer
    add         edi, byte 1*SIZEOF_YMMWORD          ; output pointer
    sub         ecx, byte SIZEOF_YMMWORD            ; output columns still to produce
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk
    test        ecx, ecx
    jnz         near .tail_24           ; a partial chunk is left over

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW           ; next input row
    add         edi, byte SIZEOF_JSAMPROW           ; next output row
    dec         eax                                 ; row counter
    jg          near .next_row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret
  156. ; --------------------------------------------------------------------------
  ;
  ; jsimd_h2v2_downsample_avx2
  ;
  ; Downsample the samples of one component 2:1 horizontally and 2:1
  ; vertically, without smoothing.  Each output sample is
  ; (sum of a 2x2 input square + bias) >> 2; the bias alternates
  ; 1, 2, 1, 2, ... along the output row.  Two input rows are consumed for
  ; every output row, and v_samp_factor is used as the output row count.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor,
  ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  ;                            JSAMPARRAY output_data);
  ;
  ; Arguments are read from the stack through ebp (offsets defined below).
  ; Saved and restored:  ebp, esi, edi
  ; Overwritten:         eax, ecx, edx, flags, ymm0-ymm7
  ;                      (vzeroupper is executed on every exit path)
  ;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        ebp
    mov         ebp, esp
    ; ebx is never touched; ecx and edx are used as scratch and are not saved.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * DCTSIZE)
    jz          near .done              ; zero output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]    ; edx = image_width

    ; ---- Pad the right edge -------------------------------------------------
    ; If 2 * output_cols exceeds image_width, fill the missing columns of each
    ; of the max_v_samp_factor input rows with that row's last real sample.

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; ecx = input columns the main loop reads
    sub         ecx, edx                ; ecx = columns missing past image_width
    jle         short .pad_done

    mov         eax, INT [max_v_samp(ebp)]          ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; make rep stosb advance edi upward
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current row pointer
    ALIGNX      16, 7
.pad_row:
    push        eax                     ; al is about to be reused as the fill byte
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = rightmost real sample of this row
    rep stosb

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols

    ; ---- 2x2 averaging --------------------------------------------------------

    mov         eax, JDIMENSION [v_samp(ebp)]       ; eax = output row counter
    test        eax, eax
    jle         near .done

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointers
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointers

    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}: low-byte mask per word

    mov         edx, 0x00020001         ; two words: 1 (low), 2 (high)
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7 = {1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7 = {xmm7, xmm7}: rounding bias per word
    ALIGNX      16, 7
.next_row:
    push        ecx                     ; the column loop consumes these three
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row
    mov         edi, JSAMPROW [edi]                     ; edi = output sample pointer

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk
    ALIGNX      16, 7

    ; Partial chunk: fewer than SIZEOF_YMMWORD output columns remain.  Load
    ; just the input that exists for 24, 16 or (otherwise) 8 output columns,
    ; zero the rest, then run the common averaging code exactly once.
    ; NOTE(review): a whole YMMWORD is still stored to the output row in this
    ; case, so the row must have room for it -- presumably guaranteed by how
    ; the caller allocates its buffers; confirm.
.tail_24:
    cmp         ecx, 24
    jne         .tail_16
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]   ; xmm loads clear the upper halves
    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
    mov         ecx, SIZEOF_YMMWORD     ; makes the loop terminate after this pass
    jmp         short .average

.tail_16:
    cmp         ecx, 16
    jne         .tail_8
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.tail_8:
    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]   ; xmm loads clear the upper halves
    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average
    ALIGNX      16, 7

.full_chunk:
    ; Two YMMWORDs from each of the two input rows produce one YMMWORD of output.
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    ; First half: ymm0 (upper row) and ymm1 (lower row).
    vpand       ymm4, ymm0, ymm6        ; low byte of each word (even columns)
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; high byte of each word (odd columns)
    vpaddw      ymm0, ymm0, ymm4        ; horizontal pair sum, upper row
    vpand       ymm5, ymm1, ymm6
    vpsrlw      ymm1, ymm1, BYTE_BIT
    vpaddw      ymm1, ymm1, ymm5        ; horizontal pair sum, lower row
    vpaddw      ymm0, ymm0, ymm1        ; 2x2 sum

    ; Second half: ymm2 (upper row) and ymm3 (lower row); ymm4/ymm5 are free again.
    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm3, ymm3, ymm5
    vpaddw      ymm2, ymm2, ymm3        ; 2x2 sum

    vpaddw      ymm0, ymm0, ymm7        ; + bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, packed per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; swap the middle qwords to restore column order

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         edx, byte 2*SIZEOF_YMMWORD          ; upper-row input pointer
    add         esi, byte 2*SIZEOF_YMMWORD          ; lower-row input pointer
    add         edi, byte 1*SIZEOF_YMMWORD          ; output pointer
    sub         ecx, byte SIZEOF_YMMWORD            ; output columns still to produce
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         near .full_chunk
    test        ecx, ecx
    jnz         near .tail_24           ; a partial chunk is left over

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW         ; skip the two input rows just used
    add         edi, byte 1*SIZEOF_JSAMPROW         ; next output row
    dec         eax                                 ; output row counter
    jg          near .next_row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret
  313. ; For some reason, the OS X linker does not honor the request to align the
  314. ; segment unless we do this (pad the end of this section to a 32-byte boundary).
  315. align 32