jchuff-sse2.asm 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758
  1. ;
  2. ; Huffman entropy encoding (SSE2)
  3. ;
  4. ; Copyright (C) 2009-2011, 2014-2017, 2019, 2024, D. R. Commander.
  5. ; Copyright (C) 2015, Matthieu Darbois.
  6. ; Copyright (C) 2018, Matthias Räncker.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  13. ;
  14. ; This file contains an SSE2 implementation for Huffman coding of one block.
  15. ; The following code is based on jchuff.c; see jchuff.c for more details.
  16. %include "jsimdext.inc"
  ; Field layout of the entropy encoder's working_state structure.  Declaring it
  ; with struc lets the code below address members symbolically as
  ; working_state.<field>.  Only cur.put_buffer.simd and cur.free_bits are
  ; actually loaded and stored by the code in this file.
  ; NOTE(review): this presumably mirrors the working_state struct on the C side
  ; and has to be kept in sync with it -- confirm against the C source.
  17. struc working_state
  18. .next_output_byte: resp 1 ; => next byte to write in buffer
  19. .free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
  20. .cur.put_buffer.simd resq 1 ; current bit accumulation buffer (64 bits, accessed with movq)
  21. .cur.free_bits resd 1 ; # of bits available in it
  22. .cur.last_dc_val resd 4 ; last DC coef for each component
  23. .cinfo: resp 1 ; dump_buffer needs access to this
  24. endstruc
  ; Field layout of a derived Huffman table (the encoder receives one for DC and
  ; one for AC).  The code below reads ehufco[] as 32-bit entries and ehufsi[]
  ; as 8-bit entries, both indexed by symbol.
  25. struc c_derived_tbl
  26. .ehufco: resd 256 ; code for each symbol
  27. .ehufsi: resb 256 ; length of code for each symbol
  28. ; If no code has been allocated for a symbol S, ehufsi[S] contains 0
  29. endstruc
  30. ; --------------------------------------------------------------------------
  ; Read-only lookup tables used by the encoder, followed by the two address
  ; macros (NBITS / MASK_BITS) through which the code reaches them.
  31. SECTION SEG_CONST
  32. GLOBAL_DATA(jconst_huff_encode_one_block)
  33. EXTN(jconst_huff_encode_one_block):
  34. ALIGNZ 32
  ; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15, stored as 64-bit entries so
  ; that an entry can be used directly as a pand memory operand.
  35. jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
  36. dq 0x000f, 0x001f, 0x003f, 0x007f
  37. dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
  38. dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
  ; Negative half of the bit-count table (32768 bytes).  It sits immediately
  ; before the jpeg_nbits_table label and mirrors the positive half: the byte at
  ; offset (-1 - k) equals the byte at offset (+k).  The encoder subtracts 1
  ; from every negative value before the lookup, so indexing with (v - 1) for a
  ; negative v yields the bit count of -v.
  39. times 1 << 14 db 15
  40. times 1 << 13 db 14
  41. times 1 << 12 db 13
  42. times 1 << 11 db 12
  43. times 1 << 10 db 11
  44. times 1 << 9 db 10
  45. times 1 << 8 db 9
  46. times 1 << 7 db 8
  47. times 1 << 6 db 7
  48. times 1 << 5 db 6
  49. times 1 << 4 db 5
  50. times 1 << 3 db 4
  51. times 1 << 2 db 3
  52. times 1 << 1 db 2
  53. times 1 << 0 db 1
  54. times 1 db 0
  ; Positive half: jpeg_nbits_table[k] = number of significant bits in k, for
  ; k = 0 .. 32767 (32768 bytes).
  55. GLOBAL_DATA(jpeg_nbits_table)
  56. EXTN(jpeg_nbits_table):
  57. times 1 db 0
  58. times 1 << 0 db 1
  59. times 1 << 1 db 2
  60. times 1 << 2 db 3
  61. times 1 << 3 db 4
  62. times 1 << 4 db 5
  63. times 1 << 5 db 6
  64. times 1 << 6 db 7
  65. times 1 << 7 db 8
  66. times 1 << 8 db 9
  67. times 1 << 9 db 10
  68. times 1 << 10 db 11
  69. times 1 << 11 db 12
  70. times 1 << 12 db 13
  71. times 1 << 13 db 14
  72. times 1 << 14 db 15
  73. ALIGNZ 32
  ; NBITS(x): address expression for jpeg_nbits_table[x].  With PIC the table
  ; base lives in the nbits_base register; otherwise the symbol is used
  ; directly.
  74. %ifdef PIC
  75. %define NBITS(x) nbits_base + x
  76. %else
  77. %define NBITS(x) EXTN(jpeg_nbits_table) + x
  78. %endif
  ; MASK_BITS(x): address expression for the 8-byte entry jpeg_mask_bits[x].  It
  ; is formed from NBITS() plus the constant distance between the two tables, so
  ; the single base used for the nbits table also reaches the mask table.
  79. %define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))
  80. ; --------------------------------------------------------------------------
  81. SECTION SEG_TEXT
  82. BITS 32
  ; Symbolic names for the MMX registers used by the bit-buffer code.
  ; mm_nbits and mm_code_bits deliberately share mm3: inside EMIT_QWORD the
  ; shift count in mm_nbits is consumed (psllq) before mm_code_bits is written.
  83. %define mm_put_buffer mm0
  84. %define mm_all_0xff mm1
  85. %define mm_temp mm2
  86. %define mm_nbits mm3
  87. %define mm_code_bits mm3
  88. %define mm_code mm4
  89. %define mm_overflow_bits mm5
  90. %define mm_save_nbits mm6
  91. ; Shorthand used to describe SIMD operations:
  92. ; wN: xmmN treated as eight signed 16-bit values
  93. ; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
  94. ; bN: xmmN treated as 16 unsigned 8-bit values, or
  95. ; mmN treated as eight unsigned 8-bit values
  96. ; bN[i]: perform the same operation on all unsigned 8-bit values,
  97. ; i=0..15 (SSE register) or i=0..7 (MMX register)
  98. ; Contents of SIMD registers are shown in memory order.
  99. ; Fill the bit buffer to capacity with the leading bits from code, then output
  100. ; the bit buffer and put the remaining bits from code into the bit buffer.
  101. ;
  ; Entry conditions (established by every call site with
  ; "sub free_bits, nbits" / "jle"):
  ; nbits - number of bits in code
  ; free_bits - (free bits in put_buffer) - nbits, i.e. zero or negative
  ;
  102. ; Usage:
  103. ; code - contains the bits to shift into the bit buffer (LSB-aligned)
  104. ; %1 - temp register
  105. ; %2 - low byte of temp register
  106. ; %3 - second byte of temp register
  107. ; %4-%8 (optional) - extra instructions to execute before the macro completes
  ; (they are expanded after the last use of temp on both paths,
  ; so they may overwrite it)
  108. ; %9 - the label to which to jump when the macro completes
  109. ;
  110. ; Upon completion, free_bits will be 64 minus the number of bits from code
  111. ; that did not fit, and put_buffer will hold code, whose low-order bits are
  112. ; those not-yet-emitted bits. temp, nbits, and code will be clobbered.
  113. ;
  114. ; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
  115. ; macro in jchuff.c.
  116. %macro EMIT_QWORD 9
  117. %define %%temp %1
  118. %define %%tempb %2
  119. %define %%temph %3
  120. add nbits, free_bits ; nbits += free_bits; /* = # of bits of code that still fit */
  121. neg free_bits ; free_bits = -free_bits; /* = # of bits of code that overflow */
  122. movq mm_temp, mm_code ; temp = code;
  123. movd mm_nbits, nbits ; nbits --> MMX register
  124. movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits;
  125. neg free_bits ; free_bits = -free_bits;
  126. psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
  127. psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits;
  128. add free_bits, 64 ; free_bits += 64;
  129. por mm_temp, mm_put_buffer ; temp |= put_buffer; /* the full 64 bits to output */
  ; When the temp register is the one that nbits_base is defined as, park its
  ; value in an MMX register; it is restored on both paths below.
  130. %ifidn %%temp, nbits_base
  131. movd mm_save_nbits, nbits_base ; save nbits_base
  132. %endif
  133. movq mm_code_bits, mm_temp ; code_bits (temp register) = temp;
  134. movq mm_put_buffer, mm_code ; put_buffer = code;
  135. pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
  136. movq mm_code, mm_code_bits ; code = code_bits;
  137. psrlq mm_code_bits, 32 ; code_bits >>= 32;
  138. pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i);
  139. movd %%temp, mm_code_bits ; temp = code_bits;
  140. bswap %%temp ; temp = htonl(temp);
  141. test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */
  142. jnz %%.SLOW ; goto %%.SLOW
  ; Fast path: no byte equals 0xFF, so the 8 bytes are stored most significant
  ; byte first with two dword writes.
  143. mov dword [buffer], %%temp ; *(uint32_t)buffer = temp;
  144. %ifidn %%temp, nbits_base
  145. movd nbits_base, mm_save_nbits ; restore nbits_base
  146. %endif
  147. %4
  148. movd nbits, mm_code ; nbits = (uint32_t)(code);
  149. %5
  150. bswap nbits ; nbits = htonl(nbits);
  151. mov dword [buffer + 4], nbits ; *(uint32_t)(buffer + 4) = nbits;
  152. lea buffer, [buffer + 8] ; buffer += 8;
  153. %6
  154. %7
  155. %8
  156. jmp %9 ; return
  157. %%.SLOW:
  158. ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
  159. ; bytes in the qword.
  ; Each step stores the byte and a 0 after it, then advances buffer by 2 if the
  ; byte was 0xFF (keeping the stuffed 0) or by 1 otherwise (so the next store
  ; overwrites the 0).
  160. mov byte [buffer], %%tempb ; buffer[0] = temp[0];
  161. cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
  162. mov byte [buffer+1], 0 ; buffer[1] = 0;
  163. sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
  164. mov byte [buffer], %%temph ; buffer[0] = temp[1];
  165. cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
  166. mov byte [buffer+1], 0 ; buffer[1] = 0;
  167. sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
  168. shr %%temp, 16 ; temp >>= 16;
  169. mov byte [buffer], %%tempb ; buffer[0] = temp[0];
  170. cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
  171. mov byte [buffer+1], 0 ; buffer[1] = 0;
  172. sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
  173. mov byte [buffer], %%temph ; buffer[0] = temp[1];
  174. cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
  175. mov byte [buffer+1], 0 ; buffer[1] = 0;
  176. sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
  ; The low 32 bits are handled the same way, using nbits as the scratch
  ; register.
  177. movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code)
  178. %ifidn %%temp, nbits_base
  179. movd nbits_base, mm_save_nbits ; restore nbits_base
  180. %endif
  181. bswap nbits ; nbits = htonl(nbits)
  182. mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
  183. cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
  184. mov byte [buffer+1], 0 ; buffer[1] = 0;
  185. sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
  186. mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
  187. cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
  188. mov byte [buffer+1], 0 ; buffer[1] = 0;
  189. sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
  190. shr nbits, 16 ; nbits >>= 16;
  191. mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
  192. cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
  193. mov byte [buffer+1], 0 ; buffer[1] = 0;
  194. sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
  195. mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
  ; %4 is expanded before the cmp below, so any flags it sets are overwritten
  ; before the sbb consumes CF.
  196. %4
  197. cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
  198. mov byte [buffer+1], 0 ; buffer[1] = 0;
  199. sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
  200. %5
  201. %6
  202. %7
  203. %8
  204. jmp %9 ; return;
  205. %endmacro
  ; push wrapper that also adds 4 to the assemble-time counter stack_offset.
  ; The arg_* offsets of the encoder function are defined in terms of
  ; stack_offset, so they stay valid relative to esp after each PUSH.
  ;
  ; Usage:
  ; %1 - operand to push
  206. %macro PUSH 1
  207. push %1
  208. %assign stack_offset stack_offset + 4
  209. %endmacro
  ; pop wrapper that also subtracts 4 from the assemble-time counter
  ; stack_offset (the counterpart of PUSH).
  ;
  ; Usage:
  ; %1 - operand to pop into
  210. %macro POP 1
  211. pop %1
  212. %assign stack_offset stack_offset - 4
  213. %endmacro
  214. ; If PIC is defined, load the address of a symbol defined in this file into a
  215. ; register. Equivalent to
  216. ; GET_GOT %1
  217. ; lea %1, [GOTOFF(%1, %2)]
  218. ; without using the GOT.
  219. ;
  220. ; Usage:
  221. ; %1 - register into which to load the address of the symbol
  222. ; %2 - symbol whose address should be loaded
  223. ; %3 - optional multi-line macro to execute before the symbol address is loaded
  224. ; %4 - optional multi-line macro to execute after the symbol address is loaded
  225. ;
  226. ; If PIC is not defined, then %3 and %4 are executed in order.
  227. %macro GET_SYM 2-4
  228. %ifdef PIC
  ; The call pushes the address of %%.ref; %%.geteip copies that return address
  ; into %1 and returns, after which the assemble-time constant (%2 - %%.ref) is
  ; added to obtain the run-time address of %2.
  229. call %%.geteip
  230. %%.ref:
  231. %4
  232. add %1, %2 - %%.ref ; %1 = run-time address of %2
  233. jmp short %%.done ; skip over the out-of-line helper
  234. align 32
  235. %%.geteip:
  ; %3 receives the number of bytes by which esp is currently displaced (4, for
  ; the return address pushed by the call above).
  236. %3 4 ; must adjust stack pointer because of call
  237. mov %1, POINTER [esp] ; %1 = return address = address of %%.ref
  238. ret
  239. align 32
  240. %%.done:
  241. %else
  ; Without PIC nothing is loaded into %1; only the two caller-supplied macros
  ; are expanded, %3 with a displacement of 0.
  242. %3 0
  243. %4
  244. %endif
  245. %endmacro
  246. ;
  247. ; Encode a single block's worth of coefficients.
  248. ;
  249. ; GLOBAL(JOCTET *)
  250. ; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
  251. ; JCOEFPTR block, int last_dc_val,
  252. ; c_derived_tbl *dctbl, c_derived_tbl *actbl)
  253. ;
  254. ; Stack layout:
  255. ; Function args
  256. ; Return address
  257. ; Saved ebx
  258. ; Saved ebp
  259. ; Saved esi
  260. ; Saved edi <-- esp_save
  261. ; ...
  262. ; esp_save
  263. ; t_ 64*2 bytes (aligned to 128 bytes)
  264. ;
  265. ; esp is used (as t) to point into t_ (data in lower indices is not used once
  266. ; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows
  267. ; us to find the rest of the data again.
  268. ;
  269. ; NOTES:
  270. ; When shuffling data, we try to avoid pinsrw as much as possible, since it is
  271. ; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
  272. ; modern CPUs, so chains of pinsrw instructions (even with different outputs)
  273. ; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
  274. ; requires 2 µops (with memory operand) on Intel. In either case, only one
  275. ; pinsrw instruction can be decoded per cycle (and nothing else if they are
  276. ; back-to-back), so out-of-order execution cannot be used to work around long
  277. ; pinsrw chains (though for Sandy Bridge and later, this may be less of a
  278. ; problem if the code runs from the µop cache.)
  279. ;
  280. ; We use tzcnt instead of bsf without checking for support. The instruction is
  281. ; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
  282. ; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
  283. ; an input dependency (although the behavior is not formally defined, Intel
  284. ; CPUs usually leave the destination unmodified if the source is zero.) This
  285. ; can prevent out-of-order execution, so we clear the destination before
  286. ; invoking tzcnt.
  287. ;
  288. ; Initial register allocation
  289. ; eax - frame --> buffer
  290. ; ebx - nbits_base (PIC) / emit_temp
  291. ; ecx - dctbl --> size --> state
  292. ; edx - block --> nbits
  293. ; esi - code_temp --> state --> actbl
  294. ; edi - index_temp --> free_bits
  295. ; esp - t
  296. ; ebp - index
  ; Symbolic names for the general-purpose registers at function entry.  Several
  ; of them are later %undef'd and the same register is given a new name once
  ; its first role is finished.
  297. %define frame eax
  ; nbits_base exists only in PIC builds and occupies the same register (ebx) as
  ; emit_temp, which is why EMIT_QWORD saves and restores it when ebx is passed
  ; as its temp register.
  298. %ifdef PIC
  299. %define nbits_base ebx
  300. %endif
  301. %define emit_temp ebx
  302. %define emit_tempb bl
  303. %define emit_temph bh
  304. %define dctbl ecx
  305. %define block edx
  306. %define code_temp esi
  307. %define index_temp edi
  308. %define t esp
  309. %define index ebp
  ; Size in bytes of the coefficient scratch area t_; also the offset from t_ at
  ; which the original stack pointer is saved.
  310. %assign save_frame DCTSIZE2 * SIZEOF_WORD
  311. ; Step 1: Re-arrange input data according to jpeg_natural_order
  312. ; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
  313. ; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
  314. ; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
  315. ; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
  316. ; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
  317. ; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
  318. ; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
  319. ; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
  ; Function entry.  This part:
  ; - saves ebx/ebp/esi/edi and carves a 128-byte-aligned scratch area t_ out
  ; of the stack, storing the previous esp at [t_ + save_frame];
  ; - rearranges AC coefficients 1..63 into the order shown in the diagram
  ; above, subtracts 1 from each negative value, and stores them to t_
  ; (rows A-H, eight words each; t_[i] holds coefficient i + 1 of that order);
  ; - builds a bit mask of nonzero coefficients: "index" covers rows A-D, while
  ; the packed zero-compare results for rows E-H stay in xmm5 (EF) and
  ; xmm4 (GH) until .ELOOP1;
  ; - computes the DC difference, its bit count and Huffman code, loads the
  ; bit-buffer state, and jumps to .BEGIN to emit it.
  ; The letters in the per-line comments (A-H, Z, X) tag the independent
  ; instruction streams that are interleaved here.
  320. align 32
  321. GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
  322. EXTN(jsimd_huff_encode_one_block_sse2):
  ; Argument offsets relative to esp; stack_offset is maintained by PUSH/POP.
  323. %assign stack_offset 0
  324. %define arg_state 4 + stack_offset
  325. %define arg_buffer 8 + stack_offset
  326. %define arg_block 12 + stack_offset
  327. %define arg_last_dc_val 16 + stack_offset
  328. %define arg_dctbl 20 + stack_offset
  329. %define arg_actbl 24 + stack_offset
  330. ;X: X = code stream
  331. mov block, [esp + arg_block]
  332. PUSH ebx
  333. PUSH ebp
  334. movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
  335. PUSH esi
  336. PUSH edi
  337. movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
  338. mov frame, esp ; frame = esp after the four pushes
  339. lea t, [frame - (save_frame + 4)] ; reserve t_ plus the slot for the saved esp
  340. movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
  341. and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
  342. mov [t + save_frame], frame ; remember the original esp just above t_
  343. pxor xmm4, xmm4 ;A: w4[i] = 0;
  344. punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
  345. pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
  346. pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
  347. punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
  348. punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
  349. pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
  350. ;A: (Row 0, offset 1)
  351. pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
  352. paddw xmm0, xmm4 ;A: w0[i] += w4[i];
  353. movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
  354. movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
  355. pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
  356. pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
  357. movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
  358. movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
  359. punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
  360. pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
  361. pxor xmm4, xmm4 ;A: w4[i] = 0;
  362. psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
  363. pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
  364. pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
  365. ; (Row 1, offset 1)
  366. pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
  367. paddw xmm1, xmm4 ;B: w1[i] += w4[i];
  368. movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
  369. pxor xmm4, xmm4 ;B: w4[i] = 0;
  370. pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
  371. packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
  372. ; w/ signed saturation
  373. pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
  374. pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
  375. pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
  376. pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
  377. ; (Row 3, offset 1)
  378. pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
  379. paddw xmm3, xmm4 ;D: w3[i] += w4[i];
  380. movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
  381. pxor xmm4, xmm4 ;D: w4[i] = 0;
  382. pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
  383. pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
  384. pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
  385. pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
  386. pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
  387. pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
  388. pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
  389. ; (Row 2, offset 1)
  390. pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
  391. paddw xmm2, xmm4 ;C: w2[i] += w4[i];
  392. movsx code_temp, word [block] ;Z: code_temp = block[0];
  ; GET_SYM_BEFORE / GET_SYM_AFTER are the instruction groups handed to GET_SYM
  ; as its %3 / %4 arguments.  In a PIC build GET_SYM_BEFORE is expanded inside
  ; the call helper, where esp (= t) is displaced by the return address; its
  ; parameter compensates for that in the one t-relative store.
  393. ; %1 - stack pointer adjustment
  394. %macro GET_SYM_BEFORE 1
  395. movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
  396. ;C: t[i+16] = w2[i];
  397. pxor xmm4, xmm4 ;C: w4[i] = 0;
  398. pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
  399. sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
  400. packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
  401. ; w/ signed saturation
  402. movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
  403. pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
  404. pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
  405. movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
  406. punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
  407. shl index_temp, 16 ;Z: index_temp <<= 16;
  408. psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
  409. pxor xmm2, xmm2 ;H: w2[i] = 0;
  410. pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
  411. or index, index_temp ;Z: index |= index_temp;
  412. %undef index_temp
  413. %define free_bits edi
  414. %endmacro
  415. %macro GET_SYM_AFTER 0
  416. movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
  417. unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
  418. pxor xmm0, xmm0 ;H: w0[i] = 0;
  419. not index ;Z: index = ~index; /* 1 bits now mark nonzero coefficients */
  420. pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
  421. ; (Row 7, offset 1)
  422. pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
  423. mov dctbl, [frame + arg_dctbl]
  424. paddw xmm3, xmm2 ;H: w3[i] += w2[i];
  425. movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
  426. movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
  427. pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
  428. punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
  429. movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
  430. pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
  431. %endmacro
  432. GET_SYM nbits_base, EXTN(jpeg_nbits_table), GET_SYM_BEFORE, GET_SYM_AFTER
  433. psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
  434. shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
  435. pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
  436. pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
  437. pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
  438. pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
  439. cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
  440. ;Z: i.e. if code_temp is non-negative
  441. pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
  442. movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
  443. pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
  444. pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
  445. pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
  446. ; (Row 6, offset 1)
  447. adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
  448. pxor xmm2, xmm2 ;G: w2[i] = 0;
  449. pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
  450. pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
  451. paddw xmm4, xmm0 ;G: w4[i] += w0[i];
  452. movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
  453. movd mm_temp, code_temp ;Z: temp = code_temp
  454. pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
  455. ; (Row 5, offset 1)
  456. pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
  457. packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
  458. ; w/ signed saturation
  ; From here on t is one word below t_, so the remaining t-relative stores use
  ; an extra +1 word in their offsets.
  459. lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1]
  460. pxor xmm0, xmm0 ;F: w0[i] = 0;
  461. pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
  462. paddw xmm1, xmm2 ;F: w1[i] += w2[i];
  463. movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
  464. pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
  465. pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
  466. pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
  467. pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
  468. pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
  469. ; (Row 4, offset 1)
  ; block is no longer needed; edx is reused as nbits.
  470. %undef block
  471. %define nbits edx
  472. %define nbitsb dl
  473. %define nbitsh dh
  474. movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
  475. %undef code_temp
  476. %define state esi
  477. pxor xmm2, xmm2 ;E: w2[i] = 0;
  478. mov state, [frame + arg_state]
  479. movd mm_nbits, nbits ;Z: nbits --> MMX register
  480. pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
  481. movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
  482. ;Z: code = dctbl->ehufco[nbits];
  ; size is another name for ecx, which still holds dctbl until the movzx below
  ; overwrites it.
  483. %define size ecx
  484. %define sizeb cl
  485. %define sizeh ch
  486. paddw xmm5, xmm0 ;E: w5[i] += w0[i];
  487. movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
  488. movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
  489. ;Z: size = dctbl->ehufsi[nbits];
  490. %undef dctbl
  491. pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
  492. packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
  493. ; w/ signed saturation
  494. movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
  495. ;Z: put_buffer = state->cur.put_buffer.simd;
  496. mov free_bits, [state + working_state.cur.free_bits]
  497. ;Z: free_bits = state->cur.free_bits;
  498. %undef state
  499. %define actbl esi
  500. mov actbl, [frame + arg_actbl]
  ; frame and buffer are both eax: loading buffer overwrites the frame pointer,
  ; which is recovered later from [t_ + save_frame].
  501. %define buffer eax
  502. mov buffer, [frame + arg_buffer]
  503. %undef frame
  504. jmp .BEGIN
  505. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; Round 1: encode the DC difference (entered at .BEGIN) and then the nonzero
  ; AC coefficients flagged in the first 32-bit index.  On each iteration, size
  ; is the distance in elements from the previous coded coefficient to the next
  ; nonzero one.
  506. align 16
  507. ; size <= 32, so this is not really a loop
  ; Reached only from .BLOOP1 when size > 16: emit the Huffman code for symbol
  ; 0xf0 once and reduce size by 16.
  508. .BRLOOP1: ; .BRLOOP1:
  509. movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
  510. ; nbits = actbl->ehufsi[0xf0];
  511. movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
  512. ; code = actbl->ehufco[0xf0];
  ; The mask only has to remove bit 31: for size 17..31 the preceding shift has
  ; already cleared every bit from 15 upward, and for size == 32 (shift count
  ; wraps to 0, index unchanged) bit 31 is the only bit set.
  513. and index, 0x7ffffff ; clear index if size == 32
  514. sub size, 16 ; size -= 16;
  515. sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
  516. jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
  517. movd mm_nbits, nbits ; nbits --> MMX register
  518. psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
  519. por mm_put_buffer, mm_code ; put_buffer |= code;
  520. jmp .ERLOOP1 ; goto .ERLOOP1;
  521. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  522. align 16
  ; NOTE(review): the nop padding below presumably positions a following label
  ; at a preferred code alignment (the count differs between PIC and non-PIC
  ; builds) -- the reason is not stated here; confirm before changing nearby
  ; code size.
  523. %ifdef PIC
  524. times 6 nop
  525. %else
  526. times 2 nop
  527. %endif
  528. .BLOOP1: ; do { /* size = # of zero bits/elements to skip */
  529. ; if size == 32, index remains unchanged. Correct in .BRLOOP.
  530. shr index, sizeb ; index >>= size;
  531. lea t, [t + size * SIZEOF_WORD] ; t += size;
  532. cmp size, 16 ; if (size > 16)
  533. jg .BRLOOP1 ; goto .BRLOOP1;
  ; Encode the coefficient at *t: the AC symbol is (size - 1) * 16 + nbits,
  ; computed as size * 16 + nbits and then indexed with a -16 displacement.
  534. .ERLOOP1: ; .ERLOOP1:
  535. movsx nbits, word [t] ; nbits = *t;
  ; NOTE(review): both variants double size; presumably the different encodings
  ; are chosen for code size/alignment -- confirm.
  536. %ifdef PIC
  537. add size, size ; size += size;
  538. %else
  539. lea size, [size * 2] ; size += size;
  540. %endif
  541. movd mm_temp, nbits ; temp = nbits;
  542. movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
  543. lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
  544. movd mm_nbits, nbits ; nbits --> MMX register
  545. movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
  546. ; code = actbl->ehufco[size-16];
  547. movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
  548. ; size = actbl->ehufsi[size-16];
  ; Entry point for the DC coefficient: mm_temp = value, mm_nbits/nbits = its
  ; bit count, mm_code/size = Huffman code and code length.
  549. .BEGIN: ; .BEGIN:
  550. pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
  551. psllq mm_code, mm_nbits ; code <<= nbits;
  552. add nbits, size ; nbits += size;
  553. por mm_code, mm_temp ; code |= temp;
  554. sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
  555. jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
  556. xor size, size ; size = 0; /* kill tzcnt input dependency */
  557. tzcnt size, index ; size = # of trailing 0 bits in index
  558. movd mm_nbits, nbits ; nbits --> MMX register
  559. psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
  560. inc size ; ++size;
  561. por mm_put_buffer, mm_code ; put_buffer |= code;
  562. test index, index
  563. jnz .BLOOP1 ; } while (index != 0);
  564. ; Round 2
  ; Builds the nonzero mask for the second 32 coefficients from the packed
  ; compare results left in xmm5 (rows E/F) and xmm4 (rows G/H), then encodes
  ; them the same way as round 1.  The first run length also has to include the
  ; zero elements left over at the end of round 1, which is recovered from the
  ; distance between t and the end of the aligned scratch area.
  565. ; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
  566. .ELOOP1: ; .ELOOP1:
  567. pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
  568. pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
  569. shl size, 16 ; size <<= 16;
  570. or index, size ; index |= size;
  571. not index ; index = ~index;
  572. lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
  573. ; nbits = t + 1 + 64;
  574. and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
  575. sub nbits, t ; nbits -= t;
  576. shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
  577. tzcnt size, index ; size = # of trailing 0 bits in index
  578. inc size ; ++size;
  579. test index, index ; if (index == 0)
  580. jz .ELOOP2 ; goto .ELOOP2;
  581. ; NOTE: size == 32 cannot happen, since the last element is always 0.
  582. shr index, sizeb ; index >>= size;
  583. lea size, [size + nbits - 33] ; size = size + nbits - 33;
  584. lea t, [t + size * SIZEOF_WORD] ; t += size;
  585. cmp size, 16 ; if (size <= 16)
  586. jle .ERLOOP2 ; goto .ERLOOP2;
  ; Unlike .BRLOOP1 this is a real loop: the combined run can exceed 32, so the
  ; code for symbol 0xf0 may have to be emitted more than once.
  587. .BRLOOP2: ; do {
  588. movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
  589. ; nbits = actbl->ehufsi[0xf0];
  590. sub size, 16 ; size -= 16;
  591. movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
  592. ; code = actbl->ehufco[0xf0];
  593. sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
  594. jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
  595. movd mm_nbits, nbits ; else { nbits --> MMX register
  596. psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
  597. por mm_put_buffer, mm_code ; put_buffer |= code;
  598. cmp size, 16 ; if (size <= 16)
  599. jle .ERLOOP2 ; goto .ERLOOP2;
  600. jmp .BRLOOP2 ; } while (1);
  601. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  602. align 16
  603. .BLOOP2: ; do { /* size = # of zero bits/elements to skip */
  604. shr index, sizeb ; index >>= size;
  605. lea t, [t + size * SIZEOF_WORD] ; t += size;
  606. cmp size, 16 ; if (size > 16)
  607. jg .BRLOOP2 ; goto .BRLOOP2;
  ; Encode the coefficient at *t; same symbol computation as in .ERLOOP1.
  608. .ERLOOP2: ; .ERLOOP2:
  609. movsx nbits, word [t] ; nbits = *t;
  610. add size, size ; size += size;
  611. movd mm_temp, nbits ; temp = nbits;
  612. movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
  613. movd mm_nbits, nbits ; nbits --> MMX register
  614. lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
  615. movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
  616. ; code = actbl->ehufco[size-16];
  617. movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
  618. ; size = actbl->ehufsi[size-16];
  619. psllq mm_code, mm_nbits ; code <<= nbits;
  620. pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
  621. lea nbits, [nbits + size] ; nbits += size;
  622. por mm_code, mm_temp ; code |= temp;
  623. xor size, size ; size = 0; /* kill tzcnt input dependency */
  624. sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
  625. jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
  626. tzcnt size, index ; size = # of trailing 0 bits in index
  627. movd mm_nbits, nbits ; nbits --> MMX register
  628. psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
  629. inc size ; ++size;
  630. por mm_put_buffer, mm_code ; put_buffer |= code;
  631. test index, index
  632. jnz .BLOOP2 ; } while (index != 0);
  ; End of block.  t points at the last coefficient that was coded; its byte
  ; offset inside the 128-byte-aligned scratch area tells whether that was the
  ; final coefficient (offset 62 * 2).  If it was not, the Huffman code for
  ; symbol 0 is appended.  Afterwards the bit-buffer state is written back to
  ; *state, the saved registers are restored, and the updated output pointer is
  ; returned in eax (buffer).
  633. .ELOOP2: ; .ELOOP2:
  634. mov nbits, t ; nbits = t;
  635. lea t, [t + SIZEOF_WORD] ; t = &t[1];
  636. and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
  637. and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
  638. cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
  639. je .EFN ; {
  640. movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
  641. ; code = actbl->ehufco[0];
  642. movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
  643. ; nbits = actbl->ehufsi[0];
  644. sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
  645. jg .EFN_SKIP_EMIT_CODE ; {
  646. EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
  647. align 16
  648. .EFN_SKIP_EMIT_CODE: ; } else {
  649. movd mm_nbits, nbits ; nbits --> MMX register
  650. psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
  651. por mm_put_buffer, mm_code ; put_buffer |= code;
  652. .EFN: ; } }
  ; Reload esp from the slot stored just above t_ at function entry; this
  ; releases the scratch area in one step.
  653. %define frame esp
  654. mov frame, [t + save_frame]
  655. %define state ecx
  656. mov state, [frame + arg_state]
  657. movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
  658. ; state->cur.put_buffer.simd = put_buffer;
  659. emms ; leave MMX state before returning to the caller
  660. mov [state + working_state.cur.free_bits], free_bits
  661. ; state->cur.free_bits = free_bits;
  662. POP edi
  663. POP esi
  664. POP ebp
  665. POP ebx
  666. ret ; return value: eax = buffer
  667. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; Out-of-line flush paths.  Each stub is reached when adding a code would
  ; overfill the 64-bit bit buffer; it expands EMIT_QWORD to write the buffer
  ; out and then resumes the loop it came from.  The instructions passed as
  ; %4-%8 repeat the loop tail that the in-line (no flush) path would have
  ; executed, so control can return straight to the loop head or loop exit.
  668. align 16
  ; Flush while emitting symbol 0xf0 in round 1; ebx serves as the temp
  ; register.
  669. .EMIT_BRLOOP1:
  670. EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \
  671. .ERLOOP1
  672. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  673. align 16
  ; Flush while emitting a coefficient in round 1; size is free to serve as the
  ; temp register because it is recomputed by the tzcnt sequence passed in.
  674. .EMIT_ERLOOP1:
  675. EMIT_QWORD size, sizeb, sizeh, \
  676. { xor size, size }, \
  677. { tzcnt size, index }, \
  678. { inc size }, \
  679. { test index, index }, \
  680. { jnz .BLOOP1 }, \
  681. .ELOOP1
  682. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  683. align 16
  ; Flush while emitting symbol 0xf0 in round 2; repeats the loop-exit test of
  ; .BRLOOP2.
  684. .EMIT_BRLOOP2:
  685. EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \
  686. { cmp size, 16 }, \
  687. { jle .ERLOOP2 }, \
  688. .BRLOOP2
  689. ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  690. align 16
  ; Flush while emitting a coefficient in round 2.
  691. .EMIT_ERLOOP2:
  692. EMIT_QWORD size, sizeb, sizeh, \
  693. { xor size, size }, \
  694. { tzcnt size, index }, \
  695. { inc size }, \
  696. { test index, index }, \
  697. { jnz .BLOOP2 }, \
  698. .ELOOP2
  699. ; For some reason, the OS X linker does not honor the request to align the
  700. ; segment unless we do this.
  701. align 32