;
; jcphuff-sse2.asm
;
; Prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
  ; Project-wide definitions.  NOTE(review): the code below relies on names
  ; that are not defined in this file (INT, XMMWORD, SIZEOF_INT,
  ; SIZEOF_XMMWORD, DCTSIZE2, SEG_TEXT, EXTN, GLOBAL_FUNCTION); presumably
  ; they all come from this include -- confirm.
  %include "jsimdext.inc"

  ; --------------------------------------------------------------------------
      SECTION     SEG_TEXT
      BITS        32                  ; 32-bit code: arguments are read from the stack

  ; --------------------------------------------------------------------------
  ; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
  ; jsimd_encode_mcu_AC_refine_prepare_sse2()
  ;
  ; None of the LOADxx macros take parameters.  They operate on symbolic names
  ; (LUT, BLOCK, T0, T1, X0, X1, N0, N1 and, for the partial loads, LENEND)
  ; that the invoking function must %define before expanding them.

  ; LOAD16_PAIR lane
  ;
  ; Helper for LOAD16.  Gathers two coefficients into word lane %1:
  ;   X0[%1] = BLOCK[LUT[%1]]
  ;   X1[%1] = BLOCK[LUT[%1 + 8]]
  ; LUT entries are INT-sized indices; BLOCK entries are 16-bit words.
  ; Clobbers T0 and T1.
  %macro LOAD16_PAIR 1
      mov         T0, INT [LUT + (%1 + 0)*SIZEOF_INT]
      mov         T1, INT [LUT + (%1 + 8)*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], %1
      pinsrw      X1, word [BLOCK + T1 * 2], %1
  %endmacro

  ; LOAD16
  ;
  ; Gathers 16 coefficients, in the order given by LUT[0..15], into X0
  ; (LUT[0..7]) and X1 (LUT[8..15]), and clears N0 and N1.
  ; X0/X1 are not cleared first because every one of their eight word lanes
  ; is overwritten.
  ; Clobbers T0, T1.
  %macro LOAD16 0
      pxor        N0, N0
      pxor        N1, N1

      LOAD16_PAIR 0
      LOAD16_PAIR 1
      LOAD16_PAIR 2
      LOAD16_PAIR 3
      LOAD16_PAIR 4
      LOAD16_PAIR 5
      LOAD16_PAIR 6
      LOAD16_PAIR 7
  %endmacro
  ; LOAD15
  ;
  ; Gathers between 9 and 15 coefficients through LUT:
  ;   * X0 always receives the eight coefficients BLOCK[LUT[0..7]];
  ;   * X1 is cleared, lane 0 is always loaded from BLOCK[LUT[8]], and lane n
  ;     (n = 1..6) is loaded from BLOCK[LUT[8 + n]] only while LENEND > n.
  ;     Lanes that are not loaded stay zero; lane 7 is never loaded.
  ; N0 and N1 are cleared.
  ;
  ; In:      LENEND = number of coefficients destined for X1.  Both callers in
  ;          this file pass (length & 7) with a non-zero result, i.e. 1..7.
  ; Clobbers T0, T1 and the flags.
  %macro LOAD15 0
      pxor        N0, N0
      pxor        N1, N1
      pxor        X1, X1              ; partial load: unused lanes must be zero

      mov         T0, INT [LUT +  0*SIZEOF_INT]
      mov         T1, INT [LUT +  8*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 0
      pinsrw      X1, word [BLOCK + T1 * 2], 0

      ; Remaining seven lanes of X0 are loaded unconditionally.
      mov         T0, INT [LUT +  1*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 1
      mov         T0, INT [LUT +  2*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 2
      mov         T0, INT [LUT +  3*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 3
      mov         T0, INT [LUT +  4*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 4
      mov         T0, INT [LUT +  5*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 5
      mov         T0, INT [LUT +  6*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 6
      mov         T0, INT [LUT +  7*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 7

      ; Lanes 1..6 of X1: stop as soon as LENEND is exhausted.
      cmp         LENEND, 2
      jl          %%.ELOAD15
      mov         T1, INT [LUT +  9*SIZEOF_INT]
      pinsrw      X1, word [BLOCK + T1 * 2], 1

      cmp         LENEND, 3
      jl          %%.ELOAD15
      mov         T1, INT [LUT + 10*SIZEOF_INT]
      pinsrw      X1, word [BLOCK + T1 * 2], 2

      cmp         LENEND, 4
      jl          %%.ELOAD15
      mov         T1, INT [LUT + 11*SIZEOF_INT]
      pinsrw      X1, word [BLOCK + T1 * 2], 3

      cmp         LENEND, 5
      jl          %%.ELOAD15
      mov         T1, INT [LUT + 12*SIZEOF_INT]
      pinsrw      X1, word [BLOCK + T1 * 2], 4

      cmp         LENEND, 6
      jl          %%.ELOAD15
      mov         T1, INT [LUT + 13*SIZEOF_INT]
      pinsrw      X1, word [BLOCK + T1 * 2], 5

      cmp         LENEND, 7
      jl          %%.ELOAD15
      mov         T1, INT [LUT + 14*SIZEOF_INT]
      pinsrw      X1, word [BLOCK + T1 * 2], 6
  %%.ELOAD15:
  %endmacro
  ; LOAD8_LANE lane
  ;
  ; Helper for LOAD8: X0[%1] = BLOCK[LUT[%1]].  Clobbers T0.
  %macro LOAD8_LANE 1
      mov         T0, INT [LUT + %1*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], %1
  %endmacro

  ; LOAD8
  ;
  ; Gathers exactly eight coefficients, BLOCK[LUT[0..7]], into X0 and clears
  ; N0.  X0 is not cleared first because all eight word lanes are overwritten.
  ; X1 and N1 are left untouched.
  ; Clobbers T0.
  %macro LOAD8 0
      pxor        N0, N0

      LOAD8_LANE  0
      LOAD8_LANE  1
      LOAD8_LANE  2
      LOAD8_LANE  3
      LOAD8_LANE  4
      LOAD8_LANE  5
      LOAD8_LANE  6
      LOAD8_LANE  7
  %endmacro
  ; LOAD7
  ;
  ; Gathers up to seven coefficients into X0:
  ;   * X0 is cleared, lane 0 is always loaded from BLOCK[LUT[0]], and lane n
  ;     (n = 1..6) is loaded from BLOCK[LUT[n]] only while LENEND > n.
  ;     Lanes that are not loaded stay zero; lane 7 is never loaded.
  ; N0 is cleared.  X1 and N1 are left untouched.
  ;
  ; In:      LENEND = number of coefficients to load.  Both callers in this
  ;          file pass (length & 7).  Lane 0 is loaded even if LENEND is 0.
  ; Clobbers T1 and the flags (T0 is not used).
  %macro LOAD7 0
      pxor        N0, N0
      pxor        X0, X0              ; partial load: unused lanes must be zero

      mov         T1, INT [LUT + 0*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 0

      cmp         LENEND, 2
      jl          %%.ELOAD7
      mov         T1, INT [LUT + 1*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 1

      cmp         LENEND, 3
      jl          %%.ELOAD7
      mov         T1, INT [LUT + 2*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 2

      cmp         LENEND, 4
      jl          %%.ELOAD7
      mov         T1, INT [LUT + 3*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 3

      cmp         LENEND, 5
      jl          %%.ELOAD7
      mov         T1, INT [LUT + 4*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 4

      cmp         LENEND, 6
      jl          %%.ELOAD7
      mov         T1, INT [LUT + 5*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 5

      cmp         LENEND, 7
      jl          %%.ELOAD7
      mov         T1, INT [LUT + 6*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 6
  %%.ELOAD7:
  %endmacro
  ; REDUCE0
  ;
  ; Builds a 64-bit "non-zero" bitmap for the 64 words starting at VALUES and
  ; stores it through the pointer held in ZEROBITS: bit k is 1 when word k is
  ; non-zero.  Bits 0..31 go to the first INT, bits 32..63 to the second.
  ;
  ; Requirements (all satisfied by both callers in this file):
  ;   * VALUES is 16-byte aligned (the words are read with movdqa);
  ;   * ZERO is xmm7 and still holds all zeroes.  The macro hard-codes xmm7:
  ;     the last group of eight words is compared by using the zeroed xmm7 as
  ;     the destination operand, which saves one load.
  ;   * ZEROBITS is a memory operand holding the destination pointer.
  ;
  ; Clobbers eax, ecx, edx, esi, edi (so VALUES, which is edi in both callers,
  ; is lost), xmm0-xmm7 (so ZERO is no longer zero afterwards) and the flags.
  %macro REDUCE0 0
      movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
      movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
      movdqa      xmm2, XMMWORD [VALUES + (16*2)]
      movdqa      xmm3, XMMWORD [VALUES + (24*2)]
      movdqa      xmm4, XMMWORD [VALUES + (32*2)]
      movdqa      xmm5, XMMWORD [VALUES + (40*2)]
      movdqa      xmm6, XMMWORD [VALUES + (48*2)]

      ; Each word becomes 0xFFFF if it was zero, 0x0000 otherwise.
      pcmpeqw     xmm0, ZERO
      pcmpeqw     xmm1, ZERO
      pcmpeqw     xmm2, ZERO
      pcmpeqw     xmm3, ZERO
      pcmpeqw     xmm4, ZERO
      pcmpeqw     xmm5, ZERO
      pcmpeqw     xmm6, ZERO
      pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]   ; xmm7 is ZERO on entry

      ; Narrow the word masks to byte masks, 16 coefficients per register.
      packsswb    xmm0, xmm1
      packsswb    xmm2, xmm3
      packsswb    xmm4, xmm5
      packsswb    xmm6, xmm7

      ; One bit per coefficient: 1 = zero at this point.
      pmovmskb    eax, xmm0                   ; coefficients  0..15
      pmovmskb    ecx, xmm2                   ; coefficients 16..31
      pmovmskb    edx, xmm4                   ; coefficients 32..47
      pmovmskb    esi, xmm6                   ; coefficients 48..63

      shl         ecx, 16
      shl         esi, 16
      or          eax, ecx                    ; eax = bits  0..31
      or          edx, esi                    ; edx = bits 32..63
      not         eax                         ; flip polarity: 1 = non-zero
      not         edx

      mov         edi, ZEROBITS               ; edi = destination pointer
      mov         INT [edi], eax
      mov         INT [edi+SIZEOF_INT], edx
  %endmacro
  ;
  ; Prepare data for jsimd_encode_mcu_AC_first().
  ;
  ; GLOBAL(void)
  ; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
  ;                                        const int *jpeg_natural_order_start,
  ;                                        int Sl, int Al, JCOEF *values,
  ;                                        size_t *zerobits)
  ;
  ; eax + 8 = const JCOEF *block
  ; eax + 12 = const int *jpeg_natural_order_start
  ; eax + 16 = int Sl
  ; eax + 20 = int Al
  ; eax + 24 = JCOEF *values
  ; eax + 28 = size_t *zerobits
  ;
  ; What the code below does, for k = 0 .. Sl-1 with
  ; c = block[jpeg_natural_order_start[k]]:
  ;   values[k]            = |c| >> Al
  ;   values[k + DCTSIZE2] = (|c| >> Al) XOR (c < 0 ? 0xFFFF : 0)
  ; The first DCTSIZE2 entries of values[] are then zero-padded beyond the
  ; coefficients that were processed, and REDUCE0 stores a 64-bit map of the
  ; non-zero values[0..DCTSIZE2-1] through zerobits (two INT stores).
  ; Coefficients are handled in groups of 8 or 16, so values[k + DCTSIZE2] is
  ; written up to the end of the last group (unloaded lanes are written as 0);
  ; that upper half is not otherwise padded.
  ;
  ; values is written with movdqa and therefore must be 16-byte aligned.
  ; NOTE(review): Sl is presumably in 1..DCTSIZE2 -- the code does not check
  ; it; confirm against the caller.
  ;
  ; ebx, ecx, esi, edi and ebp are preserved; eax, edx, xmm0-xmm7 and the
  ; flags are clobbered.

  %define ZERO      xmm7
  %define X0        xmm0
  %define X1        xmm1
  %define N0        xmm2
  %define N1        xmm3
  %define AL        xmm4
  %define K         eax               ; group counter (shares eax with LENEND)
  %define LENEND    eax               ; Sl & 7, consumed by LOAD15 / LOAD7
  %define LUT       ebx
  %define T0        ecx
  %define T1        edx
  %define BLOCK     esi
  %define VALUES    edi
  %define LEN       ebp
  %define ZEROBITS  INT [esp + 5 * 4]   ; local slot just above the 5 saved registers

      align       32
      GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

  EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
      push        ebp
      mov         eax, esp                      ; eax -> saved ebp; arguments start at eax + 8
      sub         esp, byte 4
      and         esp, byte (-SIZEOF_XMMWORD)   ; align to 128 bits
      mov         [esp], eax                    ; remember the unaligned stack pointer
      mov         ebp, esp                      ; ebp = aligned ebp
      sub         esp, 4                        ; room for ZEROBITS
      push        ebx
      push        ecx
  ;   push        edx                           ; need not be preserved
      push        esi
      push        edi
      push        ebp                           ; ebp is reused as LEN below

      ; Fetch every argument before eax is recycled as K / LENEND.
      mov         BLOCK, INT [eax + 8]
      mov         LUT, INT [eax + 12]
      mov         VALUES, INT [eax + 24]
      movd        AL, INT [eax + 20]
      mov         T0, INT [eax + 28]
      mov         ZEROBITS, T0
      mov         LEN, INT [eax + 16]
      pxor        ZERO, ZERO

      ; K = number of complete 16-coefficient groups.
      mov         K, LEN
      and         K, -16
      shr         K, 4
      jz          .ELOOP16

  .BLOOP16:
      LOAD16
      pcmpgtw     N0, X0                        ; N = 0xFFFF where the coefficient is negative
      pcmpgtw     N1, X1
      paddw       X0, N0                        ; (x + N) ^ N  ==  |x|
      paddw       X1, N1
      pxor        X0, N0
      pxor        X1, N1
      psrlw       X0, AL                        ; |x| >> Al
      psrlw       X1, AL
      pxor        N0, X0                        ; complement of the shifted value for negatives
      pxor        N1, X1
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (8) * 2], X1
      movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
      movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
      add         VALUES, 16*2
      add         LUT, 16*SIZEOF_INT
      dec         K
      jnz         .BLOOP16
      test        LEN, 15
      je          .PADDING                      ; Sl is a multiple of 16: no tail

  .ELOOP16:
      ; Tail of 1..15 coefficients: pick the 9..15, exactly-8 or 1..7 variant.
      mov         LENEND, LEN
      and         LENEND, 7

      test        LEN, 8
      jz          .TRY7
      test        LEN, 7
      jz          .TRY8

      LOAD15
      pcmpgtw     N0, X0
      pcmpgtw     N1, X1
      paddw       X0, N0
      paddw       X1, N1
      pxor        X0, N0
      pxor        X1, N1
      psrlw       X0, AL
      psrlw       X1, AL
      pxor        N0, X0
      pxor        N1, X1
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (8) * 2], X1
      movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
      movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
      add         VALUES, 16*2
      jmp         .PADDING

  .TRY8:
      LOAD8
      pcmpgtw     N0, X0
      paddw       X0, N0
      pxor        X0, N0
      psrlw       X0, AL
      pxor        N0, X0
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
      add         VALUES, 8*2
      jmp         .PADDING

  .TRY7:
      LOAD7
      pcmpgtw     N0, X0
      paddw       X0, N0
      pxor        X0, N0
      psrlw       X0, AL
      pxor        N0, X0
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
      add         VALUES, 8*2

  .PADDING:
      ; K = (groups of 8 already written) - DCTSIZE2/8, i.e. minus the number
      ; of 8-word groups that still have to be zeroed.
      mov         K, LEN
      add         K, 7
      and         K, -8
      shr         K, 3
      sub         K, DCTSIZE2/8
      jz          .EPADDING
      align       16
  .ZEROLOOP:
      movdqa      XMMWORD [VALUES + 0], ZERO
      add         VALUES, 8*2
      inc         K                             ; counts up towards zero
      jnz         .ZEROLOOP

  .EPADDING:
      sub         VALUES, DCTSIZE2*2            ; rewind to the start of values[]

      REDUCE0

      pop         ebp
      pop         edi
      pop         esi
  ;   pop         edx                           ; need not be preserved
      pop         ecx
      pop         ebx
      mov         esp, ebp                      ; esp <- aligned ebp
      pop         esp                           ; esp <- original ebp
      pop         ebp
      ret

  %undef ZERO
  %undef X0
  %undef X1
  %undef N0
  %undef N1
  %undef AL
  %undef K
  %undef LENEND
  %undef LUT
  %undef T0
  %undef T1
  %undef BLOCK
  %undef VALUES
  %undef LEN
  %undef ZEROBITS
  ;
  ; Prepare data for jsimd_encode_mcu_AC_refine().
  ;
  ; GLOBAL(int)
  ; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
  ;                                         const int *jpeg_natural_order_start,
  ;                                         int Sl, int Al, JCOEF *absvalues,
  ;                                         size_t *bits)
  ;
  ; eax + 8 = const JCOEF *block
  ; eax + 12 = const int *jpeg_natural_order_start
  ; eax + 16 = int Sl
  ; eax + 20 = int Al
  ; eax + 24 = JCOEF *absvalues
  ; eax + 28 = size_t *bits
  ;
  ; What the code below does, for k = 0 .. Sl-1 with
  ; c = block[jpeg_natural_order_start[k]]:
  ;   absvalues[k] = |c| >> Al
  ; absvalues[] is then zero-padded beyond the coefficients that were
  ; processed, up to DCTSIZE2 entries.
  ;
  ; Output through bits (viewed as an array of INTs):
  ;   bits[0..1]  64-bit map written by REDUCE0, bit k = 1 when
  ;               absvalues[k] != 0;
  ;   bits[2..3]  initialised to all ones, then overwritten 16 bits at a time
  ;               (one store per group processed) with the inverted sign mask:
  ;               a coefficient's bit is 0 when it is negative.
  ;
  ; Return value (eax): 0 if no shifted absolute value equals 1; otherwise,
  ; for the last processed group that contains such a value, the index k of
  ; the highest coefficient in that group with absvalues[k] == 1.
  ;
  ; absvalues is written with movdqa and therefore must be 16-byte aligned.
  ; NOTE(review): Sl is presumably in 1..DCTSIZE2 -- the code does not check
  ; it; confirm against the caller.
  ;
  ; ebx, ecx, esi, edi and ebp are preserved; edx, xmm0-xmm7 and the flags
  ; are clobbered.

  %define ZERO      xmm7
  %define ONE       xmm5                ; eight words, each equal to 1
  %define X0        xmm0
  %define X1        xmm1
  %define N0        xmm2
  %define N1        xmm3
  %define AL        xmm4
  %define K         eax                 ; group counter (shares eax with LENEND)
  %define LENEND    eax                 ; Sl, then Sl & 7 for LOAD15 / LOAD7
  %define LUT       ebx
  %define T0        ecx
  %define T0w       cx
  %define T1        edx
  %define BLOCK     esi
  %define VALUES    edi
  %define KK        ebp                 ; byte offset into the sign words (2 per 16 coefficients)
  %define ZEROBITS  INT [esp + 5 * 4]       ; local: copy of the bits pointer
  %define EOB       INT [esp + 5 * 4 + 4]   ; local: running return value
  %define LEN       INT [esp + 5 * 4 + 8]   ; local: copy of Sl

      align       32
      GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

  EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
      push        ebp
      mov         eax, esp                      ; eax -> saved ebp; arguments start at eax + 8
      sub         esp, byte 4
      and         esp, byte (-SIZEOF_XMMWORD)   ; align to 128 bits
      mov         [esp], eax                    ; remember the unaligned stack pointer
      mov         ebp, esp                      ; ebp = aligned ebp
      sub         esp, 16                       ; room for ZEROBITS, EOB, LEN (+ 4 unused)
      push        ebx
      push        ecx
  ;   push        edx                           ; need not be preserved
      push        esi
      push        edi
      push        ebp                           ; ebp is reused as KK below

      pcmpeqw     ONE, ONE                      ; all bits set ...
      psrlw       ONE, 15                       ; ... shifted down to 1 in every word

      ; Fetch every argument before eax is recycled as K / LENEND.
      mov         BLOCK, INT [eax + 8]
      mov         LUT, INT [eax + 12]
      mov         VALUES, INT [eax + 24]
      movd        AL, INT [eax + 20]
      mov         T0, INT [eax + 28]
      mov         K, INT [eax + 16]
      mov         INT [T0 + 2 * SIZEOF_INT], -1 ; sign words default to all ones
      mov         INT [T0 + 3 * SIZEOF_INT], -1
      mov         ZEROBITS, T0
      mov         LEN, K
      pxor        ZERO, ZERO
      and         K, -16
      mov         EOB, 0
      xor         KK, KK
      shr         K, 4                          ; K = number of complete 16-coefficient groups
      jz          .ELOOPR16

  .BLOOPR16:
      LOAD16
      pcmpgtw     N0, X0                        ; N = 0xFFFF where the coefficient is negative
      pcmpgtw     N1, X1
      paddw       X0, N0                        ; (x + N) ^ N  ==  |x|
      paddw       X1, N1
      pxor        X0, N0
      pxor        X1, N1
      psrlw       X0, AL                        ; |x| >> Al
      psrlw       X1, AL
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (8) * 2], X1
      pcmpeqw     X0, ONE                       ; mark values equal to 1
      pcmpeqw     X1, ONE
      packsswb    N0, N1
      packsswb    X0, X1
      pmovmskb    T0, N0                        ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
      mov         T1, ZEROBITS
      not         T0
      mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
      pmovmskb    T1, X0                        ; idx = _mm_movemask_epi8(x1);
      bsr         T1, T1                        ; idx = 16 - (__builtin_clz(idx)>>1);
      jz          .CONTINUER16                  ; if (idx) {
      lea         T1, [T1+KK*8]                 ; KK*8 = index of this group's first coefficient
      mov         EOB, T1                       ; EOB = k + idx;
  .CONTINUER16:
      add         VALUES, 16*2
      add         LUT, 16*SIZEOF_INT
      add         KK, 2
      dec         K
      jnz         .BLOOPR16
      test        LEN, 15
      je          .PADDINGR                     ; Sl is a multiple of 16: no tail

  .ELOOPR16:
      ; Tail of 1..15 coefficients: pick the 9..15, exactly-8 or 1..7 variant.
      mov         LENEND, LEN

      test        LENEND, 8
      jz          .TRYR7
      test        LENEND, 7
      jz          .TRYR8

      and         LENEND, 7
      LOAD15
      pcmpgtw     N0, X0
      pcmpgtw     N1, X1
      paddw       X0, N0
      paddw       X1, N1
      pxor        X0, N0
      pxor        X1, N1
      psrlw       X0, AL
      psrlw       X1, AL
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (8) * 2], X1
      pcmpeqw     X0, ONE
      pcmpeqw     X1, ONE
      packsswb    N0, N1
      packsswb    X0, X1
      pmovmskb    T0, N0                        ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
      mov         T1, ZEROBITS
      not         T0
      mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
      pmovmskb    T1, X0                        ; idx = _mm_movemask_epi8(x1);
      bsr         T1, T1                        ; idx = 16 - (__builtin_clz(idx)>>1);
      jz          .CONTINUER15                  ; if (idx) {
      lea         T1, [T1+KK*8]
      mov         EOB, T1                       ; EOB = k + idx;
  .CONTINUER15:
      add         VALUES, 16*2
      jmp         .PADDINGR

  .TRYR8:
      LOAD8
      pcmpgtw     N0, X0
      paddw       X0, N0
      pxor        X0, N0
      psrlw       X0, AL
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      pcmpeqw     X0, ONE
      packsswb    N0, ZERO                      ; only 8 coefficients: upper 8 mask bytes are 0
      packsswb    X0, ZERO
      pmovmskb    T0, N0                        ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
      mov         T1, ZEROBITS
      not         T0
      mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
      pmovmskb    T1, X0                        ; idx = _mm_movemask_epi8(x1);
      bsr         T1, T1                        ; idx = 16 - (__builtin_clz(idx)>>1);
      jz          .CONTINUER8                   ; if (idx) {
      lea         T1, [T1+KK*8]
      mov         EOB, T1                       ; EOB = k + idx;
  .CONTINUER8:
      add         VALUES, 8*2
      jmp         .PADDINGR

  .TRYR7:
      and         LENEND, 7
      LOAD7
      pcmpgtw     N0, X0
      paddw       X0, N0
      pxor        X0, N0
      psrlw       X0, AL
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      pcmpeqw     X0, ONE
      packsswb    N0, ZERO                      ; only 8 lanes: upper 8 mask bytes are 0
      packsswb    X0, ZERO
      pmovmskb    T0, N0                        ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
      mov         T1, ZEROBITS
      not         T0
      mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
      pmovmskb    T1, X0                        ; idx = _mm_movemask_epi8(x1);
      bsr         T1, T1                        ; idx = 16 - (__builtin_clz(idx)>>1);
      jz          .CONTINUER7                   ; if (idx) {
      lea         T1, [T1+KK*8]
      mov         EOB, T1                       ; EOB = k + idx;
  .CONTINUER7:
      add         VALUES, 8*2

  .PADDINGR:
      ; K = (groups of 8 already written) - DCTSIZE2/8, i.e. minus the number
      ; of 8-word groups that still have to be zeroed.
      mov         K, LEN
      add         K, 7
      and         K, -8
      shr         K, 3
      sub         K, DCTSIZE2/8
      jz          .EPADDINGR
      align       16
  .ZEROLOOPR:
      movdqa      XMMWORD [VALUES + 0], ZERO
      add         VALUES, 8*2
      inc         K                             ; counts up towards zero
      jnz         .ZEROLOOPR

  .EPADDINGR:
      sub         VALUES, DCTSIZE2*2            ; rewind to the start of absvalues[]

      REDUCE0

      mov         eax, EOB                      ; return value

      pop         ebp
      pop         edi
      pop         esi
  ;   pop         edx                           ; need not be preserved
      pop         ecx
      pop         ebx
      mov         esp, ebp                      ; esp <- aligned ebp
      pop         esp                           ; esp <- original ebp
      pop         ebp
      ret

  %undef ZERO
  %undef ONE
  %undef X0
  %undef X1
  %undef N0
  %undef N1
  %undef AL
  %undef K
  %undef KK
  %undef EOB
  %undef SIGN       ; NOTE(review): SIGN is never %define'd in the visible part of this file
  %undef LUT
  %undef T0
  %undef T0w
  %undef T1
  %undef BLOCK
  %undef VALUES
  %undef LEN
  %undef LENEND
  %undef ZEROBITS
  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
      align       32