jcphuff-sse2.asm 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628
  1. ;
  2. ; Prepare data for progressive Huffman encoding (64-bit SSE2)
  3. ;
  4. ; Copyright (C) 2016, 2018, Matthieu Darbois
  5. ; Copyright (C) 2023, Aliaksiej Kandracienka.
  6. ; Copyright (C) 2024, D. R. Commander.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  13. ;
  14. ; This file contains an SSE2 implementation of data preparation for progressive
  15. ; Huffman encoding. See jcphuff.c for more details.
  16. %include "jsimdext.inc"
  17. ; --------------------------------------------------------------------------
  18. SECTION SEG_TEXT
  19. BITS 64
  20. ; --------------------------------------------------------------------------
  ; Coefficient-gather macros used by jsimd_encode_mcu_AC_first_prepare_sse2()
  ; and jsimd_encode_mcu_AC_refine_prepare_sse2()
  ;
  ; LOAD16 -- gather 16 coefficients through the index table.
  ;
  ; For i = 0..7:  word lane i of X0 = BLOCK[LUT[i]]
  ;                word lane i of X1 = BLOCK[LUT[i + 8]]
  ; (BLOCK is indexed in 16-bit words, LUT in INT-sized entries.)
  ;
  ; Also clears N0 and N1.  Overwrites T0 and T1.  LUT itself is not advanced.

  %macro LOAD16 0
      pxor        N0, N0
      pxor        N1, N1

  %assign %%lane 0
  %rep 8
      mov         T0d, INT [LUT + (%%lane)*SIZEOF_INT]
      mov         T1d, INT [LUT + (%%lane + 8)*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], %%lane
      pinsrw      X1, word [BLOCK + T1 * 2], %%lane
  %assign %%lane %%lane + 1
  %endrep
  %endmacro
  ; LOAD15 -- gather 8 coefficients into X0 and a partial group into X1.
  ;
  ; X0 lanes 0..7 = BLOCK[LUT[0..7]]            (always loaded)
  ; X1 lane  0    = BLOCK[LUT[8]]               (always loaded)
  ; X1 lane  k    = BLOCK[LUT[8 + k]], k = 1..6 (only while LENEND > k; the
  ;                                              first failing check skips
  ;                                              all remaining lanes)
  ;
  ; X1 is zeroed first, so every lane that is not loaded (lane 7 included)
  ; reads as 0.  Also clears N0 and N1.  Overwrites T0, T1 and the flags.

  %macro LOAD15 0
      pxor        N0, N0
      pxor        N1, N1
      pxor        X1, X1

      ; Lane 0 of both vectors.
      mov         T0d, INT [LUT + 0*SIZEOF_INT]
      mov         T1d, INT [LUT + 8*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], 0
      pinsrw      X1, word [BLOCK + T1 * 2], 0

      ; Remaining seven lanes of X0 are unconditional.
  %assign %%lane 1
  %rep 7
      mov         T0d, INT [LUT + (%%lane)*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], %%lane
  %assign %%lane %%lane + 1
  %endrep

      ; Lanes 1..6 of X1 are guarded by LENEND (signed compare).
  %assign %%lane 1
  %rep 6
      cmp         LENEND, (%%lane + 1)
      jl          %%done
      mov         T1d, INT [LUT + (%%lane + 8)*SIZEOF_INT]
      pinsrw      X1, word [BLOCK + T1 * 2], %%lane
  %assign %%lane %%lane + 1
  %endrep
  %%done:
  %endmacro
  ; LOAD8 -- gather exactly 8 coefficients into X0.
  ;
  ; For i = 0..7:  word lane i of X0 = BLOCK[LUT[i]]
  ;
  ; Also clears N0.  Overwrites T0 only.

  %macro LOAD8 0
      pxor        N0, N0

  %assign %%lane 0
  %rep 8
      mov         T0d, INT [LUT + (%%lane)*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T0 * 2], %%lane
  %assign %%lane %%lane + 1
  %endrep
  %endmacro
  ; LOAD7 -- gather a partial group (at most 7 coefficients) into X0.
  ;
  ; X0 lane 0 = BLOCK[LUT[0]]               (always loaded)
  ; X0 lane k = BLOCK[LUT[k]], k = 1..6     (only while LENEND > k; the first
  ;                                          failing check skips the rest)
  ;
  ; X0 is zeroed first, so lanes that are not loaded (lane 7 included) read
  ; as 0.  Also clears N0.  Overwrites T1 and the flags.

  %macro LOAD7 0
      pxor        N0, N0
      pxor        X0, X0

      mov         T1d, INT [LUT + 0*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], 0

      ; Lanes 1..6 are guarded by LENEND (signed compare).
  %assign %%lane 1
  %rep 6
      cmp         LENEND, (%%lane + 1)
      jl          %%done
      mov         T1d, INT [LUT + (%%lane)*SIZEOF_INT]
      pinsrw      X0, word [BLOCK + T1 * 2], %%lane
  %assign %%lane %%lane + 1
  %endrep
  %%done:
  %endmacro
  ; REDUCE0 -- build a 64-bit "non-zero" bitmap of the 64 words at VALUES and
  ; store it to [r15].
  ;
  ; Each 16-word slice is compared against ZERO (all-ones word where the
  ; value is 0), narrowed to bytes and turned into a 16-bit mask with
  ; pmovmskb.  The four masks are merged into one quadword and inverted, so
  ; bit k of the stored result is 1 when VALUES[k] != 0.
  ;
  ; VALUES must be 16-byte aligned (movdqa loads).
  ; Overwrites xmm0-xmm7, rax, rcx, rdx, rsi and the flags.

  %macro REDUCE0 0
      ; words 0..15  -> bits 0..15 (eax)
      movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
      movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
      pcmpeqw     xmm0, ZERO
      pcmpeqw     xmm1, ZERO
      packsswb    xmm0, xmm1
      pmovmskb    eax, xmm0

      ; words 16..31 -> bits 16..31 (rcx)
      movdqa      xmm2, XMMWORD [VALUES + (16*2)]
      movdqa      xmm3, XMMWORD [VALUES + (24*2)]
      pcmpeqw     xmm2, ZERO
      pcmpeqw     xmm3, ZERO
      packsswb    xmm2, xmm3
      pmovmskb    ecx, xmm2
      shl         rcx, 16

      ; words 32..47 -> bits 32..47 (rdx)
      movdqa      xmm4, XMMWORD [VALUES + (32*2)]
      movdqa      xmm5, XMMWORD [VALUES + (40*2)]
      pcmpeqw     xmm4, ZERO
      pcmpeqw     xmm5, ZERO
      packsswb    xmm4, xmm5
      pmovmskb    edx, xmm4
      shl         rdx, 32

      ; words 48..63 -> bits 48..63 (rsi)
      movdqa      xmm6, XMMWORD [VALUES + (48*2)]
      movdqa      xmm7, XMMWORD [VALUES + (56*2)]
      pcmpeqw     xmm6, ZERO
      pcmpeqw     xmm7, ZERO
      packsswb    xmm6, xmm7
      pmovmskb    esi, xmm6
      shl         rsi, 48

      ; Merge the four slices, then flip "is zero" into "is non-zero".
      or          rax, rcx
      or          rdx, rsi
      or          rax, rdx
      not         rax
      mov         MMWORD [r15], rax
  %endmacro
  ;
  ; Prepare data for jsimd_encode_mcu_AC_first().
  ;
  ; GLOBAL(void)
  ; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
  ;                                        const int *jpeg_natural_order_start,
  ;                                        int Sl, int Al, JCOEF *values,
  ;                                        size_t *zerobits)
  ;
  ; After COLLECT_ARGS:
  ; r10 = const JCOEF *block
  ; r11 = const int *jpeg_natural_order_start
  ; r12 = int Sl   (number of coefficients to process)
  ; r13 = int Al   (right-shift count; r13d is reused as LENEND afterwards)
  ; r14 = JCOEF *values
  ; r15 = size_t *zerobits
  ;
  ; For each of the Sl coefficients x = block[jpeg_natural_order_start[k]]:
  ;   values[k]            = |x| >> Al
  ;   values[k + DCTSIZE2] = values[k] if x >= 0, else ~values[k]
  ; values[] from the last written 8-word group up to DCTSIZE2 is then
  ; zero-filled, and REDUCE0 stores the non-zero bitmap of values[0..63] to
  ; *zerobits.
  ;
  ; values must be 16-byte aligned (movdqa stores).

  %define ZERO    xmm9
  %define X0      xmm0
  %define X1      xmm1
  %define N0      xmm2
  %define N1      xmm3
  %define AL      xmm4
  %define K       eax
  %define LUT     r11
  %define T0      rcx
  %define T0d     ecx
  %define T1      rdx
  %define T1d     edx
  %define BLOCK   r10
  %define VALUES  r14
  %define LEN     r12d
  %define LENEND  r13d

  ; Convert and store the 16 coefficients held in X0/X1 (N0/N1 must be zero
  ; on entry), then advance VALUES past them.
  %macro ACFIRST_EMIT16 0
      pcmpgtw     N0, X0                  ; N = 0xFFFF where x < 0
      pcmpgtw     N1, X1
      paddw       X0, N0                  ; (x + N) ^ N  ==  |x|
      paddw       X1, N1
      pxor        X0, N0
      pxor        X1, N1
      psrlw       X0, AL                  ; |x| >> Al
      psrlw       X1, AL
      pxor        N0, X0                  ; complement of X where x < 0
      pxor        N1, X1
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (8) * 2], X1
      movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
      movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
      add         VALUES, 16*2
  %endmacro

  ; Same as ACFIRST_EMIT16 for the 8 coefficients held in X0 (N0 zero on
  ; entry).
  %macro ACFIRST_EMIT8 0
      pcmpgtw     N0, X0
      paddw       X0, N0
      pxor        X0, N0
      psrlw       X0, AL
      pxor        N0, X0
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
      add         VALUES, 8*2
  %endmacro

      align       32
      GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

  EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
      ENDBR64
      push        rbp
      mov         rbp, rsp
      and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
      sub         rsp, SIZEOF_XMMWORD
      ; Spill ZERO (xmm9); it is restored before returning -- presumably
      ; because xmm9 is callee-saved under the Win64 ABI.
      movdqa      XMMWORD [rsp], ZERO
      COLLECT_ARGS 6

      pxor        ZERO, ZERO
      movd        AL, r13d                ; grab Al before r13d becomes LENEND
      mov         LENEND, LEN
      and         LENEND, 7               ; LENEND = Sl % 8
      mov         K, LEN
      and         K, -16
      shr         K, 4                    ; K = number of full 16-groups
      jz          .tail

  .loop16:
      LOAD16
      ACFIRST_EMIT16
      add         LUT, 16*SIZEOF_INT
      dec         K
      jnz         .loop16

      test        LEN, 15
      jz          .pad                    ; Sl was a multiple of 16

  .tail:
      ; 1..15 coefficients remain.
      test        LEN, 8
      jz          .tail7                  ; fewer than 8 left
      test        LEN, 7
      jz          .tail8                  ; exactly 8 left

      LOAD15                              ; 9..15 left
      ACFIRST_EMIT16
      jmp         .pad

  .tail8:
      LOAD8
      ACFIRST_EMIT8
      jmp         .pad

  .tail7:
      LOAD7
      ACFIRST_EMIT8

  .pad:
      ; K = (number of 8-word groups written) - DCTSIZE2/8, i.e. minus the
      ; number of 8-word groups still to be cleared.
      mov         K, LEN
      add         K, 7
      and         K, -8
      shr         K, 3
      sub         K, DCTSIZE2/8
      jz          .pad_done
      align       16
  .zero_fill:
      movdqa      XMMWORD [VALUES + 0], ZERO
      add         VALUES, 8*2
      inc         K
      jnz         .zero_fill

  .pad_done:
      sub         VALUES, DCTSIZE2*2      ; rewind to values[0]

      REDUCE0                             ; *zerobits = non-zero bitmap

      UNCOLLECT_ARGS 6
      movdqa      ZERO, XMMWORD [rsp]
      mov         rsp, rbp
      pop         rbp
      ret

  %undef ZERO
  %undef X0
  %undef X1
  %undef N0
  %undef N1
  %undef AL
  %undef K
  %undef LUT
  %undef T0
  %undef T0d
  %undef T1
  %undef T1d
  %undef BLOCK
  %undef VALUES
  %undef LEN
  %undef LENEND
  ;
  ; Prepare data for jsimd_encode_mcu_AC_refine().
  ;
  ; GLOBAL(int)
  ; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
  ;                                         const int *jpeg_natural_order_start,
  ;                                         int Sl, int Al, JCOEF *absvalues,
  ;                                         size_t *bits)
  ;
  ; After COLLECT_ARGS:
  ; r10 = const JCOEF *block
  ; r11 = const int *jpeg_natural_order_start
  ; r12 = int Sl   (number of coefficients to process)
  ; r13 = int Al   (right-shift count; r13d is reused as LENEND afterwards)
  ; r14 = JCOEF *values
  ; r15 = size_t *bits
  ;
  ; For each of the Sl coefficients x = block[jpeg_natural_order_start[k]],
  ; values[k] = |x| >> Al; the rest of values[] up to DCTSIZE2 is zero-filled.
  ; bits[0] receives the non-zero bitmap of values[0..63] (REDUCE0) and
  ; bits[1] receives the inverted sign bitmap accumulated in SIGN.
  ; The return value (eax) is EOB: the index of the last coefficient whose
  ; shifted magnitude equals 1, or 0 if there is none.
  ;
  ; values must be 16-byte aligned (movdqa stores).

  %define ZERO    xmm9
  %define ONE     xmm5
  %define X0      xmm0
  %define X1      xmm1
  %define N0      xmm2
  %define N1      xmm3
  %define AL      xmm4
  %define K       eax
  %define KK      r9d
  %define EOB     r8d
  %define SIGN    rdi
  %define LUT     r11
  %define T0      rcx
  %define T0d     ecx
  %define T1      rdx
  %define T1d     edx
  %define BLOCK   r10
  %define VALUES  r14
  %define LEN     r12d
  %define LENEND  r13d

  ; Convert and store the 16 coefficients in X0/X1 (N0/N1 zero on entry).
  ; On exit N0 holds 16 sign bytes and X0 holds 16 "magnitude == 1" bytes.
  %macro ACREFINE_CONVERT16 0
      pcmpgtw     N0, X0                  ; N = 0xFFFF where x < 0
      pcmpgtw     N1, X1
      paddw       X0, N0                  ; (x + N) ^ N  ==  |x|
      paddw       X1, N1
      pxor        X0, N0
      pxor        X1, N1
      psrlw       X0, AL                  ; |x| >> Al
      psrlw       X1, AL
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      movdqa      XMMWORD [VALUES + (8) * 2], X1
      pcmpeqw     X0, ONE                 ; 0xFFFF where the magnitude is 1
      pcmpeqw     X1, ONE
      packsswb    N0, N1
      packsswb    X0, X1
  %endmacro

  ; Same as ACREFINE_CONVERT16 for the 8 coefficients in X0 (N0 zero on
  ; entry); the upper 8 bytes of N0 and X0 are filled from ZERO.
  %macro ACREFINE_CONVERT8 0
      pcmpgtw     N0, X0
      paddw       X0, N0
      pxor        X0, N0
      psrlw       X0, AL
      movdqa      XMMWORD [VALUES + (0) * 2], X0
      pcmpeqw     X0, ONE
      packsswb    N0, ZERO
      packsswb    X0, ZERO
  %endmacro

  ; Fold the byte masks left in N0 / X0 into SIGN and EOB, then advance
  ; VALUES.  %1 = number of coefficients in the group (16 or 8).
  %macro ACREFINE_TALLY 1
      pmovmskb    T0d, N0                 ; sign bit per coefficient
      pmovmskb    T1d, X0                 ; "magnitude == 1" bit per coefficient
      shr         SIGN, %1                ; make room at the top of SIGN
      shl         T0, (64 - %1)
      or          SIGN, T0
      bsr         T1d, T1d                ; highest set bit; ZF=1 if none set
      jz          %%no_one
      mov         EOB, KK
      add         EOB, T1d                ; EOB = group base + lane index
  %%no_one:
      add         VALUES, %1*2
  %endmacro

      align       32
      GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

  EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
      ENDBR64
      push        rbp
      mov         rbp, rsp
      and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
      sub         rsp, SIZEOF_XMMWORD
      ; Spill ZERO (xmm9); it is restored before returning -- presumably
      ; because xmm9 is callee-saved under the Win64 ABI.
      movdqa      XMMWORD [rsp], ZERO
      COLLECT_ARGS 6

      xor         SIGN, SIGN
      xor         EOB, EOB
      xor         KK, KK                  ; KK = index of the current group
      pxor        ZERO, ZERO
      pcmpeqw     ONE, ONE
      psrlw       ONE, 15                 ; every word lane of ONE = 1
      movd        AL, r13d                ; grab Al before r13d becomes LENEND
      mov         LENEND, LEN
      and         LENEND, 7               ; LENEND = Sl % 8
      mov         K, LEN
      and         K, -16
      shr         K, 4                    ; K = number of full 16-groups
      jz          .rtail

  .rloop16:
      LOAD16
      ACREFINE_CONVERT16
      ACREFINE_TALLY 16
      add         LUT, 16*SIZEOF_INT
      add         KK, 16
      dec         K
      jnz         .rloop16

      test        LEN, 15
      jz          .rpad                   ; Sl was a multiple of 16

  .rtail:
      ; 1..15 coefficients remain.
      test        LEN, 8
      jz          .rtail7                 ; fewer than 8 left
      test        LEN, 7
      jz          .rtail8                 ; exactly 8 left

      LOAD15                              ; 9..15 left
      ACREFINE_CONVERT16
      ACREFINE_TALLY 16
      jmp         .rpad

  .rtail8:
      LOAD8
      ACREFINE_CONVERT8
      ACREFINE_TALLY 8
      jmp         .rpad

  .rtail7:
      LOAD7
      ACREFINE_CONVERT8
      ACREFINE_TALLY 8

  .rpad:
      ; K = (number of 8-word groups written) - DCTSIZE2/8, i.e. minus the
      ; number of 8-word groups still to be cleared.
      mov         K, LEN
      add         K, 7
      and         K, -8
      shr         K, 3
      sub         K, DCTSIZE2/8
      jz          .rpad_done
      align       16
  .rzero_fill:
      movdqa      XMMWORD [VALUES + 0], ZERO
      shr         SIGN, 8                 ; keep SIGN in step with VALUES
      add         VALUES, 8*2
      inc         K
      jnz         .rzero_fill

  .rpad_done:
      not         SIGN
      sub         VALUES, DCTSIZE2*2      ; rewind to values[0]
      mov         MMWORD [r15+SIZEOF_MMWORD], SIGN  ; bits[1]

      REDUCE0                             ; bits[0] = non-zero bitmap

      mov         eax, EOB                ; return value
      UNCOLLECT_ARGS 6
      movdqa      ZERO, XMMWORD [rsp]
      mov         rsp, rbp
      pop         rbp
      ret

  %undef ZERO
  %undef ONE
  %undef X0
  %undef X1
  %undef N0
  %undef N1
  %undef AL
  %undef K
  %undef KK
  %undef EOB
  %undef SIGN
  %undef LUT
  %undef T0
  %undef T0d
  %undef T1
  %undef T1d
  %undef BLOCK
  %undef VALUES
  %undef LEN
  %undef LENEND
  553. ; For some reason, the OS X linker does not honor the request to align the
  554. ; segment unless we do this.
  555. align 32