jquant-mmx.asm 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. ;
  2. ; Sample data conversion and quantization (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, 2024, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  12. %include "jsimdext.inc"
  13. %include "jdct.inc"
  14. ; --------------------------------------------------------------------------
  15. SECTION SEG_TEXT
  16. BITS 32
  ;
  ; Load data into workspace, applying unsigned->signed conversion
  ;
  ; GLOBAL(void)
  ; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
  ;                    DCTELEM *workspace);
  ;
  ; Stack arguments (addressed through ebp after the prologue):
  ;   [ebp + 8]   sample_data  - array of row pointers (JSAMPROW)
  ;   [ebp + 12]  start_col    - column offset applied to every row
  ;   [ebp + 16]  workspace    - destination buffer of DCTELEMs
  ;
  ; The loop below runs DCTSIZE/4 times.  Every pass reads one MMWORD from
  ; each of four consecutive rows, zero-extends the bytes to words, adds
  ; 0xFF80 (-128 as a signed word) to each word and writes two MMWORDs per
  ; row into the workspace.
  ;
  ; Register usage:
  ;   saved/restored : ebp, ebx, esi, edi
  ;   overwritten    : eax, ecx, edx (need not be preserved), mm0-mm7, flags
  ;   emms is issued before returning.
  ;

  %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
  %define start_col    ebp + 12           ; JDIMENSION start_col
  %define workspace    ebp + 16           ; DCTELEM *workspace

      align       32
      GLOBAL_FUNCTION(jsimd_convsamp_mmx)

  EXTN(jsimd_convsamp_mmx):
      push        ebp
      mov         ebp, esp
      push        ebx
      push        esi
      push        edi

      ; Cursors and loop counter.
      mov         edi, POINTER [workspace]              ; edi = output cursor (DCTELEM *)
      mov         esi, JSAMPARRAY [sample_data]         ; esi = row-pointer cursor (JSAMPROW *)
      mov         eax, JDIMENSION [start_col]           ; eax = column offset
      mov         ecx, DCTSIZE/4                        ; ecx = groups of four rows left

      ; Loop-invariant MMX constants.
      pcmpeqw     mm7, mm7
      psllw       mm7, 7                                ; mm7 = {0xFF80 0xFF80 0xFF80 0xFF80}
      pxor        mm6, mm6                              ; mm6 = all zero (unpack filler)

      ALIGNX      16, 7
  .rowgroup:
      ; Fetch one MMWORD from each of the four rows before anything is stored.
      mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) row 0
      mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) row 1
      movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; mm0 = (01234567)
      movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; mm1 = (89ABCDEF)
      mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) row 2
      mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) row 3
      movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; mm2 = (GHIJKLMN)
      movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; mm3 = (OPQRSTUV)

      ; Row 0: low half stays in mm0, high half goes to mm4.
      movq        mm4, mm0
      punpcklbw   mm0, mm6                              ; mm0 = (0123) as words
      punpckhbw   mm4, mm6                              ; mm4 = (4567) as words
      paddw       mm0, mm7
      paddw       mm4, mm7
      movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
      movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4

      ; Row 1: low half stays in mm1, high half goes to mm5.
      movq        mm5, mm1
      punpcklbw   mm1, mm6                              ; mm1 = (89AB) as words
      punpckhbw   mm5, mm6                              ; mm5 = (CDEF) as words
      paddw       mm1, mm7
      paddw       mm5, mm7
      movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
      movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

      ; Row 2: mm0 is free again and receives the high half.
      movq        mm0, mm2
      punpcklbw   mm2, mm6                              ; mm2 = (GHIJ) as words
      punpckhbw   mm0, mm6                              ; mm0 = (KLMN) as words
      paddw       mm2, mm7
      paddw       mm0, mm7
      movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
      movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0

      ; Row 3: mm4 is free again and receives the high half.
      movq        mm4, mm3
      punpcklbw   mm3, mm6                              ; mm3 = (OPQR) as words
      punpckhbw   mm4, mm6                              ; mm4 = (STUV) as words
      paddw       mm3, mm7
      paddw       mm4, mm7
      movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
      movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

      ; Step both cursors past the four rows just handled.
      add         esi, byte 4*SIZEOF_JSAMPROW
      add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
      dec         ecx
      jnz         short .rowgroup

      emms                                              ; empty MMX state

      pop         edi
      pop         esi
      pop         ebx
      pop         ebp
      ret
  94. ; --------------------------------------------------------------------------
  ;
  ; Quantize/descale the coefficients, and store into coef_block
  ;
  ; This implementation is based on an algorithm described in
  ;   "Optimizing subroutines in assembly language:
  ;   An optimization guide for x86 platforms" (https://agner.org/optimize).
  ;
  ; GLOBAL(void)
  ; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
  ;                    DCTELEM *workspace);
  ;
  ; Stack arguments (addressed through ebp after the prologue):
  ;   [ebp + 8]   coef_block  - output coefficients
  ;   [ebp + 12]  divisors    - table read through the RECIPROCAL / CORRECTION /
  ;                             SCALE offsets defined below
  ;   [ebp + 16]  workspace   - input values
  ;
  ; Loop shape: an outer count of 2 (ah) times an inner count of DCTSIZE2/8/2
  ; (al).  Each inner pass handles two MMWORDs and then advances all three
  ; pointers by 8 elements.
  ;
  ; Per-word computation:
  ;   1. take the absolute value, remembering the sign mask;
  ;   2. add the correction word;
  ;   3. take the high word of the unsigned product with the reciprocal;
  ;   4. take the high word of the unsigned product with the scale;
  ;   5. re-apply the remembered sign and store.
  ;
  ; Two MMX limitations shape steps 3 and 4:
  ;
  ;   - All multiplications are signed.
  ;   - The second operand for the shifts is not treated as packed.
  ;
  ; The first is handled by emulating an unsigned multiply with a signed one:
  ;
  ;   unsigned long unsigned_multiply(unsigned short x, unsigned short y)
  ;   {
  ;     enum { SHORT_BIT = 16 };
  ;     signed short sx = (signed short)x;
  ;     signed short sy = (signed short)y;
  ;     signed long sz;
  ;
  ;     sz = (long)sx * (long)sy;               /* signed multiply */
  ;
  ;     if (sx < 0) sz += (long)sy << SHORT_BIT;
  ;     if (sy < 0) sz += (long)sx << SHORT_BIT;
  ;
  ;     return (unsigned long)sz;
  ;   }
  ;
  ; (note that a negative sx adds _sy_ and vice versa)
  ;
  ; The second is handled by replacing the per-element shift with a multiply
  ; by the scale word, which brings back the signed-multiply issue and is why
  ; step 4 applies both corrections.
  ;
  ; Register usage:
  ;   saved/restored : ebp, esi, edi
  ;   overwritten    : eax (al/ah loop counters), edx (need not be preserved),
  ;                    mm0-mm7, flags
  ;   not used       : ebx, ecx
  ;   emms is issued before returning.
  ;

  %define RECIPROCAL(m, n, b) \
    MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
  %define CORRECTION(m, n, b) \
    MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
  %define SCALE(m, n, b) \
    MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
  %define SHIFT(m, n, b) \
    MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)

  %define coef_block  ebp + 8             ; JCOEFPTR coef_block
  %define divisors    ebp + 12            ; DCTELEM *divisors
  %define workspace   ebp + 16            ; DCTELEM *workspace

      align       32
      GLOBAL_FUNCTION(jsimd_quantize_mmx)

  EXTN(jsimd_quantize_mmx):
      push        ebp
      mov         ebp, esp
      push        esi
      push        edi

      mov         edi, JCOEFPTR [coef_block]            ; edi = output cursor
      mov         edx, POINTER [divisors]               ; edx = divisor-table cursor
      mov         esi, POINTER [workspace]              ; esi = input cursor
      mov         ah, 2                                 ; outer pass count
      ALIGNX      16, 7
  .outer:
      mov         al, DCTSIZE2/8/2                      ; inner pass count
      ALIGNX      16, 7
  .inner:
      ; Step 1: mm0/mm1 = |input|, mm2/mm3 = sign masks kept for step 5.
      movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
      movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
      movq        mm2, mm0
      movq        mm3, mm1
      psraw       mm2, (WORD_BIT-1)                     ; -1 if value < 0, 0 otherwise
      psraw       mm3, (WORD_BIT-1)
      pxor        mm0, mm2                              ; flip the bits of negative words...
      pxor        mm1, mm3
      psubw       mm0, mm2                              ; ...and add 1 to them (negation)
      psubw       mm1, mm3

      ; Step 2: correction + roundfactor.
      paddw       mm0, MMWORD [CORRECTION(0,0,edx)]
      paddw       mm1, MMWORD [CORRECTION(0,1,edx)]

      ; Step 3: multiply by the reciprocal.  The original notes state that
      ; the reciprocal always has its MSB set (negative as a signed word) and
      ; that the input is never negative here because it was inverted in
      ; step 1, so the only fix-up needed is to add the pre-multiply value.
      movq        mm4, mm0                              ; keep the pre-multiply value
      movq        mm5, mm1
      pmulhw      mm0, MMWORD [RECIPROCAL(0,0,edx)]
      pmulhw      mm1, MMWORD [RECIPROCAL(0,1,edx)]
      paddw       mm0, mm4
      paddw       mm1, mm5

      ; Step 4: multiply by the scale.  Both the scale and mm0/mm1 can be
      ; negative as signed words here, so both fix-ups are applied.
      movq        mm4, mm0                              ; keep the pre-multiply value
      movq        mm5, mm1
      movq        mm6, MMWORD [SCALE(0,0,edx)]          ; scale
      movq        mm7, MMWORD [SCALE(0,1,edx)]
      pmulhw      mm0, mm6
      pmulhw      mm1, mm7

      psraw       mm6, (WORD_BIT-1)                     ; mask: scale is negative
      psraw       mm7, (WORD_BIT-1)
      pand        mm6, mm4                              ; pre-multiply value where it is
      pand        mm7, mm5
      paddw       mm0, mm6
      paddw       mm1, mm7

      psraw       mm4, (WORD_BIT-1)                     ; mask: pre-multiply value is negative
      psraw       mm5, (WORD_BIT-1)
      pand        mm4, MMWORD [SCALE(0,0,edx)]          ; scale where it is
      pand        mm5, MMWORD [SCALE(0,1,edx)]
      paddw       mm0, mm4
      paddw       mm1, mm5

      ; Step 5: negate the words whose input was negative, then store.
      pxor        mm0, mm2
      pxor        mm1, mm3
      psubw       mm0, mm2
      psubw       mm1, mm3
      movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
      movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

      ; Advance all three cursors by the eight elements just handled.
      add         esi, byte 8*SIZEOF_DCTELEM
      add         edx, byte 8*SIZEOF_DCTELEM
      add         edi, byte 8*SIZEOF_JCOEF
      dec         al
      jnz         near .inner
      dec         ah
      jnz         near .outer                           ; original note: "to avoid branch
                                                        ; misprediction"

      emms                                              ; empty MMX state

      pop         edi
      pop         esi
      pop         ebp
      ret
  228. ; For some reason, the OS X linker does not honor the request to align the
  229. ; segment unless we do this.
  230. align 32