jquant-sse.asm 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. ;
  2. ; Sample data conversion and quantization (SSE & MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, 2024, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  12. %include "jsimdext.inc"
  13. %include "jdct.inc"
  14. ; --------------------------------------------------------------------------
  15. SECTION SEG_TEXT
  16. BITS 32
  ;
  ; Load sample data into the workspace, converting each unsigned sample
  ; byte to a signed single-precision float.
  ;
  ; GLOBAL(void)
  ; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
  ;                          FAST_FLOAT *workspace);
  ;
  ; In:       arguments are read from the stack through ebp (aliases below)
  ; Out:      no return value; two rows of 8 floats are stored to the
  ;           workspace per loop pass, DCTSIZE/2 passes in total
  ; Saved:    ebp, ebx, esi, edi
  ; Unsaved:  eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags
  ; Notes:    - workspace is written with movaps, so it has to be 16-byte
  ;             aligned
  ;           - MMX state is emptied (emms) before returning
  ;           - in the lane diagrams, each hex digit names one sample and
  ;             '*' marks a don't-care byte
  ;

  %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
  %define start_col    ebp + 12           ; JDIMENSION start_col
  %define workspace    ebp + 16           ; FAST_FLOAT *workspace

      align       32
      GLOBAL_FUNCTION(jsimd_convsamp_float_sse)

  EXTN(jsimd_convsamp_float_sse):
      push        ebp
      mov         ebp, esp
      push        ebx
      ; ecx and edx are used below without being saved
      push        esi
      push        edi

      ; Build the bias constant: every byte of mm7 becomes 0x80.
      pcmpeqw     mm7, mm7                ; all bits set
      psllw       mm7, 7                  ; each word = 0xFF80
      packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

      mov         esi, JSAMPARRAY [sample_data]   ; esi -> row pointers (JSAMPROW *)
      mov         eax, JDIMENSION [start_col]     ; eax = column offset
      mov         edi, POINTER [workspace]        ; edi -> output (FAST_FLOAT *)
      mov         ecx, DCTSIZE/2                  ; ecx = passes left, 2 rows each
      ALIGNX      16, 7

  .convloop:
      ; Fetch the two row pointers handled by this pass.
      mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
      mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

      ; Load 8 bytes from each row, starting at start_col.
      movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
      movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

      ; Remove the 0x80 bias: unsigned byte -> signed byte.
      psubb       mm0, mm7                ; mm0=(01234567)
      psubb       mm1, mm7                ; mm1=(89ABCDEF)

      ; Widen bytes to words with the sample in the HIGH byte of each word.
      ; The low byte is whatever the destination held before; the
      ; arithmetic shifts further down discard it.
      punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
      punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
      punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
      punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)

      ; First row: widen words to dwords (sample in the top byte), then
      ; shift right arithmetically to get sign-extended 32-bit integers.
      punpcklwd   mm4, mm2                ; mm4=(***0***1)
      punpckhwd   mm2, mm2                ; mm2=(***2***3)
      punpcklwd   mm5, mm0                ; mm5=(***4***5)
      punpckhwd   mm0, mm0                ; mm0=(***6***7)

      psrad       mm4, (DWORD_BIT-BYTE_BIT)   ; mm4=(01)
      psrad       mm2, (DWORD_BIT-BYTE_BIT)   ; mm2=(23)
      cvtpi2ps    xmm0, mm4               ; xmm0=(01**)
      cvtpi2ps    xmm1, mm2               ; xmm1=(23**)
      psrad       mm5, (DWORD_BIT-BYTE_BIT)   ; mm5=(45)
      psrad       mm0, (DWORD_BIT-BYTE_BIT)   ; mm0=(67)
      cvtpi2ps    xmm2, mm5               ; xmm2=(45**)
      cvtpi2ps    xmm3, mm0               ; xmm3=(67**)

      ; Second row: same widening and conversion (mm4 is reused here).
      punpcklwd   mm6, mm3                ; mm6=(***8***9)
      punpckhwd   mm3, mm3                ; mm3=(***A***B)
      punpcklwd   mm4, mm1                ; mm4=(***C***D)
      punpckhwd   mm1, mm1                ; mm1=(***E***F)

      psrad       mm6, (DWORD_BIT-BYTE_BIT)   ; mm6=(89)
      psrad       mm3, (DWORD_BIT-BYTE_BIT)   ; mm3=(AB)
      cvtpi2ps    xmm4, mm6               ; xmm4=(89**)
      cvtpi2ps    xmm5, mm3               ; xmm5=(AB**)
      psrad       mm4, (DWORD_BIT-BYTE_BIT)   ; mm4=(CD)
      psrad       mm1, (DWORD_BIT-BYTE_BIT)   ; mm1=(EF)
      cvtpi2ps    xmm6, mm4               ; xmm6=(CD**)
      cvtpi2ps    xmm7, mm1               ; xmm7=(EF**)

      ; Join pairs of floats into full four-float vectors.
      movlhps     xmm0, xmm1              ; xmm0=(0123)
      movlhps     xmm2, xmm3              ; xmm2=(4567)
      movlhps     xmm4, xmm5              ; xmm4=(89AB)
      movlhps     xmm6, xmm7              ; xmm6=(CDEF)

      ; Store both converted rows (aligned stores).
      movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
      movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
      movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
      movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

      ; Step to the next two input rows / output rows.
      add         esi, byte 2*SIZEOF_JSAMPROW
      add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
      dec         ecx
      jnz         near .convloop

      emms                                ; empty MMX state

      pop         edi
      pop         esi
      ; edx and ecx were never saved, so nothing to restore for them
      pop         ebx
      pop         ebp
      ret
  100. ; --------------------------------------------------------------------------
  ;
  ; Quantize/descale the float coefficients in the workspace and store the
  ; results into coef_block as packed 16-bit integers.
  ;
  ; GLOBAL(void)
  ; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
  ;                          FAST_FLOAT *workspace);
  ;
  ; In:       arguments are read from the stack through ebp (aliases below)
  ; Out:      no return value; 16 coefficients are written to coef_block per
  ;           loop pass, DCTSIZE2/16 passes in total
  ; Saved:    ebp, esi, edi
  ; Unsaved:  eax, edx, mm0-mm7, xmm0-xmm7, flags
  ; Notes:    - workspace is read with movaps and divisors are used as mulps
  ;             memory operands, so both have to be 16-byte aligned
  ;           - each workspace value is MULTIPLIED by the matching divisors
  ;             entry; presumably the table holds reciprocals -- confirm
  ;             against the code that fills it
  ;           - cvtps2pi rounds according to the current MXCSR rounding mode
  ;           - packssdw saturates to the signed 16-bit range
  ;           - MMX state is emptied (emms) before returning
  ;

  %define coef_block  ebp + 8             ; JCOEFPTR coef_block
  %define divisors    ebp + 12            ; FAST_FLOAT *divisors
  %define workspace   ebp + 16            ; FAST_FLOAT *workspace

      align       32
      GLOBAL_FUNCTION(jsimd_quantize_float_sse)

  EXTN(jsimd_quantize_float_sse):
      push        ebp
      mov         ebp, esp
      ; ebx and ecx are not used; edx is used without being saved
      push        esi
      push        edi

      mov         esi, POINTER [workspace]        ; esi -> input floats
      mov         edx, POINTER [divisors]         ; edx -> multiplier table
      mov         edi, JCOEFPTR [coef_block]      ; edi -> output coefficients
      mov         eax, DCTSIZE2/16                ; eax = passes left, 16 values each
      ALIGNX      16, 7

  .quantloop:
      ; Load 16 floats and scale each by its table entry.
      movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
      mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
      mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
      movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
      mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
      mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

      ; cvtps2pi converts only the low two floats of its source, so the
      ; upper halves are first copied down into xmm4/xmm5.
      movhlps     xmm4, xmm0              ; xmm4 low = upper two of xmm0
      movhlps     xmm5, xmm1              ; xmm5 low = upper two of xmm1

      cvtps2pi    mm0, xmm0               ; mm0 = int32 of xmm0 floats 0,1
      cvtps2pi    mm1, xmm1               ; mm1 = int32 of xmm1 floats 0,1
      cvtps2pi    mm4, xmm4               ; mm4 = int32 of xmm0 floats 2,3
      cvtps2pi    mm5, xmm5               ; mm5 = int32 of xmm1 floats 2,3

      ; Same split-and-convert for the second pair of vectors.
      movhlps     xmm6, xmm2              ; xmm6 low = upper two of xmm2
      movhlps     xmm7, xmm3              ; xmm7 low = upper two of xmm3

      cvtps2pi    mm2, xmm2               ; mm2 = int32 of xmm2 floats 0,1
      cvtps2pi    mm3, xmm3               ; mm3 = int32 of xmm3 floats 0,1
      cvtps2pi    mm6, xmm6               ; mm6 = int32 of xmm2 floats 2,3
      cvtps2pi    mm7, xmm7               ; mm7 = int32 of xmm3 floats 2,3

      ; Narrow each group of four int32 to four int16 (signed saturation).
      packssdw    mm0, mm4
      packssdw    mm1, mm5
      packssdw    mm2, mm6
      packssdw    mm3, mm7

      ; Store the 16 quantized coefficients.
      movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
      movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
      movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
      movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

      ; Advance all three pointers by 16 elements.
      add         esi, byte 16*SIZEOF_FAST_FLOAT
      add         edx, byte 16*SIZEOF_FAST_FLOAT
      add         edi, byte 16*SIZEOF_JCOEF
      dec         eax
      jnz         short .quantloop

      emms                                ; empty MMX state

      pop         edi
      pop         esi
      ; edx, ecx and ebx were never saved, so nothing to restore for them
      pop         ebp
      ret
  168. ; For some reason, the OS X linker does not honor the request to align the
  169. ; segment unless we do this.
  170. align 32