;
; jquantf-sse2.asm - Sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  ; Project-local include files.
  ; NOTE(review): presumably these supply the macros and constants used
  ; below (EXTN, GLOBAL_FUNCTION, ALIGNX, XMMBLOCK, SEG_TEXT, DCTSIZE,
  ; SIZEOF_*, ...); they are not visible from this file - confirm against
  ; the headers.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
  ;
  ; Load data into workspace, applying unsigned->signed conversion
  ;
  ; GLOBAL(void)
  ; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
  ;                           FAST_FLOAT *workspace);
  ;
  ; Arguments are read from the stack relative to the ebp frame set up in
  ; the prologue ([ebp+8], [ebp+12], [ebp+16]).
  ;
  ; In:      sample_data = array of row pointers; DCTSIZE consecutive entries
  ;                        are read (two per loop iteration)
  ;          start_col   = sample offset applied to every row pointer
  ;          workspace   = output buffer; written with movaps, so it must be
  ;                        16-byte aligned
  ; Out:     workspace receives, per input row, eight samples converted to
  ;          single-precision floats after subtracting 0x80 from each byte
  ; Saved:   ebp, ebx, esi, edi
  ; Clobber: eax, ecx, edx, xmm0-xmm3, xmm7, flags
  ;
  ; NOTE(review): each row contributes exactly one 8-byte movq load, so the
  ; code assumes DCTSIZE == 8 and one-byte samples - confirm in jdct.inc.
  ;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Build the per-byte bias constant without touching memory:
    ; 0xFFFF per word -> 0xFF80 per word -> saturating pack -> 0x80 per byte.
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2          ; ecx = row pairs remaining
    ALIGNX      16, 7
.convloop:
    ; Fetch the next two row pointers.
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    ; Load eight samples from each row, starting at column start_col.
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Subtract 0x80 from every byte (unsigned -> signed range).
    psubb       xmm0, xmm7              ; xmm0=(01234567)
    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)

    ; Widen byte -> dword by placing each sample in the top byte of its
    ; dword; the arithmetic right shift below then sign-extends it.
    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)

    ; xmm2/xmm3 are used as destinations without being initialized: their
    ; stale contents land in the low word of each dword.
    ; NOTE(review): this relies on (DWORD_BIT-BYTE_BIT) >= 16 so that the
    ; shift discards those stale bits - presumably 32-8 = 24; confirm.
    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)

    ; Sign-extend and convert the first row to float.
    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
    cvtdq2ps    xmm2, xmm2              ; xmm2=(0123)
    cvtdq2ps    xmm0, xmm0              ; xmm0=(4567)

    ; Sign-extend and convert the second row to float.
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
    cvtdq2ps    xmm3, xmm3              ; xmm3=(89AB)
    cvtdq2ps    xmm1, xmm1              ; xmm1=(CDEF)

    ; Store both converted rows (aligned stores).
    ; NOTE(review): XMMBLOCK(m,n,base,size) is defined elsewhere; presumably
    ; it addresses XMM word n of row m - confirm in jsimdext.inc.
    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

    ; Advance past the two row pointers and the two output rows just done.
    add         esi, byte 2*SIZEOF_JSAMPROW
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         ecx
    jnz         short .convloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
  ; --------------------------------------------------------------------------
  ;
  ; Quantize/descale the coefficients, and store into coef_block
  ;
  ; GLOBAL(void)
  ; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
  ;                           FAST_FLOAT *workspace);
  ;
  ; Arguments are read from the stack relative to the ebp frame set up in
  ; the prologue ([ebp+8], [ebp+12], [ebp+16]).
  ;
  ; In:      workspace  = input floats; DCTSIZE2 values are consumed
  ;          divisors   = per-coefficient factors that are MULTIPLIED into
  ;                       the workspace values (presumably reciprocals of
  ;                       the quantization steps - verify against the code
  ;                       that fills this table)
  ;          coef_block = output; DCTSIZE2 16-bit coefficients are written
  ; Out:     coef_block[i] = saturate_int16(round(workspace[i]*divisors[i]))
  ;          where the float->int rounding follows the current MXCSR
  ;          rounding mode (cvtps2dq)
  ; Saved:   ebp, esi, edi
  ; Clobber: eax, edx, xmm0-xmm3, flags
  ;
  ; All three buffers are accessed with aligned moves (movaps/movdqa and
  ; memory-operand mulps), so each must be 16-byte aligned.
  ;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/16        ; eax = 16-coefficient groups remaining
    ALIGNX      16, 7
.quantloop:
    ; Load 16 floats (four XMM words) and scale each by its divisor entry.
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    ; Float -> int32, rounded per the current MXCSR rounding mode.
    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3

    ; Narrow int32 -> int16 with signed saturation; each pack merges two
    ; XMM words of four dwords into one XMM word of eight words.
    packssdw    xmm0, xmm1
    packssdw    xmm2, xmm3

    ; Store the 16 quantized coefficients (aligned stores).
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

    ; Advance all three pointers by the 16 elements just processed.
    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         short .quantloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret
  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
    align       32