; jquant-3dn.asm
;
; Sample data conversion and quantization (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  12. %include "jsimdext.inc"
  13. %include "jdct.inc"
  14. ; --------------------------------------------------------------------------
  15. SECTION SEG_TEXT
  16. BITS 32
  17. ;
  18. ; Load data into workspace, applying unsigned->signed conversion
  19. ;
  20. ; GLOBAL(void)
  21. ; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
  22. ; FAST_FLOAT *workspace);
  23. ;
  24. %define sample_data ebp + 8 ; JSAMPARRAY sample_data
  25. %define start_col ebp + 12 ; JDIMENSION start_col
  26. %define workspace ebp + 16 ; FAST_FLOAT *workspace
  27. align 32
  28. GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
  29. EXTN(jsimd_convsamp_float_3dnow):
  30. push ebp
  31. mov ebp, esp
  32. push ebx
  33. ; push ecx ; need not be preserved
  34. ; push edx ; need not be preserved
  35. push esi
  36. push edi
  37. pcmpeqw mm7, mm7
  38. psllw mm7, 7
  39. packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
  40. mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
  41. mov eax, JDIMENSION [start_col]
  42. mov edi, POINTER [workspace] ; (DCTELEM *)
  43. mov ecx, DCTSIZE/2
  44. ALIGNX 16, 7
  45. .convloop:
  46. mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  47. mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  48. movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
  49. movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
  50. psubb mm0, mm7 ; mm0=(01234567)
  51. psubb mm1, mm7 ; mm1=(89ABCDEF)
  52. punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
  53. punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
  54. punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
  55. punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
  56. punpcklwd mm4, mm2 ; mm4=(***0***1)
  57. punpckhwd mm2, mm2 ; mm2=(***2***3)
  58. punpcklwd mm5, mm0 ; mm5=(***4***5)
  59. punpckhwd mm0, mm0 ; mm0=(***6***7)
  60. psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
  61. psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
  62. pi2fd mm4, mm4
  63. pi2fd mm2, mm2
  64. psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
  65. psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
  66. pi2fd mm5, mm5
  67. pi2fd mm0, mm0
  68. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
  69. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
  70. movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
  71. movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
  72. punpcklwd mm6, mm3 ; mm6=(***8***9)
  73. punpckhwd mm3, mm3 ; mm3=(***A***B)
  74. punpcklwd mm4, mm1 ; mm4=(***C***D)
  75. punpckhwd mm1, mm1 ; mm1=(***E***F)
  76. psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
  77. psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
  78. pi2fd mm6, mm6
  79. pi2fd mm3, mm3
  80. psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
  81. psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
  82. pi2fd mm4, mm4
  83. pi2fd mm1, mm1
  84. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
  85. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
  86. movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
  87. movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
  88. add esi, byte 2*SIZEOF_JSAMPROW
  89. add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
  90. dec ecx
  91. jnz near .convloop
  92. femms ; empty MMX/3DNow! state
  93. pop edi
  94. pop esi
  95. ; pop edx ; need not be preserved
  96. ; pop ecx ; need not be preserved
  97. pop ebx
  98. pop ebp
  99. ret
  100. ; --------------------------------------------------------------------------
  101. ;
  102. ; Quantize/descale the coefficients, and store into coef_block
  103. ;
  104. ; GLOBAL(void)
  105. ; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
  106. ; FAST_FLOAT *workspace);
  107. ;
  108. %define coef_block ebp + 8 ; JCOEFPTR coef_block
  109. %define divisors ebp + 12 ; FAST_FLOAT *divisors
  110. %define workspace ebp + 16 ; FAST_FLOAT *workspace
  111. align 32
  112. GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
  113. EXTN(jsimd_quantize_float_3dnow):
  114. push ebp
  115. mov ebp, esp
  116. ; push ebx ; unused
  117. ; push ecx ; unused
  118. ; push edx ; need not be preserved
  119. push esi
  120. push edi
  121. mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
  122. movd mm7, eax
  123. punpckldq mm7, mm7 ; mm7={12582912.0F 12582912.0F}
  124. mov esi, POINTER [workspace]
  125. mov edx, POINTER [divisors]
  126. mov edi, JCOEFPTR [coef_block]
  127. mov eax, DCTSIZE2/16
  128. ALIGNX 16, 7
  129. .quantloop:
  130. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
  131. movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
  132. pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
  133. pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
  134. movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
  135. movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
  136. pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
  137. pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
  138. pfadd mm0, mm7 ; mm0=(00 ** 01 **)
  139. pfadd mm1, mm7 ; mm1=(02 ** 03 **)
  140. pfadd mm2, mm7 ; mm0=(04 ** 05 **)
  141. pfadd mm3, mm7 ; mm1=(06 ** 07 **)
  142. movq mm4, mm0
  143. punpcklwd mm0, mm1 ; mm0=(00 02 ** **)
  144. punpckhwd mm4, mm1 ; mm4=(01 03 ** **)
  145. movq mm5, mm2
  146. punpcklwd mm2, mm3 ; mm2=(04 06 ** **)
  147. punpckhwd mm5, mm3 ; mm5=(05 07 ** **)
  148. punpcklwd mm0, mm4 ; mm0=(00 01 02 03)
  149. punpcklwd mm2, mm5 ; mm2=(04 05 06 07)
  150. movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
  151. movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
  152. pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
  153. pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
  154. movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
  155. movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
  156. pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
  157. pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
  158. pfadd mm6, mm7 ; mm0=(10 ** 11 **)
  159. pfadd mm1, mm7 ; mm4=(12 ** 13 **)
  160. pfadd mm3, mm7 ; mm0=(14 ** 15 **)
  161. pfadd mm4, mm7 ; mm4=(16 ** 17 **)
  162. movq mm5, mm6
  163. punpcklwd mm6, mm1 ; mm6=(10 12 ** **)
  164. punpckhwd mm5, mm1 ; mm5=(11 13 ** **)
  165. movq mm1, mm3
  166. punpcklwd mm3, mm4 ; mm3=(14 16 ** **)
  167. punpckhwd mm1, mm4 ; mm1=(15 17 ** **)
  168. punpcklwd mm6, mm5 ; mm6=(10 11 12 13)
  169. punpcklwd mm3, mm1 ; mm3=(14 15 16 17)
  170. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
  171. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
  172. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
  173. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
  174. add esi, byte 16*SIZEOF_FAST_FLOAT
  175. add edx, byte 16*SIZEOF_FAST_FLOAT
  176. add edi, byte 16*SIZEOF_JCOEF
  177. dec eax
  178. jnz near .quantloop
  179. femms ; empty MMX/3DNow! state
  180. pop edi
  181. pop esi
  182. ; pop edx ; need not be preserved
  183. ; pop ecx ; unused
  184. ; pop ebx ; unused
  185. pop ebp
  186. ret
  187. ; For some reason, the OS X linker does not honor the request to align the
  188. ; segment unless we do this.
  189. align 32