jidctflt-3dn.asm 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. ;
  2. ; Floating-point IDCT (3DNow! & MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, 2024, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
  12. ;
  13. ; This file contains a floating-point implementation of the inverse DCT
  14. ; (Discrete Cosine Transform). The following code is based directly on
  15. ; the IJG's original jidctflt.c; see jidctflt.c for more details.
  16. %include "jsimdext.inc"
  17. %include "jdct.inc"
  18. ; --------------------------------------------------------------------------
  19. SECTION SEG_CONST ; constant table read by jsimd_idct_float_3dnow via GOTOFF(ebx, ...)
  20. ALIGNZ 32 ; alignment macro from the include files (presumably pads to a 32-byte boundary -- confirm in jsimdext.inc)
  21. GLOBAL_DATA(jconst_idct_float_3dnow)
  22. EXTN(jconst_idct_float_3dnow):
      ; Each PD_* entry is "times 2 dd": the same single-precision value
      ; twice, so one 64-bit memory operand feeds both lanes of a pfmul/pfadd.
  23. PD_1_414 times 2 dd 1.414213562373095048801689 ; sqrt(2); scales the even-part difference (inputs 2 - 6) and the odd-part (z11 - z13)
  24. PD_1_847 times 2 dd 1.847759065022573512256366 ; multiplies (z10 + z12) to form z5
  25. PD_1_082 times 2 dd 1.082392200292393968799446 ; multiplies z12 on the way to tmp10
  26. PD_2_613 times 2 dd 2.613125929752753055713286 ; multiplies z10 on the way to tmp12
      ; Rounding trick used in pass 2: floats of this magnitude are spaced 8
      ; apart, so after adding the magic value the low 16 bits of each float's
      ; bit pattern hold roundint(x/8), which the code then extracts with
      ; pand/pslld.
  27. PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
  28. PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; eight bytes of CENTERJSAMPLE, added with paddb after packsswb in pass 2
  29. ALIGNZ 32
  30. ; --------------------------------------------------------------------------
  31. SECTION SEG_TEXT
  32. BITS 32
  33. ;
  34. ; Perform dequantization and inverse DCT on one block of coefficients.
  35. ;
  36. ; GLOBAL(void)
  37. ; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
  38. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  39. ;
      ; Argument accessors. (b) must be the address of the saved ebp, i.e. the
      ; value of esp right after the function's initial "push ebp": the return
      ; address then sits at (b) + 4 and the four stack arguments follow.
  40. %define dct_table(b) (b) + 8 ; void *dct_table
  41. %define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
  42. %define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
  43. %define output_col(b) (b) + 20 ; JDIMENSION output_col
      ; Local frame, addressed from the re-aligned ebp set up in the prologue:
      ;   [ebp + 0]          slot where the prologue stores the (b) pointer above
      ;   wk(0) .. wk(1)     WK_NUM mmword scratch slots directly below ebp
      ;   workspace          DCTSIZE2 FAST_FLOATs directly below wk(0)
  44. %define original_ebp ebp + 0
  45. %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
  46. ; mmword wk[WK_NUM]
  47. %define WK_NUM 2 ; defined after wk(i), which is fine: %define bodies are expanded at the point of use
  48. %define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
  49. ; FAST_FLOAT workspace[DCTSIZE2]
  50. align 32
  51. GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
  52. EXTN(jsimd_idct_float_3dnow):
      ; Two-pass floating-point IDCT using 3DNow! (two floats per MMX register).
      ;   Pass 1: dequantize and transform two input columns per iteration,
      ;           writing the results transposed into workspace[].
      ;   Pass 2: transform two workspace rows per iteration, round, pack to
      ;           bytes, re-center and store 8 samples to each of two output rows.
      ; ebx, esi and edi are saved and restored; eax, ecx and edx are used as
      ; scratch. The MMX/3DNow! state is cleared with femms before returning.
      ;
      ; Prologue: build an MMWORD-aligned frame while remembering where the
      ; caller-side frame (saved ebp, return address, arguments) lives.
  53. push ebp
  54. mov eax, esp ; eax = address of the saved ebp (arguments are at eax+8 and up)
  55. sub esp, byte 4 ; guarantee room for the slot written two lines below
  56. and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
  57. mov [esp], eax ; remember the unaligned frame address at [original_ebp]
  58. mov ebp, esp ; ebp = aligned ebp
  59. lea esp, [workspace] ; reserve wk[] and workspace[] below the aligned ebp
  60. push ebx
  61. ; push ecx ; need not be preserved
  62. ; push edx ; need not be preserved
  63. push esi
  64. push edi
  65. GET_GOT ebx ; get GOT address
  66. ; ---- Pass 1: process columns from input, store into work array.
      ; Registers: edx = quantptr, esi = inptr, edi = wsptr, ecx = loop counter.
  67. ; mov eax, [original_ebp] ; (left disabled: the code relies on eax still holding this value from the prologue)
  68. mov edx, POINTER [dct_table(eax)] ; quantptr
  69. mov esi, JCOEFPTR [coef_block(eax)] ; inptr
  70. lea edi, [workspace] ; FAST_FLOAT *wsptr
  71. mov ecx, DCTSIZE/2 ; ctr
  72. ALIGNX 16, 7
  73. .columnloop:
  74. %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
      ; Shortcut: each dword load covers two adjacent 16-bit coefficients, so
      ; OR-ing entries 1..7 tests the AC terms of both columns at once.
  75. mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
  76. or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  77. jnz short .columnDCT
  78. PUSHPIC ebx ; save GOT address
  79. mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] ; ebx is borrowed as a second accumulator
  80. mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
  81. or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
  82. or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
  83. or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
  84. or eax, ebx ; ZF = 1 iff entries 3..7 are all zero in both columns
  85. POPPIC ebx ; restore GOT address (the jnz below relies on this macro leaving the flags untouched)
  86. jnz short .columnDCT
  87. ; -- AC terms all zero
  88. movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] ; the two DC coefficients
  89. punpcklwd mm0, mm0 ; duplicate each word into its own dword ...
  90. psrad mm0, (DWORD_BIT-WORD_BIT) ; ... then arithmetic shift = sign-extend to 32 bits
  91. pi2fd mm0, mm0 ; packed int32 -> packed float
  92. pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize
  93. movq mm1, mm0
  94. punpckldq mm0, mm0 ; mm0 = first column's value in both lanes
  95. punpckhdq mm1, mm1 ; mm1 = second column's value in both lanes
  96. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 ; fill the first column's 8 workspace outputs
  97. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
  98. movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
  99. movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
  100. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 ; fill the second column's 8 workspace outputs
  101. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
  102. movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
  103. movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
  104. jmp near .nextcolumn
  105. ALIGNX 16, 7
  106. %endif
  107. .columnDCT:
  108. ; -- Even part
      ; Load entries 0/2/4/6 for both columns, sign-extend the 16-bit
      ; coefficients, convert to float and multiply by the table at edx.
  109. movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
  110. movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  111. movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
  112. movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
  113. punpcklwd mm0, mm0
  114. punpcklwd mm1, mm1
  115. psrad mm0, (DWORD_BIT-WORD_BIT)
  116. psrad mm1, (DWORD_BIT-WORD_BIT)
  117. pi2fd mm0, mm0
  118. pi2fd mm1, mm1
  119. pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  120. pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  121. punpcklwd mm2, mm2
  122. punpcklwd mm3, mm3
  123. psrad mm2, (DWORD_BIT-WORD_BIT)
  124. psrad mm3, (DWORD_BIT-WORD_BIT)
  125. pi2fd mm2, mm2
  126. pi2fd mm3, mm3
  127. pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  128. pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  129. movq mm4, mm0
  130. movq mm5, mm1
  131. pfsub mm0, mm2 ; mm0=tmp11
  132. pfsub mm1, mm3
  133. pfadd mm4, mm2 ; mm4=tmp10
  134. pfadd mm5, mm3 ; mm5=tmp13
  135. pfmul mm1, [GOTOFF(ebx,PD_1_414)]
  136. pfsub mm1, mm5 ; mm1=tmp12
  137. movq mm6, mm4
  138. movq mm7, mm0
  139. pfsub mm4, mm5 ; mm4=tmp3
  140. pfsub mm0, mm1 ; mm0=tmp2
  141. pfadd mm6, mm5 ; mm6=tmp0
  142. pfadd mm7, mm1 ; mm7=tmp1
  143. movq MMWORD [wk(1)], mm4 ; tmp3 (spilled: all eight MMX registers are needed for the odd part)
  144. movq MMWORD [wk(0)], mm0 ; tmp2
  145. ; -- Odd part
  146. movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
  147. movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
  148. movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
  149. movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
  150. punpcklwd mm2, mm2
  151. punpcklwd mm3, mm3
  152. psrad mm2, (DWORD_BIT-WORD_BIT)
  153. psrad mm3, (DWORD_BIT-WORD_BIT)
  154. pi2fd mm2, mm2
  155. pi2fd mm3, mm3
  156. pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  157. pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  158. punpcklwd mm5, mm5
  159. punpcklwd mm1, mm1
  160. psrad mm5, (DWORD_BIT-WORD_BIT)
  161. psrad mm1, (DWORD_BIT-WORD_BIT)
  162. pi2fd mm5, mm5
  163. pi2fd mm1, mm1
  164. pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  165. pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  166. movq mm4, mm2
  167. movq mm0, mm5
  168. pfadd mm2, mm1 ; mm2=z11
  169. pfadd mm5, mm3 ; mm5=z13
  170. pfsub mm4, mm1 ; mm4=z12
  171. pfsub mm0, mm3 ; mm0=z10
  172. movq mm1, mm2
  173. pfsub mm2, mm5
  174. pfadd mm1, mm5 ; mm1=tmp7
  175. pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
  176. movq mm3, mm0
  177. pfadd mm0, mm4
  178. pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
  179. pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
  180. pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
  181. pfsubr mm3, mm0 ; mm3=tmp12 (reverse subtract: mm3 = mm0 - mm3)
  182. pfsub mm4, mm0 ; mm4=tmp10
  183. ; -- Final output stage
  184. pfsub mm3, mm1 ; mm3=tmp6
  185. movq mm5, mm6
  186. movq mm0, mm7
  187. pfadd mm6, mm1 ; mm6=data0=(00 01)
  188. pfadd mm7, mm3 ; mm7=data1=(10 11)
  189. pfsub mm5, mm1 ; mm5=data7=(70 71)
  190. pfsub mm0, mm3 ; mm0=data6=(60 61)
  191. pfsub mm2, mm3 ; mm2=tmp5
  192. movq mm1, mm6 ; transpose coefficients
  193. punpckldq mm6, mm7 ; mm6=(00 10)
  194. punpckhdq mm1, mm7 ; mm1=(01 11)
  195. movq mm3, mm0 ; transpose coefficients
  196. punpckldq mm0, mm5 ; mm0=(60 70)
  197. punpckhdq mm3, mm5 ; mm3=(61 71)
  198. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
  199. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
  200. movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
  201. movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
  202. movq mm7, MMWORD [wk(0)] ; mm7=tmp2
  203. movq mm5, MMWORD [wk(1)] ; mm5=tmp3
  204. pfadd mm4, mm2 ; mm4=tmp4
  205. movq mm6, mm7
  206. movq mm1, mm5
  207. pfadd mm7, mm2 ; mm7=data2=(20 21)
  208. pfadd mm5, mm4 ; mm5=data4=(40 41)
  209. pfsub mm6, mm2 ; mm6=data5=(50 51)
  210. pfsub mm1, mm4 ; mm1=data3=(30 31)
  211. movq mm0, mm7 ; transpose coefficients
  212. punpckldq mm7, mm1 ; mm7=(20 30)
  213. punpckhdq mm0, mm1 ; mm0=(21 31)
  214. movq mm3, mm5 ; transpose coefficients
  215. punpckldq mm5, mm6 ; mm5=(40 50)
  216. punpckhdq mm3, mm6 ; mm3=(41 51)
  217. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
  218. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
  219. movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
  220. movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
  221. .nextcolumn:
      ; Step to the next pair of columns: two coefficients and two multipliers
      ; to the right, and two full rows (2*DCTSIZE floats) down in the workspace.
  222. add esi, byte 2*SIZEOF_JCOEF ; coef_block
  223. add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
  224. add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
  225. dec ecx ; ctr
  226. jnz near .columnloop
  227. ; -- Prefetch the next coefficient block
      ; esi has advanced by 8 coefficients during pass 1, hence the
      ; (DCTSIZE2-8) term to reach the end of the current block.
  228. prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
  229. prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
  230. prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
  231. prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
  232. ; ---- Pass 2: process rows from work array, store into output array.
      ; Registers: esi = wsptr, edi = pointer into the output row-pointer
      ; array, eax = output_col, ecx = loop counter.
  233. mov eax, [original_ebp] ; eax = address of the saved ebp (eax was clobbered in pass 1)
  234. lea esi, [workspace] ; FAST_FLOAT *wsptr
  235. mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
  236. mov eax, JDIMENSION [output_col(eax)] ; loaded last because it overwrites the argument base in eax
  237. mov ecx, DCTSIZE/2 ; ctr
  238. ALIGNX 16, 7
  239. .rowloop:
  240. ; -- Even part
  241. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
  242. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
  243. movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
  244. movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
  245. movq mm4, mm0
  246. movq mm5, mm1
  247. pfsub mm0, mm2 ; mm0=tmp11
  248. pfsub mm1, mm3
  249. pfadd mm4, mm2 ; mm4=tmp10
  250. pfadd mm5, mm3 ; mm5=tmp13
  251. pfmul mm1, [GOTOFF(ebx,PD_1_414)]
  252. pfsub mm1, mm5 ; mm1=tmp12
  253. movq mm6, mm4
  254. movq mm7, mm0
  255. pfsub mm4, mm5 ; mm4=tmp3
  256. pfsub mm0, mm1 ; mm0=tmp2
  257. pfadd mm6, mm5 ; mm6=tmp0
  258. pfadd mm7, mm1 ; mm7=tmp1
  259. movq MMWORD [wk(1)], mm4 ; tmp3
  260. movq MMWORD [wk(0)], mm0 ; tmp2
  261. ; -- Odd part
  262. movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
  263. movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
  264. movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
  265. movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
  266. movq mm4, mm2
  267. movq mm0, mm5
  268. pfadd mm2, mm1 ; mm2=z11
  269. pfadd mm5, mm3 ; mm5=z13
  270. pfsub mm4, mm1 ; mm4=z12
  271. pfsub mm0, mm3 ; mm0=z10
  272. movq mm1, mm2
  273. pfsub mm2, mm5
  274. pfadd mm1, mm5 ; mm1=tmp7
  275. pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
  276. movq mm3, mm0
  277. pfadd mm0, mm4
  278. pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
  279. pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
  280. pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
  281. pfsubr mm3, mm0 ; mm3=tmp12 (reverse subtract: mm3 = mm0 - mm3)
  282. pfsub mm4, mm0 ; mm4=tmp10
  283. ; -- Final output stage
  284. pfsub mm3, mm1 ; mm3=tmp6
  285. movq mm5, mm6
  286. movq mm0, mm7
  287. pfadd mm6, mm1 ; mm6=data0=(00 10)
  288. pfadd mm7, mm3 ; mm7=data1=(01 11)
  289. pfsub mm5, mm1 ; mm5=data7=(07 17)
  290. pfsub mm0, mm3 ; mm0=data6=(06 16)
  291. pfsub mm2, mm3 ; mm2=tmp5
      ; Float -> integer conversion: adding PD_RNDINT_MAGIC leaves the rounded,
      ; descaled result in the low word of each dword; "**" marks the high
      ; word that is discarded by the pand/pslld steps below.
  292. movq mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
  293. pcmpeqd mm3, mm3 ; all ones ...
  294. psrld mm3, WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
  295. pfadd mm6, mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
  296. pfadd mm7, mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
  297. pfadd mm0, mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
  298. pfadd mm5, mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
  299. pand mm6, mm3 ; mm6=(00 -- 10 --)
  300. pslld mm7, WORD_BIT ; mm7=(-- 01 -- 11)
  301. pand mm0, mm3 ; mm0=(06 -- 16 --)
  302. pslld mm5, WORD_BIT ; mm5=(-- 07 -- 17)
  303. por mm6, mm7 ; mm6=(00 01 10 11)
  304. por mm0, mm5 ; mm0=(06 07 16 17)
  305. movq mm1, MMWORD [wk(0)] ; mm1=tmp2
  306. movq mm3, MMWORD [wk(1)] ; mm3=tmp3
  307. pfadd mm4, mm2 ; mm4=tmp4
  308. movq mm7, mm1
  309. movq mm5, mm3
  310. pfadd mm1, mm2 ; mm1=data2=(02 12)
  311. pfadd mm3, mm4 ; mm3=data4=(04 14)
  312. pfsub mm7, mm2 ; mm7=data5=(05 15)
  313. pfsub mm5, mm4 ; mm5=data3=(03 13)
  314. movq mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
  315. pcmpeqd mm4, mm4
  316. psrld mm4, WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
  317. pfadd mm3, mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
  318. pfadd mm7, mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
  319. pfadd mm1, mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
  320. pfadd mm5, mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
  321. pand mm3, mm4 ; mm3=(04 -- 14 --)
  322. pslld mm7, WORD_BIT ; mm7=(-- 05 -- 15)
  323. pand mm1, mm4 ; mm1=(02 -- 12 --)
  324. pslld mm5, WORD_BIT ; mm5=(-- 03 -- 13)
  325. por mm3, mm7 ; mm3=(04 05 14 15)
  326. por mm1, mm5 ; mm1=(02 03 12 13)
  327. movq mm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
  328. packsswb mm6, mm3 ; mm6=(00 01 10 11 04 05 14 15)
  329. packsswb mm1, mm0 ; mm1=(02 03 12 13 06 07 16 17)
  330. paddb mm6, mm2 ; add CENTERJSAMPLE to every signed-saturated byte
  331. paddb mm1, mm2
  332. movq mm4, mm6 ; transpose coefficients(phase 2)
  333. punpcklwd mm6, mm1 ; mm6=(00 01 02 03 10 11 12 13)
  334. punpckhwd mm4, mm1 ; mm4=(04 05 06 07 14 15 16 17)
  335. movq mm7, mm6 ; transpose coefficients(phase 3)
  336. punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07)
  337. punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17)
  338. PUSHPIC ebx ; save GOT address
  339. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = first output row pointer of this pair
  340. mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; ebx = second output row pointer (ebx borrowed as scratch)
  341. movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 ; store 8 samples at column output_col
  342. movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
  343. POPPIC ebx ; restore GOT address
  344. add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
  345. add edi, byte 2*SIZEOF_JSAMPROW ; advance to the next pair of output row pointers
  346. dec ecx ; ctr
  347. jnz near .rowloop
  348. femms ; empty MMX/3DNow! state
      ; Epilogue: undo the pushes, then unwind the aligned frame via the
      ; pointer stored at [ebp] by the prologue.
  349. pop edi
  350. pop esi
  351. ; pop edx ; need not be preserved
  352. ; pop ecx ; need not be preserved
  353. pop ebx
  354. mov esp, ebp ; esp <- aligned ebp
  355. pop esp ; esp <- value stored at [ebp] in the prologue (address of the saved ebp)
  356. pop ebp ; restore the caller's ebp
  357. ret
  358. ; For some reason, the OS X linker does not honor the request to align the
  359. ; segment unless we do this.
  360. align 32