  ;
  ; jsimdext.inc - common declarations
  ;
  ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  ; Copyright (C) 2010, 2016, 2018-2019, 2024, D. R. Commander.
  ; Copyright (C) 2018, Matthieu Darbois.
  ; Copyright (C) 2018, Matthias Räncker.
  ; Copyright (C) 2023, Aliaksiej Kandracienka.
  ;
  ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
  ;
  ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  ;
  ; This software is provided 'as-is', without any express or implied
  ; warranty. In no event will the authors be held liable for any damages
  ; arising from the use of this software.
  ;
  ; Permission is granted to anyone to use this software for any purpose,
  ; including commercial applications, and to alter it and redistribute it
  ; freely, subject to the following restrictions:
  ;
  ; 1. The origin of this software must not be misrepresented; you must not
  ;    claim that you wrote the original software. If you use this software
  ;    in a product, an acknowledgment in the product documentation would be
  ;    appreciated but is not required.
  ; 2. Altered source versions must be plainly marked as such, and must not be
  ;    misrepresented as being the original software.
  ; 3. This notice may not be removed or altered from any source distribution.

  ; ==========================================================================
  ; System-dependent configurations
  ; Exactly one branch below is selected by a -D<FORMAT> symbol given on the
  ; assembler command line.  Every branch defines SEG_TEXT and SEG_CONST (the
  ; section names/attributes for code and constants); some branches also
  ; define EXTN (external symbol decoration), GOT_SYMBOL and PIC.

  %ifdef WIN32                            ; ---- nasm -fwin32 -DWIN32 ... ----
    ; 32-bit Windows object files:
    ;   * Microsoft Visual C++
    ;   * MinGW (Minimalist GNU for Windows)
    ;   * CygWin
    ;   * LCC-Win32
    ; EXTN is not defined in this branch.
    %ifdef __YASM_VER__
      ; Under YASM only the alignment attribute is given.
      %define SEG_TEXT .text align=32
      %define SEG_CONST .rdata align=32
    %else
      %define SEG_TEXT .text align=32 public use32 class=CODE
      %define SEG_CONST .rdata align=32 public use32 class=CONST
    %endif

  %elifdef WIN64                          ; ---- nasm -fwin64 -DWIN64 ... ----
    ; 64-bit Windows object files:
    ;   * Microsoft Visual C++
    %ifdef __YASM_VER__
      %define SEG_TEXT .text align=32
      %define SEG_CONST .rdata align=32
    %else
      %define SEG_TEXT .text align=32 public use64 class=CODE
      %define SEG_CONST .rdata align=32 public use64 class=CONST
    %endif
    %define EXTN(name) name               ; foo() -> foo

  %elifdef OBJ32                          ; ---- nasm -fobj -DOBJ32 ... ----
    ; OMF object files:
    ;   * Borland C++ (Win32)
    %define SEG_TEXT _text align=32 public use32 class=CODE
    %define SEG_CONST _data align=32 public use32 class=DATA

  %elifdef ELF                            ; ---- nasm -felf[64] -DELF ... ----
    ; ELF object files:
    ;   * Linux
    ;   * *BSD family Unix using elf format
    ;   * Unix System V, including Solaris x86, UnixWare and SCO Unix

    ; Mark the stack as non-executable.
    section .note.GNU-stack noalloc noexec nowrite progbits

    ; With CET enabled on x86-64, emit a .note.gnu.property note.  The second
    ; row is property 0xc0000002 with a 4-byte value of 3; 0x00554e47 in the
    ; first row is the byte string "GNU\0".
    %ifdef __CET__
      %ifdef __x86_64__
        section .note.gnu.property note alloc noexec align=8
        dd 0x00000004, 0x00000010, 0x00000005, 0x00554e47
        dd 0xc0000002, 0x00000004, 0x00000003, 0x00000000
      %endif
    %endif

    %ifdef __x86_64__
      %define SEG_TEXT .text progbits align=32
      %define SEG_CONST .rodata progbits align=32
    %else
      %define SEG_TEXT .text progbits alloc exec nowrite align=32
      %define SEG_CONST .rodata progbits alloc noexec nowrite align=32
    %endif

    ; Position-independent code is opt-in here: append -DPIC to the
    ; command line.
    %define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
    %define EXTN(name) name               ; foo() -> foo

  %elifdef AOUT                           ; ---- nasm -faoutb/aout -DAOUT ... ----
    ; a.out object files:
    ;   * Older Linux using a.out format (nasm -f aout -DAOUT ...)
    ;   * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
    %define SEG_TEXT .text
    %define SEG_CONST .data

    ; Position-independent code is opt-in here: append -DPIC to the
    ; command line.
    %define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC

  %elifdef MACHO                          ; ---- nasm -fmacho -DMACHO ... ----
    ; Mach-O object files:
    ;   * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X
    ;
    ; SEG_TEXT carries no align=32 attribute: nasm doesn't accept align=32
    ; here. why?
    %define SEG_TEXT .text
    %define SEG_CONST .rodata align=32

    ; The generation of position-independent code (PIC) is the default on
    ; Darwin, so PIC is forced on.
    %define PIC
    %define GOT_SYMBOL _MACHO_PIC_        ; Mach-O style code-relative addressing

  %else                                   ; ---- any other output format ----
    %define SEG_TEXT .text
    %define SEG_CONST .data

  %endif                                  ; ---- end of format selection ----
  ; ==========================================================================
  ; --------------------------------------------------------------------------
  ; Common types
  ;
  ; Single-line macro bodies are expanded where they are used, so the
  ; SIZEOF_* and *_BIT names below may be referenced from definitions that
  ; appear in any order.

  ; -- primitive sizes in bytes --
  %define SIZEOF_BYTE 1                   ; sizeof(byte)
  %define SIZEOF_WORD 2                   ; sizeof(word)
  %define SIZEOF_DWORD 4                  ; sizeof(dword)
  %define SIZEOF_QWORD 8                  ; sizeof(qword)
  %define SIZEOF_OWORD 16                 ; sizeof(oword)
  %define SIZEOF_YWORD 32                 ; sizeof(yword)

  ; -- primitive sizes in bits --
  %define BYTE_BIT 8                      ; CHAR_BIT in C
  %define WORD_BIT 16                     ; sizeof(word)*BYTE_BIT
  %define DWORD_BIT 32                    ; sizeof(dword)*BYTE_BIT
  %define QWORD_BIT 64                    ; sizeof(qword)*BYTE_BIT
  %define OWORD_BIT 128                   ; sizeof(oword)*BYTE_BIT
  %define YWORD_BIT 256                   ; sizeof(yword)*BYTE_BIT

  ; -- fixed-width types --
  %define INT dword                       ; signed integer type
  %define SIZEOF_INT SIZEOF_DWORD         ; sizeof(INT)
  %define INT_BIT DWORD_BIT               ; sizeof(INT)*BYTE_BIT

  %define FP32 dword                      ; IEEE754 single
  %define SIZEOF_FP32 SIZEOF_DWORD        ; sizeof(FP32)
  %define FP32_BIT DWORD_BIT              ; sizeof(FP32)*BYTE_BIT

  %define MMWORD qword                    ; int64 (MMX register)
  %define SIZEOF_MMWORD SIZEOF_QWORD      ; sizeof(MMWORD)
  %define MMWORD_BIT QWORD_BIT            ; sizeof(MMWORD)*BYTE_BIT

  ; NASM is buggy and doesn't properly handle operand sizes for SSE
  ; instructions, so for now XMMWORD and YMMWORD expand to nothing; only
  ; their SIZEOF_ and _BIT companions carry a value.
  %define XMMWORD                         ; int128 (SSE register)
  %define SIZEOF_XMMWORD SIZEOF_OWORD     ; sizeof(XMMWORD)
  %define XMMWORD_BIT OWORD_BIT           ; sizeof(XMMWORD)*BYTE_BIT

  %define YMMWORD                         ; int256 (AVX register)
  %define SIZEOF_YMMWORD SIZEOF_YWORD     ; sizeof(YMMWORD)
  %define YMMWORD_BIT YWORD_BIT           ; sizeof(YMMWORD)*BYTE_BIT

  ; Similar empty placeholders for loading a dword or MMWORD into an xmm#
  ; register.
  %define XMM_DWORD
  %define XMM_MMWORD

  ; -- pointer type and pointer-width register aliases --
  ;
  ; 64-bit pointers are chosen only when __x86_64__ is defined AND the output
  ; format is not elfx32.  "raxp" doubles as the marker that this choice was
  ; made; it is tested by the fallback below.
  %ifdef __x86_64__
    %ifnidn __OUTPUT_FORMAT__, elfx32
      %define POINTER qword               ; general pointer type
      %define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
      %define POINTER_BIT QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
      %define resp resq
      %define dp dq

      %define raxp rax
      %define rbxp rbx
      %define rcxp rcx
      %define rdxp rdx
      %define rsip rsi
      %define rdip rdi
      %define rbpp rbp
      %define rspp rsp
      %define r8p r8
      %define r9p r9
      %define r10p r10
      %define r11p r11
      %define r12p r12
      %define r13p r13
      %define r14p r14
      %define r15p r15
    %endif
  %endif

  ; Fallback: 32-bit pointers.  Taken for the x86_64 ILP32 ABI (x32) and also
  ; whenever __x86_64__ is not defined at all.
  %ifndef raxp
    %define POINTER dword                 ; general pointer type
    %define SIZEOF_POINTER SIZEOF_DWORD   ; sizeof(POINTER)
    %define POINTER_BIT DWORD_BIT         ; sizeof(POINTER)*BYTE_BIT
    %define resp resd
    %define dp dd

    %define raxp eax
    %define rbxp ebx
    %define rcxp ecx
    %define rdxp edx
    %define rsip esi
    %define rdip edi
    %define rbpp ebp
    %define rspp esp
    %define r8p r8d
    %define r9p r9d
    %define r10p r10d
    %define r11p r11d
    %define r12p r12d
    %define r13p r13d
    %define r14p r14d
    %define r15p r15d
  %endif
  ; --------------------------------------------------------------------------
  ; External symbol name
  ;
  ; EXTN(name) yields the name under which a C-visible symbol is emitted.
  ; If no definition of EXTN exists yet, default to prefixing an underscore.
  %ifndef EXTN
    %define EXTN(name) _ %+ name          ; foo() -> _foo
  %endif
  ; --------------------------------------------------------------------------
  ; Hidden symbols
  ;
  ; GLOBAL_FUNCTION(name) and GLOBAL_DATA(name) export EXTN(name) with the
  ; "global" directive.  Where a visibility qualifier is used below, it is
  ; appended to that directive; otherwise a plain "global" is emitted.

  %ifdef ELF                              ; ---- nasm -felf[64] -DELF ... ----
    %define GLOBAL_FUNCTION(name) global EXTN(name):function hidden
    %define GLOBAL_DATA(name) global EXTN(name):data hidden

  %elifdef MACHO                          ; ---- nasm -fmacho -DMACHO ... ----
    %ifdef __YASM_VER__
      %define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
      %define GLOBAL_DATA(name) global EXTN(name):private_extern
    %else
      ; __NASM_VERSION_ID__ is tested only when not assembling with YASM.
      ; 0x020E0000 corresponds to NASM 2.14.
      %if __NASM_VERSION_ID__ >= 0x020E0000
        %define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
        %define GLOBAL_DATA(name) global EXTN(name):private_extern
      %endif
    %endif
  %endif

  ; Anything left undefined above falls back to an unqualified export.
  %ifndef GLOBAL_FUNCTION
    %define GLOBAL_FUNCTION(name) global EXTN(name)
  %endif
  %ifndef GLOBAL_DATA
    %define GLOBAL_DATA(name) global EXTN(name)
  %endif
  ; --------------------------------------------------------------------------
  ; Macros for position-independent code (PIC) support
  ;
  ;   GOTOFF(got, sym)  address expression for sym, given base register got
  ;   GET_GOT reg       load the PIC base into reg
  ;   PUSHPIC reg       push reg          (expands to nothing without PIC)
  ;   POPPIC  reg       pop reg           (expands to nothing without PIC)
  ;   MOVPIC  dst, src  mov dst, src      (expands to nothing without PIC)
  ;
  ; Without PIC, GOTOFF reduces to (sym) and GET_GOT expands to nothing.

  ; PIC is only honoured when a GOT_SYMBOL has been defined.
  %ifndef GOT_SYMBOL
    %undef PIC
  %endif

  %ifdef PIC                              ; ================ PIC =============

    %ifidn GOT_SYMBOL, _MACHO_PIC_        ; -------- Mach-O flavour ----------
      ; At present, nasm doesn't seem to support PIC generation for Mach-O.
      ; The PIC support code below is a little tricky: the base is not a GOT
      ; but the label const_base, placed in the constant section here, and
      ; symbols are addressed relative to it.
      SECTION SEG_CONST
  const_base:

      %define GOTOFF(got, sym) (got) + (sym) - const_base

      %imacro GET_GOT 1
            ; NOTE: this macro destroys the ecx register.
            call    %%geteip                ; ecx = address of the next "add"
            add     ecx, byte (%%ref - $)   ; ecx = address of %%ref
            jmp     short %%adjust
  %%geteip:
            mov     ecx, POINTER [esp]      ; fetch the return address
            ret
  %%adjust:
            push    ebp
            xor     ebp, ebp                ; ebp = 0
        %ifidni %1, ebx                     ; (%1 == ebx)
            ; The two data bytes plus the following "jmp near" (E9 + 32-bit
            ; displacement) are laid out so that together they form
            ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)]  ; 8D,9C,E9,(offset32)
            db      0x8D, 0x9C              ; 8D,9C
            jmp     near const_base         ; E9,(const_base-%%ref)
  %%ref:
        %else                               ; (%1 != ebx)
            ; Same construction targeting ecx:
            ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)]  ; 8D,8C,E9,(offset32)
            db      0x8D, 0x8C              ; 8D,8C
            jmp     near const_base         ; E9,(const_base-%%ref)
  %%ref:
            mov     %1, ecx                 ; hand the result to the caller's register
        %endif                              ; (%1 == ebx)
            pop     ebp
      %endmacro

    %else                                 ; -------- GOT-based flavour --------
      %define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff

      %imacro GET_GOT 1
            extern  GOT_SYMBOL
            call    %%geteip                ; %1 = address of the next "add"
            add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
            jmp     short %%done
  %%geteip:
            mov     %1, POINTER [esp]       ; fetch the return address
            ret
  %%done:
      %endmacro

    %endif                                ; GOT_SYMBOL == _MACHO_PIC_

    ; With PIC enabled these three are real instructions.
    %imacro PUSHPIC 1.nolist
            push    %1
    %endmacro

    %imacro POPPIC 1.nolist
            pop     %1
    %endmacro

    %imacro MOVPIC 2.nolist
            mov     %1, %2
    %endmacro

  %else                                   ; ================ !PIC ============

    %define GOTOFF(got, sym) (sym)

    ; All four macros expand to nothing.
    %imacro GET_GOT 1.nolist
    %endmacro

    %imacro PUSHPIC 1.nolist
    %endmacro

    %imacro POPPIC 1.nolist
    %endmacro

    %imacro MOVPIC 2.nolist
    %endmacro

  %endif                                  ; ================ PIC =============
  ; --------------------------------------------------------------------------
  ; ALIGNX n [, max]
  ;
  ; Align the next instruction on a {2,4,8,16,..}-byte boundary, like
  ; ".balign n,,max" in GNU as.  max defaults to 0xFFFF.
  ;
  ; MSKLE(x, y)  evaluates to a value whose low-order bits are all ones when
  ;              x <= y and to 0 when x > y (both taken modulo 0x10000); it
  ;              is ANDed into the repeat counts as an on/off switch.
  ; FILLB(b, n)  bytes needed to get from position b to the next multiple of
  ;              n, measured from the section start ($$); n must be a power
  ;              of two.
  ;
  %define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
  %define FILLB(b, n) (($$-(b)) & ((n)-1))

  ; %%bs records where padding starts; MSKLE(FILLB(%%bs, %1), %2) switches
  ; every "times" below off when the padding needed at that point exceeds
  ; max.  Each later "times" re-evaluates FILLB at the current position ($),
  ; so it only sees what the earlier lines left over.  "/" binds tighter
  ; than "&" in these counts.
  %imacro ALIGNX 1-2.nolist 0xFFFF
  %%bs: \
          times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
                  db 0x90                                     ; nop
          ; The line above fills the whole gap with nops when it is 16 bytes
          ; or more; shorter gaps are covered by the steps below.
          times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
                  db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
          times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
                  db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
          times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
                  db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00       ; lea ebp,[ebp+0x00000000]
          times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
                  db 0x8D, 0x6C, 0x25, 0x00                   ; lea ebp,[ebp+0x00]
          times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
                  db 0x8D, 0x6D, 0x00                         ; lea ebp,[ebp+0x00]
          times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
                  db 0x8B, 0xED                               ; mov ebp,ebp
          times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
                  db 0x90                                     ; nop
  %endmacro
  ; ALIGNZ n
  ;
  ; Align the next data on a {2,4,8,16,..}-byte boundary, filling the gap
  ; with zero bytes.
  ;
  %imacro ALIGNZ 1.nolist
          align   %1, db 0                ; filling zeros
  %endmacro
  ; --------------------------------------------------------------------------
  ; x86-64 argument collection and register save/restore
  ;
  ;   COLLECT_ARGS n    copy the first n (1..6) integer arguments into
  ;                     r10, r11, r12, r13, r14, r15, saving registers first
  ;   UNCOLLECT_ARGS n  undo COLLECT_ARGS n; n must be the same value
  ;   PUSH_XMM n        Win64 only: save xmm8..xmm(7+n) on the stack, n <= 4
  ;   POP_XMM n         Win64 only: undo PUSH_XMM n; n must be the same value
  ;
  ; The same source-level interface is provided for both ABIs so callers can
  ; use r10-r15 for their arguments regardless of platform.
  ;
  ; The xmm saves use movaps, which faults on an address that is not 16-byte
  ; aligned, so rsp must be 16-byte aligned wherever those stores and loads
  ; execute.  NOTE(review): keeping rsp aligned is up to the invoking
  ; function - confirm against callers.
  %ifdef __x86_64__

    %ifdef WIN64                          ; ======== Microsoft x64 ABI ========

      ; Arguments arrive in rcx, rdx, r8, r9; the 5th and 6th are read from
      ; [rbp+48] and [rbp+56].
      ; NOTE(review): those offsets assume the caller ran
      ; "push rbp / mov rbp, rsp" on entry before invoking this macro -
      ; confirm against callers.
      ;
      ; Always saved: xmm6, xmm7, rsi, rdi.  r12..r15 are saved only when the
      ; corresponding argument is collected.  r10 and r11 are overwritten
      ; without being saved.
      %imacro COLLECT_ARGS 1
            sub     rsp, SIZEOF_XMMWORD
            movaps  XMMWORD [rsp], xmm6
            sub     rsp, SIZEOF_XMMWORD
            movaps  XMMWORD [rsp], xmm7
            mov     r10, rcx                ; arg 1
        %if %1 > 1
            mov     r11, rdx                ; arg 2
        %endif
        %if %1 > 2
            push    r12
            mov     r12, r8                 ; arg 3
        %endif
        %if %1 > 3
            push    r13
            mov     r13, r9                 ; arg 4
        %endif
        %if %1 > 4
            push    r14
            mov     r14, [rbp+48]           ; arg 5 (stack)
        %endif
        %if %1 > 5
            push    r15
            mov     r15, [rbp+56]           ; arg 6 (stack)
        %endif
            push    rsi
            push    rdi
      %endmacro

      ; Restores in exactly the reverse order of COLLECT_ARGS.
      %imacro UNCOLLECT_ARGS 1
            pop     rdi
            pop     rsi
        %if %1 > 5
            pop     r15
        %endif
        %if %1 > 4
            pop     r14
        %endif
        %if %1 > 3
            pop     r13
        %endif
        %if %1 > 2
            pop     r12
        %endif
            movaps  xmm7, XMMWORD [rsp]
            add     rsp, SIZEOF_XMMWORD
            movaps  xmm6, XMMWORD [rsp]
            add     rsp, SIZEOF_XMMWORD
      %endmacro

      ; Reserves n*16 bytes and stores xmm8 upward into them.  A count of 0
      ; emits nothing: storing xmm8 without reserving a slot would overwrite
      ; whatever is currently on top of the stack.
      %imacro PUSH_XMM 1
        %if %1 > 0
            sub     rsp, %1 * SIZEOF_XMMWORD
            movaps  XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
        %endif
        %if %1 > 1
            movaps  XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
        %endif
        %if %1 > 2
            movaps  XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
        %endif
        %if %1 > 3
            movaps  XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
        %endif
      %endmacro

      ; Reloads what PUSH_XMM n stored and releases the space.  A count of 0
      ; emits nothing, mirroring PUSH_XMM.
      %imacro POP_XMM 1
        %if %1 > 0
            movaps  xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
        %endif
        %if %1 > 1
            movaps  xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
        %endif
        %if %1 > 2
            movaps  xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
        %endif
        %if %1 > 3
            movaps  xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
        %endif
        %if %1 > 0
            add     rsp, %1 * SIZEOF_XMMWORD
        %endif
      %endmacro

    %else                                 ; ======== all other x86-64 ========

      ; Arguments arrive in rdi, rsi, rdx, rcx, r8, r9.  Each destination
      ; register is pushed before it is overwritten.
      %imacro COLLECT_ARGS 1
            push    r10
            mov     r10, rdi                ; arg 1
        %if %1 > 1
            push    r11
            mov     r11, rsi                ; arg 2
        %endif
        %if %1 > 2
            push    r12
            mov     r12, rdx                ; arg 3
        %endif
        %if %1 > 3
            push    r13
            mov     r13, rcx                ; arg 4
        %endif
        %if %1 > 4
            push    r14
            mov     r14, r8                 ; arg 5
        %endif
        %if %1 > 5
            push    r15
            mov     r15, r9                 ; arg 6
        %endif
      %endmacro

      ; Restores in exactly the reverse order of COLLECT_ARGS.
      %imacro UNCOLLECT_ARGS 1
        %if %1 > 5
            pop     r15
        %endif
        %if %1 > 4
            pop     r14
        %endif
        %if %1 > 3
            pop     r13
        %endif
        %if %1 > 2
            pop     r12
        %endif
        %if %1 > 1
            pop     r11
        %endif
            pop     r10
      %endmacro

      ; Nothing is saved here; both macros expand to nothing.
      %imacro PUSH_XMM 1
      %endmacro

      %imacro POP_XMM 1
      %endmacro

    %endif                                ; WIN64

  %endif                                  ; __x86_64__
  ; ENDBR64
  ;
  ; When __CET__ is defined, emits the four bytes F3 0F 1E FA (the encoding
  ; of the endbr64 instruction) as raw data; otherwise expands to nothing.
  %ifdef __CET__
    %imacro ENDBR64 0
          db      0xF3, 0x0F, 0x1E, 0xFA  ; endbr64
    %endmacro
  %else
    %imacro ENDBR64 0
    %endmacro
  %endif
  ; --------------------------------------------------------------------------
  ; Defines picked up from the C headers
  ;
  ; Pulled in last, after all of the declarations above.
  %include "jsimdcfg.inc"

  ; --------------------------------------------------------------------------