/* loongson-mmintrin.h */
  1. /*
  2. * Loongson MMI optimizations for libjpeg-turbo
  3. *
  4. * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  5. * All Rights Reserved.
  6. * Copyright (C) 2019, D. R. Commander. All Rights Reserved.
  7. *
  8. * This software is provided 'as-is', without any express or implied
  9. * warranty. In no event will the authors be held liable for any damages
  10. * arising from the use of this software.
  11. *
  12. * Permission is granted to anyone to use this software for any purpose,
  13. * including commercial applications, and to alter it and redistribute it
  14. * freely, subject to the following restrictions:
  15. *
  16. * 1. The origin of this software must not be misrepresented; you must not
  17. * claim that you wrote the original software. If you use this software
  18. * in a product, an acknowledgment in the product documentation would be
  19. * appreciated but is not required.
  20. * 2. Altered source versions must be plainly marked as such, and must not be
  21. * misrepresented as being the original software.
  22. * 3. This notice may not be removed or altered from any source distribution.
  23. */
  24. #ifndef __LOONGSON_MMINTRIN_H__
  25. #define __LOONGSON_MMINTRIN_H__
  26. #include <stdint.h>
/* Attribute set applied to every intrinsic in this header.  Combined with
   `extern __inline`, __gnu_inline__ + __always_inline__ ask GCC to always
   inline the body and never emit an out-of-line copy; __artificial__ affects
   only how the inlined code is described in debug info. */
#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* Vectors are stored in 64-bit floating-point registers, so the vector type is
   a double.  It is used purely as a 64-bit container passed to the "f"
   (FP register) asm constraint. */
typedef double __m64;

/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
typedef float __m32;
  34. /********** Set Operations **********/
  35. extern __inline __m64 FUNCTION_ATTRIBS
  36. _mm_setzero_si64(void)
  37. {
  38. return 0.0;
  39. }
  40. extern __inline __m64 FUNCTION_ATTRIBS
  41. _mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
  42. uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
  43. {
  44. __m64 ret;
  45. uint32_t lo = ((uint32_t)__b6 << 24) |
  46. ((uint32_t)__b4 << 16) |
  47. ((uint32_t)__b2 << 8) |
  48. (uint32_t)__b0;
  49. uint32_t hi = ((uint32_t)__b7 << 24) |
  50. ((uint32_t)__b5 << 16) |
  51. ((uint32_t)__b3 << 8) |
  52. (uint32_t)__b1;
  53. asm("mtc1 %1, %0\n\t"
  54. "mtc1 %2, $f0\n\t"
  55. "punpcklbh %0, %0, $f0\n\t"
  56. : "=f" (ret)
  57. : "r" (lo), "r" (hi)
  58. : "$f0"
  59. );
  60. return ret;
  61. }
  62. extern __inline __m64 FUNCTION_ATTRIBS
  63. _mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
  64. {
  65. __m64 ret;
  66. uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
  67. uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
  68. asm("mtc1 %1, %0\n\t"
  69. "mtc1 %2, $f0\n\t"
  70. "punpcklhw %0, %0, $f0\n\t"
  71. : "=f" (ret)
  72. : "r" (lo), "r" (hi)
  73. : "$f0"
  74. );
  75. return ret;
  76. }
/* Packs four 2-bit selectors into one value: fp3 lands in bits 7:6, fp2 in
   bits 5:4, fp1 in bits 3:2 and fp0 in bits 1:0.  The arguments are not
   masked, so each is expected to be in the range 0..3. */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
  79. extern __inline __m64 FUNCTION_ATTRIBS
  80. _mm_set_pi32(uint32_t __i1, uint32_t __i0)
  81. {
  82. if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
  83. uint64_t val = ((uint64_t)__i1 << 32) |
  84. ((uint64_t)__i0 << 0);
  85. return *(__m64 *)&val;
  86. } else if (__i1 == __i0) {
  87. uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
  88. __m64 ret;
  89. asm("pshufh %0, %1, %2\n\t"
  90. : "=f" (ret)
  91. : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
  92. );
  93. return ret;
  94. } else {
  95. uint64_t val = ((uint64_t)__i1 << 32) |
  96. ((uint64_t)__i0 << 0);
  97. return *(__m64 *)&val;
  98. }
  99. }
  100. extern __inline __m64 FUNCTION_ATTRIBS
  101. _mm_set1_pi8(uint8_t __b0)
  102. {
  103. __m64 ret;
  104. asm("sll $8, %1, 8\n\t"
  105. "or %1, %1, $8\n\t"
  106. "mtc1 %1, %0\n\t"
  107. "mtc1 $0, $f0\n\t"
  108. "pshufh %0, %0, $f0\n\t"
  109. : "=f" (ret)
  110. : "r" (__b0)
  111. : "$8", "$f0"
  112. );
  113. return ret;
  114. }
  115. extern __inline __m64 FUNCTION_ATTRIBS
  116. _mm_set1_pi16(uint16_t __h0)
  117. {
  118. __m64 ret;
  119. asm("mtc1 %1, %0\n\t"
  120. "mtc1 $0, $f0\n\t"
  121. "pshufh %0, %0, $f0\n\t"
  122. : "=f" (ret)
  123. : "r" (__h0)
  124. : "$8", "$f0"
  125. );
  126. return ret;
  127. }
  128. extern __inline __m64 FUNCTION_ATTRIBS
  129. _mm_set1_pi32(unsigned __i0)
  130. {
  131. return _mm_set_pi32(__i0, __i0);
  132. }
  133. extern __inline __m64 FUNCTION_ATTRIBS
  134. _mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
  135. uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
  136. {
  137. return _mm_set_pi8(__h7, __h6, __h5, __h4,
  138. __h3, __h2, __h1, __h0);
  139. }
  140. extern __inline __m64 FUNCTION_ATTRIBS
  141. _mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
  142. {
  143. return _mm_set_pi16(__w3, __w2, __w1, __w0);
  144. }
  145. extern __inline __m64 FUNCTION_ATTRIBS
  146. _mm_setr_pi32(uint32_t __i0, uint32_t __i1)
  147. {
  148. return _mm_set_pi32(__i1, __i0);
  149. }
  150. /********** Arithmetic Operations **********/
  151. extern __inline __m64 FUNCTION_ATTRIBS
  152. _mm_add_pi8(__m64 __m1, __m64 __m2)
  153. {
  154. __m64 ret;
  155. asm("paddb %0, %1, %2\n\t"
  156. : "=f" (ret)
  157. : "f" (__m1), "f" (__m2)
  158. );
  159. return ret;
  160. }
  161. extern __inline __m64 FUNCTION_ATTRIBS
  162. _mm_add_pi16(__m64 __m1, __m64 __m2)
  163. {
  164. __m64 ret;
  165. asm("paddh %0, %1, %2\n\t"
  166. : "=f" (ret)
  167. : "f" (__m1), "f" (__m2)
  168. );
  169. return ret;
  170. }
  171. extern __inline __m64 FUNCTION_ATTRIBS
  172. _mm_add_pi32(__m64 __m1, __m64 __m2)
  173. {
  174. __m64 ret;
  175. asm("paddw %0, %1, %2\n\t"
  176. : "=f" (ret)
  177. : "f" (__m1), "f" (__m2)
  178. );
  179. return ret;
  180. }
  181. extern __inline __m64 FUNCTION_ATTRIBS
  182. _mm_add_si64(__m64 __m1, __m64 __m2)
  183. {
  184. __m64 ret;
  185. asm("paddd %0, %1, %2\n\t"
  186. : "=f" (ret)
  187. : "f" (__m1), "f" (__m2)
  188. );
  189. return ret;
  190. }
  191. extern __inline __m64 FUNCTION_ATTRIBS
  192. _mm_adds_pi8(__m64 __m1, __m64 __m2)
  193. {
  194. __m64 ret;
  195. asm("paddsb %0, %1, %2\n\t"
  196. : "=f" (ret)
  197. : "f" (__m1), "f" (__m2)
  198. );
  199. return ret;
  200. }
  201. extern __inline __m64 FUNCTION_ATTRIBS
  202. _mm_adds_pi16(__m64 __m1, __m64 __m2)
  203. {
  204. __m64 ret;
  205. asm("paddsh %0, %1, %2\n\t"
  206. : "=f" (ret)
  207. : "f" (__m1), "f" (__m2)
  208. );
  209. return ret;
  210. }
  211. extern __inline __m64 FUNCTION_ATTRIBS
  212. _mm_adds_pu8(__m64 __m1, __m64 __m2)
  213. {
  214. __m64 ret;
  215. asm("paddusb %0, %1, %2\n\t"
  216. : "=f" (ret)
  217. : "f" (__m1), "f" (__m2)
  218. );
  219. return ret;
  220. }
  221. extern __inline __m64 FUNCTION_ATTRIBS
  222. _mm_adds_pu16(__m64 __m1, __m64 __m2)
  223. {
  224. __m64 ret;
  225. asm("paddush %0, %1, %2\n\t"
  226. : "=f" (ret)
  227. : "f" (__m1), "f" (__m2)
  228. );
  229. return ret;
  230. }
  231. extern __inline __m64 FUNCTION_ATTRIBS
  232. _mm_avg_pu8(__m64 __m1, __m64 __m2)
  233. {
  234. __m64 ret;
  235. asm("pavgb %0, %1, %2\n\t"
  236. : "=f" (ret)
  237. : "f" (__m1), "f" (__m2)
  238. );
  239. return ret;
  240. }
  241. extern __inline __m64 FUNCTION_ATTRIBS
  242. _mm_avg_pu16(__m64 __m1, __m64 __m2)
  243. {
  244. __m64 ret;
  245. asm("pavgh %0, %1, %2\n\t"
  246. : "=f" (ret)
  247. : "f" (__m1), "f" (__m2)
  248. );
  249. return ret;
  250. }
  251. extern __inline __m64 FUNCTION_ATTRIBS
  252. _mm_madd_pi16(__m64 __m1, __m64 __m2)
  253. {
  254. __m64 ret;
  255. asm("pmaddhw %0, %1, %2\n\t"
  256. : "=f" (ret)
  257. : "f" (__m1), "f" (__m2)
  258. );
  259. return ret;
  260. }
  261. extern __inline __m64 FUNCTION_ATTRIBS
  262. _mm_max_pi16(__m64 __m1, __m64 __m2)
  263. {
  264. __m64 ret;
  265. asm("pmaxsh %0, %1, %2\n\t"
  266. : "=f" (ret)
  267. : "f" (__m1), "f" (__m2)
  268. );
  269. return ret;
  270. }
  271. extern __inline __m64 FUNCTION_ATTRIBS
  272. _mm_max_pu8(__m64 __m1, __m64 __m2)
  273. {
  274. __m64 ret;
  275. asm("pmaxub %0, %1, %2\n\t"
  276. : "=f" (ret)
  277. : "f" (__m1), "f" (__m2)
  278. );
  279. return ret;
  280. }
  281. extern __inline __m64 FUNCTION_ATTRIBS
  282. _mm_min_pi16(__m64 __m1, __m64 __m2)
  283. {
  284. __m64 ret;
  285. asm("pminsh %0, %1, %2\n\t"
  286. : "=f" (ret)
  287. : "f" (__m1), "f" (__m2)
  288. );
  289. return ret;
  290. }
  291. extern __inline __m64 FUNCTION_ATTRIBS
  292. _mm_min_pu8(__m64 __m1, __m64 __m2)
  293. {
  294. __m64 ret;
  295. asm("pminub %0, %1, %2\n\t"
  296. : "=f" (ret)
  297. : "f" (__m1), "f" (__m2)
  298. );
  299. return ret;
  300. }
  301. extern __inline int FUNCTION_ATTRIBS
  302. _mm_movemask_pi8(__m64 __m1)
  303. {
  304. int ret;
  305. asm("pmovmskb %0, %1\n\t"
  306. : "=r" (ret)
  307. : "y" (__m1)
  308. );
  309. return ret;
  310. }
  311. extern __inline __m64 FUNCTION_ATTRIBS
  312. _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
  313. {
  314. __m64 ret;
  315. asm("pmulhh %0, %1, %2\n\t"
  316. : "=f" (ret)
  317. : "f" (__m1), "f" (__m2)
  318. );
  319. return ret;
  320. }
  321. extern __inline __m64 FUNCTION_ATTRIBS
  322. _mm_mulhi_pu16(__m64 __m1, __m64 __m2)
  323. {
  324. __m64 ret;
  325. asm("pmulhuh %0, %1, %2\n\t"
  326. : "=f" (ret)
  327. : "f" (__m1), "f" (__m2)
  328. );
  329. return ret;
  330. }
  331. extern __inline __m64 FUNCTION_ATTRIBS
  332. _mm_mullo_pi16(__m64 __m1, __m64 __m2)
  333. {
  334. __m64 ret;
  335. asm("pmullh %0, %1, %2\n\t"
  336. : "=f" (ret)
  337. : "f" (__m1), "f" (__m2)
  338. );
  339. return ret;
  340. }
  341. extern __inline __m64 FUNCTION_ATTRIBS
  342. _mm_mul_pu32(__m64 __m1, __m64 __m2)
  343. {
  344. __m64 ret;
  345. asm("pmuluw %0, %1, %2\n\t"
  346. : "=f" (ret)
  347. : "f" (__m1), "f" (__m2)
  348. );
  349. return ret;
  350. }
  351. extern __inline __m64 FUNCTION_ATTRIBS
  352. _mm_sad_pu8(__m64 __m1, __m64 __m2)
  353. {
  354. __m64 ret;
  355. asm("psadbh %0, %1, %2\n\t"
  356. : "=f" (ret)
  357. : "f" (__m1), "f" (__m2)
  358. );
  359. return ret;
  360. }
  361. extern __inline __m64 FUNCTION_ATTRIBS
  362. _mm_asub_pu8(__m64 __m1, __m64 __m2)
  363. {
  364. __m64 ret;
  365. asm("pasubub %0, %1, %2\n\t"
  366. : "=f" (ret)
  367. : "f" (__m1), "f" (__m2)
  368. );
  369. return ret;
  370. }
  371. extern __inline __m64 FUNCTION_ATTRIBS
  372. _mm_biadd_pu8(__m64 __m1, __m64 __m2)
  373. {
  374. __m64 ret;
  375. asm("biadd %0, %1, %2\n\t"
  376. : "=f" (ret)
  377. : "f" (__m1), "f" (__m2)
  378. );
  379. return ret;
  380. }
  381. extern __inline __m64 FUNCTION_ATTRIBS
  382. _mm_sub_pi8(__m64 __m1, __m64 __m2)
  383. {
  384. __m64 ret;
  385. asm("psubb %0, %1, %2\n\t"
  386. : "=f" (ret)
  387. : "f" (__m1), "f" (__m2)
  388. );
  389. return ret;
  390. }
  391. extern __inline __m64 FUNCTION_ATTRIBS
  392. _mm_sub_pi16(__m64 __m1, __m64 __m2)
  393. {
  394. __m64 ret;
  395. asm("psubh %0, %1, %2\n\t"
  396. : "=f" (ret)
  397. : "f" (__m1), "f" (__m2)
  398. );
  399. return ret;
  400. }
  401. extern __inline __m64 FUNCTION_ATTRIBS
  402. _mm_sub_pi32(__m64 __m1, __m64 __m2)
  403. {
  404. __m64 ret;
  405. asm("psubw %0, %1, %2\n\t"
  406. : "=f" (ret)
  407. : "f" (__m1), "f" (__m2)
  408. );
  409. return ret;
  410. }
  411. extern __inline __m64 FUNCTION_ATTRIBS
  412. _mm_sub_si64(__m64 __m1, __m64 __m2)
  413. {
  414. __m64 ret;
  415. asm("psubd %0, %1, %2\n\t"
  416. : "=f" (ret)
  417. : "f" (__m1), "f" (__m2)
  418. );
  419. return ret;
  420. }
  421. extern __inline __m64 FUNCTION_ATTRIBS
  422. _mm_subs_pi8(__m64 __m1, __m64 __m2)
  423. {
  424. __m64 ret;
  425. asm("psubsb %0, %1, %2\n\t"
  426. : "=f" (ret)
  427. : "f" (__m1), "f" (__m2)
  428. );
  429. return ret;
  430. }
  431. extern __inline __m64 FUNCTION_ATTRIBS
  432. _mm_subs_pi16(__m64 __m1, __m64 __m2)
  433. {
  434. __m64 ret;
  435. asm("psubsh %0, %1, %2\n\t"
  436. : "=f" (ret)
  437. : "f" (__m1), "f" (__m2)
  438. );
  439. return ret;
  440. }
  441. extern __inline __m64 FUNCTION_ATTRIBS
  442. _mm_subs_pu8(__m64 __m1, __m64 __m2)
  443. {
  444. __m64 ret;
  445. asm("psubusb %0, %1, %2\n\t"
  446. : "=f" (ret)
  447. : "f" (__m1), "f" (__m2)
  448. );
  449. return ret;
  450. }
  451. extern __inline __m64 FUNCTION_ATTRIBS
  452. _mm_subs_pu16(__m64 __m1, __m64 __m2)
  453. {
  454. __m64 ret;
  455. asm("psubush %0, %1, %2\n\t"
  456. : "=f" (ret)
  457. : "f" (__m1), "f" (__m2)
  458. );
  459. return ret;
  460. }
  461. /********** Logical Operations **********/
  462. extern __inline __m64 FUNCTION_ATTRIBS
  463. _mm_and_si64(__m64 __m1, __m64 __m2)
  464. {
  465. __m64 ret;
  466. asm("and %0, %1, %2\n\t"
  467. : "=f" (ret)
  468. : "f" (__m1), "f" (__m2)
  469. );
  470. return ret;
  471. }
  472. extern __inline __m64 FUNCTION_ATTRIBS
  473. _mm_andnot_si64(__m64 __m1, __m64 __m2)
  474. {
  475. __m64 ret;
  476. asm("andn %0, %1, %2\n\t"
  477. : "=f" (ret)
  478. : "f" (__m1), "f" (__m2)
  479. );
  480. return ret;
  481. }
  482. extern __inline __m64 FUNCTION_ATTRIBS
  483. _mm_or_si32(__m32 __m1, __m32 __m2)
  484. {
  485. __m32 ret;
  486. asm("or %0, %1, %2\n\t"
  487. : "=f" (ret)
  488. : "f" (__m1), "f" (__m2)
  489. );
  490. return ret;
  491. }
  492. extern __inline __m64 FUNCTION_ATTRIBS
  493. _mm_or_si64(__m64 __m1, __m64 __m2)
  494. {
  495. __m64 ret;
  496. asm("or %0, %1, %2\n\t"
  497. : "=f" (ret)
  498. : "f" (__m1), "f" (__m2)
  499. );
  500. return ret;
  501. }
  502. extern __inline __m64 FUNCTION_ATTRIBS
  503. _mm_xor_si64(__m64 __m1, __m64 __m2)
  504. {
  505. __m64 ret;
  506. asm("xor %0, %1, %2\n\t"
  507. : "=f" (ret)
  508. : "f" (__m1), "f" (__m2)
  509. );
  510. return ret;
  511. }
  512. /********** Shift Operations **********/
  513. extern __inline __m64 FUNCTION_ATTRIBS
  514. _mm_slli_pi16(__m64 __m, int64_t __count)
  515. {
  516. __m64 ret;
  517. asm("psllh %0, %1, %2\n\t"
  518. : "=f" (ret)
  519. : "f" (__m), "f" (*(__m64 *)&__count)
  520. );
  521. return ret;
  522. }
  523. extern __inline __m64 FUNCTION_ATTRIBS
  524. _mm_slli_pi32(__m64 __m, int64_t __count)
  525. {
  526. __m64 ret;
  527. asm("psllw %0, %1, %2\n\t"
  528. : "=f" (ret)
  529. : "f" (__m), "f" (*(__m64 *)&__count)
  530. );
  531. return ret;
  532. }
  533. extern __inline __m64 FUNCTION_ATTRIBS
  534. _mm_slli_si64(__m64 __m, int64_t __count)
  535. {
  536. __m64 ret;
  537. asm("dsll %0, %1, %2\n\t"
  538. : "=f" (ret)
  539. : "f" (__m), "f" (*(__m64 *)&__count)
  540. );
  541. return ret;
  542. }
  543. extern __inline __m64 FUNCTION_ATTRIBS
  544. _mm_srli_pi16(__m64 __m, int64_t __count)
  545. {
  546. __m64 ret;
  547. asm("psrlh %0, %1, %2\n\t"
  548. : "=f" (ret)
  549. : "f" (__m), "f" (*(__m64 *)&__count)
  550. );
  551. return ret;
  552. }
  553. extern __inline __m64 FUNCTION_ATTRIBS
  554. _mm_srli_pi32(__m64 __m, int64_t __count)
  555. {
  556. __m64 ret;
  557. asm("psrlw %0, %1, %2\n\t"
  558. : "=f" (ret)
  559. : "f" (__m), "f" (*(__m64 *)&__count)
  560. );
  561. return ret;
  562. }
  563. extern __inline __m64 FUNCTION_ATTRIBS
  564. _mm_srli_si64(__m64 __m, int64_t __count)
  565. {
  566. __m64 ret;
  567. asm("dsrl %0, %1, %2\n\t"
  568. : "=f" (ret)
  569. : "f" (__m), "f" (*(__m64 *)&__count)
  570. );
  571. return ret;
  572. }
  573. extern __inline __m64 FUNCTION_ATTRIBS
  574. _mm_srai_pi16(__m64 __m, int64_t __count)
  575. {
  576. __m64 ret;
  577. asm("psrah %0, %1, %2\n\t"
  578. : "=f" (ret)
  579. : "f" (__m), "f" (*(__m64 *)&__count)
  580. );
  581. return ret;
  582. }
  583. extern __inline __m64 FUNCTION_ATTRIBS
  584. _mm_srai_pi32(__m64 __m, int64_t __count)
  585. {
  586. __m64 ret;
  587. asm("psraw %0, %1, %2\n\t"
  588. : "=f" (ret)
  589. : "f" (__m), "f" (*(__m64 *)&__count)
  590. );
  591. return ret;
  592. }
  593. extern __inline __m64 FUNCTION_ATTRIBS
  594. _mm_srai_si64(__m64 __m, int64_t __count)
  595. {
  596. __m64 ret;
  597. asm("dsra %0, %1, %2\n\t"
  598. : "=f" (ret)
  599. : "f" (__m), "f" (*(__m64 *)&__count)
  600. );
  601. return ret;
  602. }
  603. /********** Conversion Intrinsics **********/
  604. extern __inline __m64 FUNCTION_ATTRIBS
  605. to_m64(uint64_t x)
  606. {
  607. return *(__m64 *)&x;
  608. }
  609. extern __inline uint64_t FUNCTION_ATTRIBS
  610. to_uint64(__m64 x)
  611. {
  612. return *(uint64_t *)&x;
  613. }
  614. /********** Comparison Intrinsics **********/
  615. extern __inline __m64 FUNCTION_ATTRIBS
  616. _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
  617. {
  618. __m64 ret;
  619. asm("pcmpeqb %0, %1, %2\n\t"
  620. : "=f" (ret)
  621. : "f" (__m1), "f" (__m2)
  622. );
  623. return ret;
  624. }
  625. extern __inline __m64 FUNCTION_ATTRIBS
  626. _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
  627. {
  628. __m64 ret;
  629. asm("pcmpeqh %0, %1, %2\n\t"
  630. : "=f" (ret)
  631. : "f" (__m1), "f" (__m2)
  632. );
  633. return ret;
  634. }
  635. extern __inline __m64 FUNCTION_ATTRIBS
  636. _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
  637. {
  638. __m64 ret;
  639. asm("pcmpeqw %0, %1, %2\n\t"
  640. : "=f" (ret)
  641. : "f" (__m1), "f" (__m2)
  642. );
  643. return ret;
  644. }
  645. extern __inline __m64 FUNCTION_ATTRIBS
  646. _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
  647. {
  648. __m64 ret;
  649. asm("pcmpgtb %0, %1, %2\n\t"
  650. : "=f" (ret)
  651. : "f" (__m1), "f" (__m2)
  652. );
  653. return ret;
  654. }
  655. extern __inline __m64 FUNCTION_ATTRIBS
  656. _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
  657. {
  658. __m64 ret;
  659. asm("pcmpgth %0, %1, %2\n\t"
  660. : "=f" (ret)
  661. : "f" (__m1), "f" (__m2)
  662. );
  663. return ret;
  664. }
  665. extern __inline __m64 FUNCTION_ATTRIBS
  666. _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
  667. {
  668. __m64 ret;
  669. asm("pcmpgtw %0, %1, %2\n\t"
  670. : "=f" (ret)
  671. : "f" (__m1), "f" (__m2)
  672. );
  673. return ret;
  674. }
  675. extern __inline __m64 FUNCTION_ATTRIBS
  676. _mm_cmplt_pi8(__m64 __m1, __m64 __m2)
  677. {
  678. __m64 ret;
  679. asm("pcmpltb %0, %1, %2\n\t"
  680. : "=f" (ret)
  681. : "f" (__m1), "f" (__m2)
  682. );
  683. return ret;
  684. }
  685. extern __inline __m64 FUNCTION_ATTRIBS
  686. _mm_cmplt_pi16(__m64 __m1, __m64 __m2)
  687. {
  688. __m64 ret;
  689. asm("pcmplth %0, %1, %2\n\t"
  690. : "=f" (ret)
  691. : "f" (__m1), "f" (__m2)
  692. );
  693. return ret;
  694. }
  695. extern __inline __m64 FUNCTION_ATTRIBS
  696. _mm_cmplt_pi32(__m64 __m1, __m64 __m2)
  697. {
  698. __m64 ret;
  699. asm("pcmpltw %0, %1, %2\n\t"
  700. : "=f" (ret)
  701. : "f" (__m1), "f" (__m2)
  702. );
  703. return ret;
  704. }
  705. /********** Miscellaneous Operations **********/
  706. extern __inline __m64 FUNCTION_ATTRIBS
  707. _mm_packs_pi16(__m64 __m1, __m64 __m2)
  708. {
  709. __m64 ret;
  710. asm("packsshb %0, %1, %2\n\t"
  711. : "=f" (ret)
  712. : "f" (__m1), "f" (__m2)
  713. );
  714. return ret;
  715. }
  716. extern __inline __m64 FUNCTION_ATTRIBS
  717. _mm_packs_pi32(__m64 __m1, __m64 __m2)
  718. {
  719. __m64 ret;
  720. asm("packsswh %0, %1, %2\n\t"
  721. : "=f" (ret)
  722. : "f" (__m1), "f" (__m2)
  723. );
  724. return ret;
  725. }
  726. extern __inline __m64 FUNCTION_ATTRIBS
  727. _mm_packs_pi32_f(__m64 __m1, __m64 __m2)
  728. {
  729. __m64 ret;
  730. asm("packsswh %0, %1, %2\n\t"
  731. : "=f" (ret)
  732. : "f" (__m1), "f" (__m2)
  733. );
  734. return ret;
  735. }
  736. extern __inline __m64 FUNCTION_ATTRIBS
  737. _mm_packs_pu16(__m64 __m1, __m64 __m2)
  738. {
  739. __m64 ret;
  740. asm("packushb %0, %1, %2\n\t"
  741. : "=f" (ret)
  742. : "f" (__m1), "f" (__m2)
  743. );
  744. return ret;
  745. }
  746. extern __inline __m64 FUNCTION_ATTRIBS
  747. _mm_extract_pi16(__m64 __m, int64_t __pos)
  748. {
  749. __m64 ret;
  750. asm("pextrh %0, %1, %2\n\t"
  751. : "=f" (ret)
  752. : "f" (__m), "f" (*(__m64 *)&__pos)
  753. );
  754. return ret;
  755. }
  756. extern __inline __m64 FUNCTION_ATTRIBS
  757. _mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
  758. {
  759. __m64 ret;
  760. switch (__pos) {
  761. case 0:
  762. asm("pinsrh_0 %0, %1, %2\n\t"
  763. : "=f" (ret)
  764. : "f" (__m1), "f" (__m2), "i" (__pos)
  765. );
  766. break;
  767. case 1:
  768. asm("pinsrh_1 %0, %1, %2\n\t"
  769. : "=f" (ret)
  770. : "f" (__m1), "f" (__m2), "i" (__pos)
  771. );
  772. break;
  773. case 2:
  774. asm("pinsrh_2 %0, %1, %2\n\t"
  775. : "=f" (ret)
  776. : "f" (__m1), "f" (__m2), "i" (__pos)
  777. );
  778. break;
  779. case 3:
  780. asm("pinsrh_3 %0, %1, %2\n\t"
  781. : "=f" (ret)
  782. : "f" (__m1), "f" (__m2), "i" (__pos)
  783. );
  784. break;
  785. }
  786. return ret;
  787. }
  788. extern __inline __m64 FUNCTION_ATTRIBS
  789. _mm_shuffle_pi16(__m64 __m, int64_t __n)
  790. {
  791. __m64 ret;
  792. asm("pshufh %0, %1, %2\n\t"
  793. : "=f" (ret)
  794. : "f" (__m), "f" (*(__m64 *)&__n)
  795. );
  796. return ret;
  797. }
  798. extern __inline __m64 FUNCTION_ATTRIBS
  799. _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
  800. {
  801. __m64 ret;
  802. asm("punpckhbh %0, %1, %2\n\t"
  803. : "=f" (ret)
  804. : "f" (__m1), "f" (__m2)
  805. );
  806. return ret;
  807. }
  808. extern __inline __m64 FUNCTION_ATTRIBS
  809. _mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
  810. {
  811. __m64 ret;
  812. asm("punpckhbh %0, %1, %2\n\t"
  813. : "=f" (ret)
  814. : "f" (__m1), "f" (__m2)
  815. );
  816. return ret;
  817. }
  818. extern __inline __m64 FUNCTION_ATTRIBS
  819. _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
  820. {
  821. __m64 ret;
  822. asm("punpckhhw %0, %1, %2\n\t"
  823. : "=f" (ret)
  824. : "f" (__m1), "f" (__m2)
  825. );
  826. return ret;
  827. }
  828. extern __inline __m64 FUNCTION_ATTRIBS
  829. _mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
  830. {
  831. __m64 ret;
  832. asm("punpckhhw %0, %1, %2\n\t"
  833. : "=f" (ret)
  834. : "f" (__m1), "f" (__m2)
  835. );
  836. return ret;
  837. }
  838. extern __inline __m64 FUNCTION_ATTRIBS
  839. _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
  840. {
  841. __m64 ret;
  842. asm("punpckhwd %0, %1, %2\n\t"
  843. : "=f" (ret)
  844. : "f" (__m1), "f" (__m2)
  845. );
  846. return ret;
  847. }
  848. extern __inline __m64 FUNCTION_ATTRIBS
  849. _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
  850. {
  851. __m64 ret;
  852. asm("punpcklbh %0, %1, %2\n\t"
  853. : "=f" (ret)
  854. : "f" (__m1), "f" (__m2)
  855. );
  856. return ret;
  857. }
  858. /* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
  859. which preserves the data. */
  860. extern __inline __m64 FUNCTION_ATTRIBS
  861. _mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
  862. {
  863. __m64 ret;
  864. asm("punpcklbh %0, %1, %2\n\t"
  865. : "=f" (ret)
  866. : "f" (__m1), "f" (__m2)
  867. );
  868. return ret;
  869. }
  870. /* Since punpcklbh doesn't care about the high 32-bits, we use the __m32,
  871. datatype, which allows load8888 to use 32-bit loads. */
  872. extern __inline __m64 FUNCTION_ATTRIBS
  873. _mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
  874. {
  875. __m64 ret;
  876. asm("punpcklbh %0, %1, %2\n\t"
  877. : "=f" (ret)
  878. : "f" (__m1), "f" (__m2)
  879. );
  880. return ret;
  881. }
  882. extern __inline __m64 FUNCTION_ATTRIBS
  883. _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
  884. {
  885. __m64 ret;
  886. asm("punpcklhw %0, %1, %2\n\t"
  887. : "=f" (ret)
  888. : "f" (__m1), "f" (__m2)
  889. );
  890. return ret;
  891. }
  892. extern __inline __m64 FUNCTION_ATTRIBS
  893. _mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
  894. {
  895. __m64 ret;
  896. asm("punpcklhw %0, %1, %2\n\t"
  897. : "=f" (ret)
  898. : "f" (__m1), "f" (__m2)
  899. );
  900. return ret;
  901. }
  902. extern __inline __m64 FUNCTION_ATTRIBS
  903. _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
  904. {
  905. __m64 ret;
  906. asm("punpcklwd %0, %1, %2\n\t"
  907. : "=f" (ret)
  908. : "f" (__m1), "f" (__m2)
  909. );
  910. return ret;
  911. }
  912. extern __inline __m64 FUNCTION_ATTRIBS
  913. _mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
  914. {
  915. __m64 ret;
  916. asm("punpcklwd %0, %1, %2\n\t"
  917. : "=f" (ret)
  918. : "f" (__m1), "f" (__m2)
  919. );
  920. return ret;
  921. }
  922. extern __inline void FUNCTION_ATTRIBS
  923. _mm_store_pi32(__m32 *dest, __m64 src)
  924. {
  925. src = _mm_packs_pu16(src, _mm_setzero_si64());
  926. asm("swc1 %1, %0\n\t"
  927. : "=m" (*dest)
  928. : "f" (src)
  929. : "memory"
  930. );
  931. }
  932. extern __inline void FUNCTION_ATTRIBS
  933. _mm_store_si64(__m64 *dest, __m64 src)
  934. {
  935. asm("sdc1 %1, %0 \n\t"
  936. : "=m" (*dest)
  937. : "f" (src)
  938. : "memory"
  939. );
  940. }
  941. extern __inline void FUNCTION_ATTRIBS
  942. _mm_storeu_si64(__m64 *dest, __m64 src)
  943. {
  944. asm("gssdlc1 %1, 7(%0) \n\t"
  945. "gssdrc1 %1, 0(%0) \n\t"
  946. :
  947. : "r" (dest), "f" (src)
  948. : "memory"
  949. );
  950. }
  951. extern __inline __m64 FUNCTION_ATTRIBS
  952. _mm_load_si32(const __m32 *src)
  953. {
  954. __m32 ret;
  955. asm("lwc1 %0, %1\n\t"
  956. : "=f" (ret)
  957. : "m" (*src)
  958. );
  959. return ret;
  960. }
  961. extern __inline __m64 FUNCTION_ATTRIBS
  962. _mm_load_si64(const __m64 *src)
  963. {
  964. __m64 ret;
  965. asm("ldc1 %0, %1\n\t"
  966. : "=f" (ret)
  967. : "m" (*src)
  968. : "memory"
  969. );
  970. return ret;
  971. }
  972. extern __inline __m64 FUNCTION_ATTRIBS
  973. _mm_loadu_si64(const __m64 *src)
  974. {
  975. __m64 ret;
  976. asm("gsldlc1 %0, 7(%1)\n\t"
  977. "gsldrc1 %0, 0(%1)\n\t"
  978. : "=f" (ret)
  979. : "r" (src)
  980. : "memory"
  981. );
  982. return ret;
  983. }
  984. extern __inline __m64 FUNCTION_ATTRIBS
  985. _mm_loadlo_pi8(const uint32_t *src)
  986. {
  987. return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
  988. }
  989. extern __inline __m64 FUNCTION_ATTRIBS
  990. _mm_loadlo_pi8_f(__m64 src)
  991. {
  992. return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
  993. }
  994. extern __inline __m64 FUNCTION_ATTRIBS
  995. _mm_loadhi_pi8_f(__m64 src)
  996. {
  997. return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
  998. }
  999. extern __inline __m64 FUNCTION_ATTRIBS
  1000. _mm_loadlo_pi16(__m64 src)
  1001. {
  1002. return _mm_unpacklo_pi16(src, _mm_setzero_si64());
  1003. }
  1004. extern __inline __m64 FUNCTION_ATTRIBS
  1005. _mm_loadlo_pi16_f(__m64 src)
  1006. {
  1007. return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
  1008. }
  1009. extern __inline __m64 FUNCTION_ATTRIBS
  1010. _mm_loadhi_pi16(__m64 src)
  1011. {
  1012. return _mm_unpackhi_pi16(src, _mm_setzero_si64());
  1013. }
  1014. extern __inline __m64 FUNCTION_ATTRIBS
  1015. _mm_loadhi_pi16_f(__m64 src)
  1016. {
  1017. return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
  1018. }
  1019. extern __inline __m64 FUNCTION_ATTRIBS
  1020. _mm_expand_alpha(__m64 pixel)
  1021. {
  1022. return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
  1023. }
  1024. extern __inline __m64 FUNCTION_ATTRIBS
  1025. _mm_expand_alpha_rev(__m64 pixel)
  1026. {
  1027. return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
  1028. }
  1029. #endif /* __LOONGSON_MMINTRIN_H__ */