/* jccolext-mmi.c */

  /*
   * Loongson MMI optimizations for libjpeg-turbo
   *
   * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
   * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
   *                          All Rights Reserved.
   * Authors: ZhuChen <zhuchen@loongson.cn>
   *          SunZhangzhi <sunzhangzhi-cq@loongson.cn>
   *          CaiWanwei <caiwanwei@loongson.cn>
   *          ZhangLixia <zhanglixia-hf@loongson.cn>
   *
   * Based on the x86 SIMD extension for IJG JPEG library
   * Copyright (C) 1999-2006, MIYASAKA Masaru.
   *
   * This software is provided 'as-is', without any express or implied
   * warranty. In no event will the authors be held liable for any damages
   * arising from the use of this software.
   *
   * Permission is granted to anyone to use this software for any purpose,
   * including commercial applications, and to alter it and redistribute it
   * freely, subject to the following restrictions:
   *
   * 1. The origin of this software must not be misrepresented; you must not
   *    claim that you wrote the original software. If you use this software
   *    in a product, an acknowledgment in the product documentation would be
   *    appreciated but is not required.
   * 2. Altered source versions must be plainly marked as such, and must not be
   *    misrepresented as being the original software.
   * 3. This notice may not be removed or altered from any source distribution.
   */

  /* This file is included by jccolor-mmi.c */

  /*
   * Position-generic register aliases.
   *
   * The conversion routine below refers to its working __m64 variables by byte
   * position within a pixel rather than by colour.  For each byte offset 0..3,
   * one pair of aliases is defined:
   *
   *   offset 0 -> mmA / mmB      offset 2 -> mmE / mmF
   *   offset 1 -> mmC / mmD      offset 3 -> mmG / mmH
   *
   * Each pair expands to the "e"/"o" variables (re/ro, ge/go, be/bo) of
   * whichever of RGB_RED, RGB_GREEN, RGB_BLUE equals that offset, or to the
   * filler pair xe/xo when no colour component sits at that offset.
   */

  /* Byte offset 0 */
  #if RGB_RED == 0
  #define mmA  re
  #define mmB  ro
  #elif RGB_GREEN == 0
  #define mmA  ge
  #define mmB  go
  #elif RGB_BLUE == 0
  #define mmA  be
  #define mmB  bo
  #else
  #define mmA  xe
  #define mmB  xo
  #endif

  /* Byte offset 1 */
  #if RGB_RED == 1
  #define mmC  re
  #define mmD  ro
  #elif RGB_GREEN == 1
  #define mmC  ge
  #define mmD  go
  #elif RGB_BLUE == 1
  #define mmC  be
  #define mmD  bo
  #else
  #define mmC  xe
  #define mmD  xo
  #endif

  /* Byte offset 2 */
  #if RGB_RED == 2
  #define mmE  re
  #define mmF  ro
  #elif RGB_GREEN == 2
  #define mmE  ge
  #define mmF  go
  #elif RGB_BLUE == 2
  #define mmE  be
  #define mmF  bo
  #else
  #define mmE  xe
  #define mmF  xo
  #endif

  /* Byte offset 3 */
  #if RGB_RED == 3
  #define mmG  re
  #define mmH  ro
  #elif RGB_GREEN == 3
  #define mmG  ge
  #define mmH  go
  #elif RGB_BLUE == 3
  #define mmG  be
  #define mmH  bo
  #else
  #define mmG  xe
  #define mmH  xo
  #endif

  85. void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
  86. JSAMPIMAGE output_buf, JDIMENSION output_row,
  87. int num_rows)
  88. {
  89. JSAMPROW inptr, outptr0, outptr1, outptr2;
  90. int num_cols, col;
  91. __m64 re, ro, ge, go, be, bo, xe;
  92. #if RGB_PIXELSIZE == 4
  93. __m64 xo;
  94. #endif
  95. __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
  96. __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
  97. __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
  98. __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
  99. __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
  100. __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
  101. __m64 crle, crhe, cre, crlo, crho, cro, cr;
  102. while (--num_rows >= 0) {
  103. inptr = *input_buf++;
  104. outptr0 = output_buf[0][output_row];
  105. outptr1 = output_buf[1][output_row];
  106. outptr2 = output_buf[2][output_row];
  107. output_row++;
  108. for (num_cols = image_width; num_cols > 0; num_cols -= 8,
  109. outptr0 += 8, outptr1 += 8, outptr2 += 8) {
  110. #if RGB_PIXELSIZE == 3
  111. if (num_cols < 8) {
  112. col = num_cols * 3;
  113. asm(".set noreorder\r\n"
  114. "li $8, 1\r\n"
  115. "move $9, %3\r\n"
  116. "and $10, $9, $8\r\n"
  117. "beqz $10, 1f\r\n"
  118. "nop \r\n"
  119. "subu $9, $9, 1\r\n"
  120. "xor $12, $12, $12\r\n"
  121. "move $13, %5\r\n"
  122. PTR_ADDU "$13, $13, $9\r\n"
  123. "lbu $12, 0($13)\r\n"
  124. "1: \r\n"
  125. "li $8, 2\r\n"
  126. "and $10, $9, $8\r\n"
  127. "beqz $10, 2f\r\n"
  128. "nop \r\n"
  129. "subu $9, $9, 2\r\n"
  130. "xor $11, $11, $11\r\n"
  131. "move $13, %5\r\n"
  132. PTR_ADDU "$13, $13, $9\r\n"
  133. "lhu $11, 0($13)\r\n"
  134. "sll $12, $12, 16\r\n"
  135. "or $12, $12, $11\r\n"
  136. "2: \r\n"
  137. "dmtc1 $12, %0\r\n"
  138. "li $8, 4\r\n"
  139. "and $10, $9, $8\r\n"
  140. "beqz $10, 3f\r\n"
  141. "nop \r\n"
  142. "subu $9, $9, 4\r\n"
  143. "move $13, %5\r\n"
  144. PTR_ADDU "$13, $13, $9\r\n"
  145. "lwu $14, 0($13)\r\n"
  146. "dmtc1 $14, %1\r\n"
  147. "dsll32 $12, $12, 0\r\n"
  148. "or $12, $12, $14\r\n"
  149. "dmtc1 $12, %0\r\n"
  150. "3: \r\n"
  151. "li $8, 8\r\n"
  152. "and $10, $9, $8\r\n"
  153. "beqz $10, 4f\r\n"
  154. "nop \r\n"
  155. "mov.s %1, %0\r\n"
  156. "ldc1 %0, 0(%5)\r\n"
  157. "li $9, 8\r\n"
  158. "j 5f\r\n"
  159. "nop \r\n"
  160. "4: \r\n"
  161. "li $8, 16\r\n"
  162. "and $10, $9, $8\r\n"
  163. "beqz $10, 5f\r\n"
  164. "nop \r\n"
  165. "mov.s %2, %0\r\n"
  166. "ldc1 %0, 0(%5)\r\n"
  167. "ldc1 %1, 8(%5)\r\n"
  168. "5: \r\n"
  169. "nop \r\n"
  170. ".set reorder\r\n"
  171. : "=f" (mmA), "=f" (mmG), "=f" (mmF)
  172. : "r" (col), "r" (num_rows), "r" (inptr)
  173. : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
  174. "$14", "memory"
  175. );
  176. } else {
  177. if (!(((long)inptr) & 7)) {
  178. mmA = _mm_load_si64((__m64 *)&inptr[0]);
  179. mmG = _mm_load_si64((__m64 *)&inptr[8]);
  180. mmF = _mm_load_si64((__m64 *)&inptr[16]);
  181. } else {
  182. mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
  183. mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
  184. mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
  185. }
  186. inptr += RGB_PIXELSIZE * 8;
  187. }
  188. mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
  189. mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
  190. mmA = _mm_unpackhi_pi8(mmA, mmG);
  191. mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
  192. mmD = _mm_unpacklo_pi8(mmD, mmF);
  193. mmG = _mm_unpackhi_pi8(mmG, mmF);
  194. mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
  195. mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
  196. mmA = _mm_unpackhi_pi8(mmA, mmD);
  197. mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
  198. mmE = _mm_unpacklo_pi8(mmE, mmG);
  199. mmD = _mm_unpackhi_pi8(mmD, mmG);
  200. mmC = _mm_loadhi_pi8_f(mmA);
  201. mmA = _mm_loadlo_pi8_f(mmA);
  202. mmB = _mm_loadhi_pi8_f(mmE);
  203. mmE = _mm_loadlo_pi8_f(mmE);
  204. mmF = _mm_loadhi_pi8_f(mmD);
  205. mmD = _mm_loadlo_pi8_f(mmD);
  206. #else /* RGB_PIXELSIZE == 4 */
  207. if (num_cols < 8) {
  208. col = num_cols;
  209. asm(".set noreorder\r\n"
  210. "li $8, 1\r\n"
  211. "move $9, %4\r\n"
  212. "and $10, $9, $8\r\n"
  213. "beqz $10, 1f\r\n"
  214. "nop \r\n"
  215. "subu $9, $9, 1\r\n"
  216. PTR_SLL "$11, $9, 2\r\n"
  217. "move $13, %5\r\n"
  218. PTR_ADDU "$13, $13, $11\r\n"
  219. "lwc1 %0, 0($13)\r\n"
  220. "1: \r\n"
  221. "li $8, 2\r\n"
  222. "and $10, $9, $8\r\n"
  223. "beqz $10, 2f\r\n"
  224. "nop \r\n"
  225. "subu $9, $9, 2\r\n"
  226. PTR_SLL "$11, $9, 2\r\n"
  227. "move $13, %5\r\n"
  228. PTR_ADDU "$13, $13, $11\r\n"
  229. "mov.s %1, %0\r\n"
  230. "ldc1 %0, 0($13)\r\n"
  231. "2: \r\n"
  232. "li $8, 4\r\n"
  233. "and $10, $9, $8\r\n"
  234. "beqz $10, 3f\r\n"
  235. "nop \r\n"
  236. "mov.s %2, %0\r\n"
  237. "mov.s %3, %1\r\n"
  238. "ldc1 %0, 0(%5)\r\n"
  239. "ldc1 %1, 8(%5)\r\n"
  240. "3: \r\n"
  241. "nop \r\n"
  242. ".set reorder\r\n"
  243. : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
  244. : "r" (col), "r" (inptr)
  245. : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
  246. );
  247. } else {
  248. if (!(((long)inptr) & 7)) {
  249. mmA = _mm_load_si64((__m64 *)&inptr[0]);
  250. mmF = _mm_load_si64((__m64 *)&inptr[8]);
  251. mmD = _mm_load_si64((__m64 *)&inptr[16]);
  252. mmC = _mm_load_si64((__m64 *)&inptr[24]);
  253. } else {
  254. mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
  255. mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
  256. mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
  257. mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
  258. }
  259. inptr += RGB_PIXELSIZE * 8;
  260. }
  261. mmB = _mm_unpackhi_pi8(mmA, mmF);
  262. mmA = _mm_unpacklo_pi8(mmA, mmF);
  263. mmG = _mm_unpackhi_pi8(mmD, mmC);
  264. mmD = _mm_unpacklo_pi8(mmD, mmC);
  265. mmE = _mm_unpackhi_pi16(mmA, mmD);
  266. mmA = _mm_unpacklo_pi16(mmA, mmD);
  267. mmH = _mm_unpackhi_pi16(mmB, mmG);
  268. mmB = _mm_unpacklo_pi16(mmB, mmG);
  269. mmC = _mm_loadhi_pi8_f(mmA);
  270. mmA = _mm_loadlo_pi8_f(mmA);
  271. mmD = _mm_loadhi_pi8_f(mmB);
  272. mmB = _mm_loadlo_pi8_f(mmB);
  273. mmG = _mm_loadhi_pi8_f(mmE);
  274. mmE = _mm_loadlo_pi8_f(mmE);
  275. mmF = _mm_unpacklo_pi8(mmH, mmH);
  276. mmH = _mm_unpackhi_pi8(mmH, mmH);
  277. mmF = _mm_srli_pi16(mmF, BYTE_BIT);
  278. mmH = _mm_srli_pi16(mmH, BYTE_BIT);
  279. #endif
  280. /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
  281. * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
  282. *
  283. * (Original)
  284. * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
  285. * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
  286. * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
  287. *
  288. * (This implementation)
  289. * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
  290. * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
  291. * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
  292. */
  293. rglo = _mm_unpacklo_pi16(ro, go);
  294. rgho = _mm_unpackhi_pi16(ro, go);
  295. ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
  296. yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
  297. cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
  298. cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
  299. blo = _mm_loadlo_pi16_f(bo);
  300. bho = _mm_loadhi_pi16_f(bo);
  301. halfblo = _mm_srli_pi32(blo, 1);
  302. halfbho = _mm_srli_pi32(bho, 1);
  303. cblo = _mm_add_pi32(cblo, halfblo);
  304. cbho = _mm_add_pi32(cbho, halfbho);
  305. cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
  306. cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
  307. cblo = _mm_srli_pi32(cblo, SCALEBITS);
  308. cbho = _mm_srli_pi32(cbho, SCALEBITS);
  309. cbo = _mm_packs_pi32(cblo, cbho);
  310. rgle = _mm_unpacklo_pi16(re, ge);
  311. rghe = _mm_unpackhi_pi16(re, ge);
  312. yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
  313. yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
  314. cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
  315. cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
  316. ble = _mm_loadlo_pi16_f(be);
  317. bhe = _mm_loadhi_pi16_f(be);
  318. halfble = _mm_srli_pi32(ble, 1);
  319. halfbhe = _mm_srli_pi32(bhe, 1);
  320. cble = _mm_add_pi32(cble, halfble);
  321. cbhe = _mm_add_pi32(cbhe, halfbhe);
  322. cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
  323. cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
  324. cble = _mm_srli_pi32(cble, SCALEBITS);
  325. cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
  326. cbe = _mm_packs_pi32(cble, cbhe);
  327. cbo = _mm_slli_pi16(cbo, BYTE_BIT);
  328. cb = _mm_or_si64(cbe, cbo);
  329. bglo = _mm_unpacklo_pi16(bo, go);
  330. bgho = _mm_unpackhi_pi16(bo, go);
  331. ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
  332. yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
  333. crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
  334. crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
  335. ylo = _mm_add_pi32(ylo_bg, ylo_rg);
  336. yho = _mm_add_pi32(yho_bg, yho_rg);
  337. ylo = _mm_add_pi32(ylo, PD_ONEHALF);
  338. yho = _mm_add_pi32(yho, PD_ONEHALF);
  339. ylo = _mm_srli_pi32(ylo, SCALEBITS);
  340. yho = _mm_srli_pi32(yho, SCALEBITS);
  341. yo = _mm_packs_pi32(ylo, yho);
  342. rlo = _mm_loadlo_pi16_f(ro);
  343. rho = _mm_loadhi_pi16_f(ro);
  344. halfrlo = _mm_srli_pi32(rlo, 1);
  345. halfrho = _mm_srli_pi32(rho, 1);
  346. crlo = _mm_add_pi32(crlo, halfrlo);
  347. crho = _mm_add_pi32(crho, halfrho);
  348. crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
  349. crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
  350. crlo = _mm_srli_pi32(crlo, SCALEBITS);
  351. crho = _mm_srli_pi32(crho, SCALEBITS);
  352. cro = _mm_packs_pi32(crlo, crho);
  353. bgle = _mm_unpacklo_pi16(be, ge);
  354. bghe = _mm_unpackhi_pi16(be, ge);
  355. yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
  356. yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
  357. crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
  358. crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
  359. yle = _mm_add_pi32(yle_bg, yle_rg);
  360. yhe = _mm_add_pi32(yhe_bg, yhe_rg);
  361. yle = _mm_add_pi32(yle, PD_ONEHALF);
  362. yhe = _mm_add_pi32(yhe, PD_ONEHALF);
  363. yle = _mm_srli_pi32(yle, SCALEBITS);
  364. yhe = _mm_srli_pi32(yhe, SCALEBITS);
  365. ye = _mm_packs_pi32(yle, yhe);
  366. yo = _mm_slli_pi16(yo, BYTE_BIT);
  367. y = _mm_or_si64(ye, yo);
  368. rle = _mm_loadlo_pi16_f(re);
  369. rhe = _mm_loadhi_pi16_f(re);
  370. halfrle = _mm_srli_pi32(rle, 1);
  371. halfrhe = _mm_srli_pi32(rhe, 1);
  372. crle = _mm_add_pi32(crle, halfrle);
  373. crhe = _mm_add_pi32(crhe, halfrhe);
  374. crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
  375. crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
  376. crle = _mm_srli_pi32(crle, SCALEBITS);
  377. crhe = _mm_srli_pi32(crhe, SCALEBITS);
  378. cre = _mm_packs_pi32(crle, crhe);
  379. cro = _mm_slli_pi16(cro, BYTE_BIT);
  380. cr = _mm_or_si64(cre, cro);
  381. _mm_store_si64((__m64 *)&outptr0[0], y);
  382. _mm_store_si64((__m64 *)&outptr1[0], cb);
  383. _mm_store_si64((__m64 *)&outptr2[0], cr);
  384. }
  385. }
  386. }
  /* Drop the position-generic aliases defined at the top of this file so they
   * do not leak into the including translation unit (presumably this also
   * lets the file be included again with different RGB_* settings). */
  #undef mmA
  #undef mmB
  #undef mmC
  #undef mmD
  #undef mmE
  #undef mmF
  #undef mmG
  #undef mmH