  /* jcgryext-mmi.c */
  /*
   * Loongson MMI optimizations for libjpeg-turbo
   *
   * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
   * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
   * All Rights Reserved.
   * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
   *
   * Based on the x86 SIMD extension for IJG JPEG library
   * Copyright (C) 1999-2006, MIYASAKA Masaru.
   *
   * This software is provided 'as-is', without any express or implied
   * warranty. In no event will the authors be held liable for any damages
   * arising from the use of this software.
   *
   * Permission is granted to anyone to use this software for any purpose,
   * including commercial applications, and to alter it and redistribute it
   * freely, subject to the following restrictions:
   *
   * 1. The origin of this software must not be misrepresented; you must not
   *    claim that you wrote the original software. If you use this software
   *    in a product, an acknowledgment in the product documentation would be
   *    appreciated but is not required.
   * 2. Altered source versions must be plainly marked as such, and must not be
   *    misrepresented as being the original software.
   * 3. This notice may not be removed or altered from any source distribution.
   */

  /* This file is included by jcgray-mmi.c */
  /*
   * Positional aliases for the colour-named vectors used by
   * jsimd_rgb_gray_convert_mmi().
   *
   * Each pair (mmA/mmB, mmC/mmD, mmE/mmF, mmG/mmH) is bound to the
   * even/odd vector pair of whichever of RGB_RED, RGB_GREEN or RGB_BLUE equals
   * 0, 1, 2 or 3 respectively.  When none of the three matches, the pair is
   * bound to the spare vectors xe/xo.  RGB_RED/RGB_GREEN/RGB_BLUE are defined
   * by the including file (presumably the byte index of each channel within a
   * pixel -- confirm against the includer).
   */

  /* Index 0 */
  #if RGB_RED == 0
  #define mmA  re
  #define mmB  ro
  #elif RGB_GREEN == 0
  #define mmA  ge
  #define mmB  go
  #elif RGB_BLUE == 0
  #define mmA  be
  #define mmB  bo
  #else
  #define mmA  xe
  #define mmB  xo
  #endif

  /* Index 1 */
  #if RGB_RED == 1
  #define mmC  re
  #define mmD  ro
  #elif RGB_GREEN == 1
  #define mmC  ge
  #define mmD  go
  #elif RGB_BLUE == 1
  #define mmC  be
  #define mmD  bo
  #else
  #define mmC  xe
  #define mmD  xo
  #endif

  /* Index 2 */
  #if RGB_RED == 2
  #define mmE  re
  #define mmF  ro
  #elif RGB_GREEN == 2
  #define mmE  ge
  #define mmF  go
  #elif RGB_BLUE == 2
  #define mmE  be
  #define mmF  bo
  #else
  #define mmE  xe
  #define mmF  xo
  #endif

  /* Index 3 */
  #if RGB_RED == 3
  #define mmG  re
  #define mmH  ro
  #elif RGB_GREEN == 3
  #define mmG  ge
  #define mmH  go
  #elif RGB_BLUE == 3
  #define mmG  be
  #define mmH  bo
  #else
  #define mmG  xe
  #define mmH  xo
  #endif
  82. void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
  83. JSAMPIMAGE output_buf, JDIMENSION output_row,
  84. int num_rows)
  85. {
  86. JSAMPROW inptr, outptr;
  87. int num_cols, col;
  88. __m64 re, ro, ge, go, be, bo, xe;
  89. #if RGB_PIXELSIZE == 4
  90. __m64 xo;
  91. #endif
  92. __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
  93. __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
  94. __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
  95. while (--num_rows >= 0) {
  96. inptr = *input_buf++;
  97. outptr = output_buf[0][output_row];
  98. output_row++;
  99. for (num_cols = image_width; num_cols > 0; num_cols -= 8,
  100. outptr += 8) {
  101. #if RGB_PIXELSIZE == 3
  102. if (num_cols < 8) {
  103. col = num_cols * 3;
  104. asm(".set noreorder\r\n"
  105. "li $8, 1\r\n"
  106. "move $9, %3\r\n"
  107. "and $10, $9, $8\r\n"
  108. "beqz $10, 1f\r\n"
  109. "nop \r\n"
  110. "subu $9, $9, 1\r\n"
  111. "xor $12, $12, $12\r\n"
  112. "move $13, %5\r\n"
  113. PTR_ADDU "$13, $13, $9\r\n"
  114. "lbu $12, 0($13)\r\n"
  115. "1: \r\n"
  116. "li $8, 2\r\n"
  117. "and $10, $9, $8\r\n"
  118. "beqz $10, 2f\r\n"
  119. "nop \r\n"
  120. "subu $9, $9, 2\r\n"
  121. "xor $11, $11, $11\r\n"
  122. "move $13, %5\r\n"
  123. PTR_ADDU "$13, $13, $9\r\n"
  124. "lhu $11, 0($13)\r\n"
  125. "sll $12, $12, 16\r\n"
  126. "or $12, $12, $11\r\n"
  127. "2: \r\n"
  128. "dmtc1 $12, %0\r\n"
  129. "li $8, 4\r\n"
  130. "and $10, $9, $8\r\n"
  131. "beqz $10, 3f\r\n"
  132. "nop \r\n"
  133. "subu $9, $9, 4\r\n"
  134. "move $13, %5\r\n"
  135. PTR_ADDU "$13, $13, $9\r\n"
  136. "lwu $14, 0($13)\r\n"
  137. "dmtc1 $14, %1\r\n"
  138. "dsll32 $12, $12, 0\r\n"
  139. "or $12, $12, $14\r\n"
  140. "dmtc1 $12, %0\r\n"
  141. "3: \r\n"
  142. "li $8, 8\r\n"
  143. "and $10, $9, $8\r\n"
  144. "beqz $10, 4f\r\n"
  145. "nop \r\n"
  146. "mov.s %1, %0\r\n"
  147. "ldc1 %0, 0(%5)\r\n"
  148. "li $9, 8\r\n"
  149. "j 5f\r\n"
  150. "nop \r\n"
  151. "4: \r\n"
  152. "li $8, 16\r\n"
  153. "and $10, $9, $8\r\n"
  154. "beqz $10, 5f\r\n"
  155. "nop \r\n"
  156. "mov.s %2, %0\r\n"
  157. "ldc1 %0, 0(%5)\r\n"
  158. "ldc1 %1, 8(%5)\r\n"
  159. "5: \r\n"
  160. "nop \r\n"
  161. ".set reorder\r\n"
  162. : "=f" (mmA), "=f" (mmG), "=f" (mmF)
  163. : "r" (col), "r" (num_rows), "r" (inptr)
  164. : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
  165. "$14", "memory"
  166. );
  167. } else {
  168. if (!(((long)inptr) & 7)) {
  169. mmA = _mm_load_si64((__m64 *)&inptr[0]);
  170. mmG = _mm_load_si64((__m64 *)&inptr[8]);
  171. mmF = _mm_load_si64((__m64 *)&inptr[16]);
  172. } else {
  173. mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
  174. mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
  175. mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
  176. }
  177. inptr += RGB_PIXELSIZE * 8;
  178. }
  179. mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
  180. mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
  181. mmA = _mm_unpackhi_pi8(mmA, mmG);
  182. mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
  183. mmD = _mm_unpacklo_pi8(mmD, mmF);
  184. mmG = _mm_unpackhi_pi8(mmG, mmF);
  185. mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
  186. mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
  187. mmA = _mm_unpackhi_pi8(mmA, mmD);
  188. mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
  189. mmE = _mm_unpacklo_pi8(mmE, mmG);
  190. mmD = _mm_unpackhi_pi8(mmD, mmG);
  191. mmC = _mm_loadhi_pi8_f(mmA);
  192. mmA = _mm_loadlo_pi8_f(mmA);
  193. mmB = _mm_loadhi_pi8_f(mmE);
  194. mmE = _mm_loadlo_pi8_f(mmE);
  195. mmF = _mm_loadhi_pi8_f(mmD);
  196. mmD = _mm_loadlo_pi8_f(mmD);
  197. #else /* RGB_PIXELSIZE == 4 */
  198. if (num_cols < 8) {
  199. col = num_cols;
  200. asm(".set noreorder\r\n"
  201. "li $8, 1\r\n"
  202. "move $9, %4\r\n"
  203. "and $10, $9, $8\r\n"
  204. "beqz $10, 1f\r\n"
  205. "nop \r\n"
  206. "subu $9, $9, 1\r\n"
  207. PTR_SLL "$11, $9, 2\r\n"
  208. "move $13, %5\r\n"
  209. PTR_ADDU "$13, $13, $11\r\n"
  210. "lwc1 %0, 0($13)\r\n"
  211. "1: \r\n"
  212. "li $8, 2\r\n"
  213. "and $10, $9, $8\r\n"
  214. "beqz $10, 2f\r\n"
  215. "nop \r\n"
  216. "subu $9, $9, 2\r\n"
  217. PTR_SLL "$11, $9, 2\r\n"
  218. "move $13, %5\r\n"
  219. PTR_ADDU "$13, $13, $11\r\n"
  220. "mov.s %1, %0\r\n"
  221. "ldc1 %0, 0($13)\r\n"
  222. "2: \r\n"
  223. "li $8, 4\r\n"
  224. "and $10, $9, $8\r\n"
  225. "beqz $10, 3f\r\n"
  226. "nop \r\n"
  227. "mov.s %2, %0\r\n"
  228. "mov.s %3, %1\r\n"
  229. "ldc1 %0, 0(%5)\r\n"
  230. "ldc1 %1, 8(%5)\r\n"
  231. "3: \r\n"
  232. "nop \r\n"
  233. ".set reorder\r\n"
  234. : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
  235. : "r" (col), "r" (inptr)
  236. : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
  237. );
  238. } else {
  239. if (!(((long)inptr) & 7)) {
  240. mmA = _mm_load_si64((__m64 *)&inptr[0]);
  241. mmF = _mm_load_si64((__m64 *)&inptr[8]);
  242. mmD = _mm_load_si64((__m64 *)&inptr[16]);
  243. mmC = _mm_load_si64((__m64 *)&inptr[24]);
  244. } else {
  245. mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
  246. mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
  247. mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
  248. mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
  249. }
  250. inptr += RGB_PIXELSIZE * 8;
  251. }
  252. mmB = _mm_unpackhi_pi8(mmA, mmF);
  253. mmA = _mm_unpacklo_pi8(mmA, mmF);
  254. mmG = _mm_unpackhi_pi8(mmD, mmC);
  255. mmD = _mm_unpacklo_pi8(mmD, mmC);
  256. mmE = _mm_unpackhi_pi16(mmA, mmD);
  257. mmA = _mm_unpacklo_pi16(mmA, mmD);
  258. mmH = _mm_unpackhi_pi16(mmB, mmG);
  259. mmB = _mm_unpacklo_pi16(mmB, mmG);
  260. mmC = _mm_loadhi_pi8_f(mmA);
  261. mmA = _mm_loadlo_pi8_f(mmA);
  262. mmD = _mm_loadhi_pi8_f(mmB);
  263. mmB = _mm_loadlo_pi8_f(mmB);
  264. mmG = _mm_loadhi_pi8_f(mmE);
  265. mmE = _mm_loadlo_pi8_f(mmE);
  266. mmF = _mm_unpacklo_pi8(mmH, mmH);
  267. mmH = _mm_unpackhi_pi8(mmH, mmH);
  268. mmF = _mm_srli_pi16(mmF, BYTE_BIT);
  269. mmH = _mm_srli_pi16(mmH, BYTE_BIT);
  270. #endif
  271. /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
  272. * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
  273. *
  274. * (Original)
  275. * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
  276. *
  277. * (This implementation)
  278. * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
  279. */
  280. rglo = _mm_unpacklo_pi16(ro, go);
  281. rgho = _mm_unpackhi_pi16(ro, go);
  282. ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
  283. yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
  284. rgle = _mm_unpacklo_pi16(re, ge);
  285. rghe = _mm_unpackhi_pi16(re, ge);
  286. yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
  287. yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
  288. bglo = _mm_unpacklo_pi16(bo, go);
  289. bgho = _mm_unpackhi_pi16(bo, go);
  290. ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
  291. yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
  292. ylo = _mm_add_pi32(ylo_bg, ylo_rg);
  293. yho = _mm_add_pi32(yho_bg, yho_rg);
  294. ylo = _mm_add_pi32(ylo, PD_ONEHALF);
  295. yho = _mm_add_pi32(yho, PD_ONEHALF);
  296. ylo = _mm_srli_pi32(ylo, SCALEBITS);
  297. yho = _mm_srli_pi32(yho, SCALEBITS);
  298. yo = _mm_packs_pi32(ylo, yho);
  299. bgle = _mm_unpacklo_pi16(be, ge);
  300. bghe = _mm_unpackhi_pi16(be, ge);
  301. yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
  302. yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
  303. yle = _mm_add_pi32(yle_bg, yle_rg);
  304. yhe = _mm_add_pi32(yhe_bg, yhe_rg);
  305. yle = _mm_add_pi32(yle, PD_ONEHALF);
  306. yhe = _mm_add_pi32(yhe, PD_ONEHALF);
  307. yle = _mm_srli_pi32(yle, SCALEBITS);
  308. yhe = _mm_srli_pi32(yhe, SCALEBITS);
  309. ye = _mm_packs_pi32(yle, yhe);
  310. yo = _mm_slli_pi16(yo, BYTE_BIT);
  311. y = _mm_or_si64(ye, yo);
  312. _mm_store_si64((__m64 *)&outptr[0], y);
  313. }
  314. }
  315. }
  /*
   * Drop the positional aliases so they do not leak into the including file
   * (this file is included by jcgray-mmi.c, presumably once per pixel layout,
   * each time with different RGB_* settings -- confirm against the includer).
   */
  #undef mmA
  #undef mmB
  #undef mmC
  #undef mmD
  #undef mmE
  #undef mmF
  #undef mmG
  #undef mmH