jsimd_dspr2.S 145 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543
  1. /*
  2. * MIPS DSPr2 optimizations for libjpeg-turbo
  3. *
  4. * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  5. * All Rights Reserved.
  6. * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
  7. * Darko Laus <darko.laus@imgtec.com>
  8. * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
  9. *
  10. * This software is provided 'as-is', without any express or implied
  11. * warranty. In no event will the authors be held liable for any damages
  12. * arising from the use of this software.
  13. *
  14. * Permission is granted to anyone to use this software for any purpose,
  15. * including commercial applications, and to alter it and redistribute it
  16. * freely, subject to the following restrictions:
  17. *
  18. * 1. The origin of this software must not be misrepresented; you must not
  19. * claim that you wrote the original software. If you use this software
  20. * in a product, an acknowledgment in the product documentation would be
  21. * appreciated but is not required.
  22. * 2. Altered source versions must be plainly marked as such, and must not be
  23. * misrepresented as being the original software.
  24. * 3. This notice may not be removed or altered from any source distribution.
  25. */
  26. #include "jsimd_dspr2_asm.h"
  27. /*****************************************************************************/
  LEAF_DSPR2(jsimd_c_null_convert_dspr2)
  /*
   * a0     = cinfo->image_width
   * a1     = input_buf
   * a2     = output_buf
   * a3     = output_row
   * 16(sp) = num_rows
   * 20(sp) = cinfo->num_components
   *
   * Null conversion for compression.
   *
   * For each of num_rows rows and for each component ci, every
   * num_components-th byte of the input row (starting at byte ci) is copied
   * to output_buf[ci][output_row]; input_buf and output_row then advance to
   * the next row.
   *
   * Register roles:
   *   t9 = rows left                s0 = num_components (input stride)
   *   t1 = ci                       t0 = image_width & 3
   *   t2 = inptr                    t5 = outptr
   *   t6 = end of the 1..3 residual single-sample copies
   *   s1 = outptr + image_width (end of the output row)
   *
   * The two stack arguments are read 8 bytes above their documented offsets;
   * presumably SAVE_REGS_ON_STACK 8 lowers sp by 8 (macro not visible here).
   *
   * Every copy loop is bottom-tested.  The residual path therefore skips the
   * 4-samples-per-iteration loop when image_width < 4: without that check the
   * loop was entered with outptr == end of row, wrote 4 bytes past the row
   * and never met its exit condition.
   * image_width == 0 and num_components == 0 are still not handled and are
   * assumed never to be passed in.
   */
      SAVE_REGS_ON_STACK 8, s0, s1

      lw          t9, 24(sp)      /* t9 = num_rows */
      lw          s0, 28(sp)      /* s0 = cinfo->num_components */
      andi        t0, a0, 3       /* t0 = cinfo->image_width & 3 */
      beqz        t0, 4f          /* no residual */
       nop

  /* ---- image_width is not a multiple of 4 ---- */
  0:
      addiu       t9, t9, -1      /* --num_rows */
      bltz        t9, 7f          /* all rows done */
       li         t1, 0           /* ci = 0 (delay slot) */
  1:
      sll         t3, t1, 2
      lwx         t5, t3(a2)      /* t5 = output_buf[ci] */
      lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
      sll         t4, a3, 2
      lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
      addu        t2, t2, t1      /* inptr += ci */
      addu        s1, t5, a0      /* s1 = end of output row */
      addu        t6, t5, t0      /* t6 = end of the residual samples */
  2:                              /* copy (image_width & 3) samples one by one */
      lbu         t3, 0(t2)
      addiu       t5, t5, 1
      sb          t3, -1(t5)
      bne         t6, t5, 2b
       addu       t2, t2, s0      /* inptr += num_components (delay slot) */
      beq         s1, t5, 8f      /* image_width < 4: row already complete */
       nop
  3:                              /* copy the remaining samples 4 at a time */
      lbu         t3, 0(t2)
      addu        t4, t2, s0
      addu        t7, t4, s0
      addu        t8, t7, s0
      addu        t2, t8, s0      /* inptr += 4 * num_components */
      lbu         t4, 0(t4)
      lbu         t7, 0(t7)
      lbu         t8, 0(t8)
      addiu       t5, t5, 4
      sb          t3, -4(t5)
      sb          t4, -3(t5)
      sb          t7, -2(t5)
      bne         s1, t5, 3b
       sb         t8, -1(t5)      /* fourth store rides in the delay slot */
  8:
      addiu       t1, t1, 1       /* ++ci */
      bne         t1, s0, 1b
       nop
      addiu       a1, a1, 4       /* ++input_buf */
      bgez        t9, 0b          /* t9 >= 0 here; label 0 does the real test */
       addiu      a3, a3, 1       /* ++output_row (delay slot) */
      b           7f
       nop

  /* ---- image_width is a multiple of 4 ---- */
  4:
      addiu       t9, t9, -1      /* --num_rows */
      bltz        t9, 7f          /* all rows done */
       li         t1, 0           /* ci = 0 (delay slot) */
  5:
      sll         t3, t1, 2
      lwx         t5, t3(a2)      /* t5 = output_buf[ci] */
      lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
      sll         t4, a3, 2
      lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
      addu        t2, t2, t1      /* inptr += ci */
      addu        s1, t5, a0      /* s1 = end of output row */
  6:                              /* copy 4 samples per iteration */
      lbu         t3, 0(t2)
      addu        t4, t2, s0
      addu        t7, t4, s0
      addu        t8, t7, s0
      addu        t2, t8, s0      /* inptr += 4 * num_components */
      lbu         t4, 0(t4)
      lbu         t7, 0(t7)
      lbu         t8, 0(t8)
      addiu       t5, t5, 4
      sb          t3, -4(t5)
      sb          t4, -3(t5)
      sb          t7, -2(t5)
      bne         s1, t5, 6b
       sb         t8, -1(t5)      /* fourth store rides in the delay slot */
      addiu       t1, t1, 1       /* ++ci */
      bne         t1, s0, 5b
       nop
      addiu       a1, a1, 4       /* ++input_buf */
      bgez        t9, 4b          /* t9 >= 0 here; label 4 does the real test */
       addiu      a3, a3, 1       /* ++output_row (delay slot) */
  7:
      RESTORE_REGS_FROM_STACK 8, s0, s1

      j           ra
       nop

  END(jsimd_c_null_convert_dspr2)
  126. /*****************************************************************************/
  127. /*
  128. * jsimd_extrgb_ycc_convert_dspr2
  129. * jsimd_extbgr_ycc_convert_dspr2
  130. * jsimd_extrgbx_ycc_convert_dspr2
  131. * jsimd_extbgrx_ycc_convert_dspr2
  132. * jsimd_extxbgr_ycc_convert_dspr2
  133. * jsimd_extxrgb_ycc_convert_dspr2
  134. *
  135. * Colorspace conversion RGB -> YCbCr
  136. */
  137. .macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
  138. r_offs, g_offs, b_offs
  /*
   * Emits the function jsimd_COLORID_ycc_convert_dspr2.
   *   colorid                - name fragment pasted into the function symbol
   *   pixel_size             - bytes per input pixel (step of the input pointer)
   *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside one pixel
   */
  /* Helper: load the R, G and B bytes of one pixel, then advance inptr. */
  139. .macro DO_RGB_TO_YCC r, g, b, inptr
  140. lbu \r, \r_offs(\inptr)
  141. lbu \g, \g_offs(\inptr)
  142. lbu \b, \b_offs(\inptr)
  143. addiu \inptr, \pixel_size
  144. .endm
  145. LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
  146. /*
  147. * a0 = cinfo->image_width
  148. * a1 = input_buf
  149. * a2 = output_buf
  150. * a3 = output_row
  151. * 16(sp) = num_rows
  152. */
  /*
   * Register roles: s0-s7 and t8 hold the fixed-point constants loaded below,
   * t7 = rows left, t6 = input pointer, t0/t1/t2 = output pointers for
   * component 0/1/2 (Y/Cb/Cr), t9 = end of the component-2 output row.
   * num_rows is read from 48(sp), i.e. 32 above the documented 16(sp);
   * presumably SAVE_REGS_ON_STACK 32 lowers sp by 32 (macro not visible here).
   * Both loops are tested at the bottom, so at least one row and one pixel
   * are always processed.
   * NOTE(review): instructions directly after a branch are treated as delay
   * slots below; this assumes LEAF_DSPR2 selects noreorder mode - confirm.
   */
  153. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  154. lw t7, 48(sp) /* t7 = num_rows */
  155. li s0, 0x4c8b /* FIX(0.29900) */
  156. li s1, 0x9646 /* FIX(0.58700) */
  157. li s2, 0x1d2f /* FIX(0.11400) */
  158. li s3, 0xffffd4cd /* -FIX(0.16874) */
  159. li s4, 0xffffab33 /* -FIX(0.33126) */
  160. li s5, 0x8000 /* FIX(0.50000) */
  161. li s6, 0xffff94d1 /* -FIX(0.41869) */
  162. li s7, 0xffffeb2f /* -FIX(0.08131) */
  163. li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */
  /* ---- per-row setup ---- */
  164. 0:
  165. addiu t7, -1 /* --num_rows */
  166. lw t6, 0(a1) /* t6 = input_buf[0] */
  167. lw t0, 0(a2) /* t0 = output_buf[0] */
  168. lw t1, 4(a2) /* t1 = output_buf[1] */
  169. lw t2, 8(a2) /* t2 = output_buf[2] */
  170. sll t3, a3, 2 /* t3 = output_row * 4 (byte index into the row arrays) */
  171. lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */
  172. lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */
  173. lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */
  174. addu t9, t2, a0 /* t9 = end address */
  175. addiu a3, 1 /* ++output_row */
  /* ---- per-pixel loop ---- */
  176. 1:
  177. DO_RGB_TO_YCC t3, t4, t5, t6 /* t3 = r, t4 = g, t5 = b; t6 += pixel_size */
  /*
   * Only LO of each accumulator is seeded (mtlo); the three sums are
   *   ac0 = s5 + s2*b + s0*r + s1*g     (component 0, Y)
   *   ac1 = t8 + s5*b + s3*r + s4*g     (component 1, Cb)
   *   ac2 = t8 + s5*r + s6*g + s7*b     (component 2, Cr)
   * NOTE(review): maddu is an unsigned multiply-add although s3, s4, s6 and
   * s7 are negative, and HI is never initialised.  Only bits 16..23 of each
   * accumulator reach memory (extr.w by 16, then sb), and those bits do not
   * depend on either, so the stored bytes look unaffected - confirm.
   */
  178. mtlo s5, $ac0
  179. mtlo t8, $ac1
  180. mtlo t8, $ac2
  181. maddu $ac0, s2, t5
  182. maddu $ac1, s5, t5
  183. maddu $ac2, s5, t3
  184. maddu $ac0, s0, t3
  185. maddu $ac1, s3, t3
  186. maddu $ac2, s6, t4
  187. maddu $ac0, s1, t4
  188. maddu $ac1, s4, t4
  189. maddu $ac2, s7, t5
  190. extr.w t3, $ac0, 16 /* t3 = ac0 >> 16 */
  191. extr.w t4, $ac1, 16 /* t4 = ac1 >> 16 */
  192. extr.w t5, $ac2, 16 /* t5 = ac2 >> 16 */
  193. sb t3, 0(t0) /* store low byte to the component-0 row */
  194. sb t4, 0(t1) /* store low byte to the component-1 row */
  195. sb t5, 0(t2) /* store low byte to the component-2 row */
  196. addiu t0, 1
  197. addiu t2, 1
  198. bne t2, t9, 1b /* loop until the component-2 pointer reaches the row end */
  199. addiu t1, 1 /* delay slot: advance the component-1 pointer */
  200. bgtz t7, 0b /* more rows left? */
  201. addiu a1, 4 /* delay slot: ++input_buf */
  202. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  203. j ra
  204. nop
  205. END(jsimd_\colorid\()_ycc_convert_dspr2)
  /* Drop the helper so the next expansion can define it with new offsets. */
  206. .purgem DO_RGB_TO_YCC
  207. .endm
  208. /*-------------------------------------id -- pix R G B */
  /*
   * One function is generated per input pixel layout.  Arguments are:
   * colorid, pixel_size (bytes per pixel), then the byte offsets of R, G and
   * B inside a pixel.
   */
  209. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
  210. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
  211. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
  212. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
  213. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
  214. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
  215. /*****************************************************************************/
  216. /*
  217. * jsimd_ycc_extrgb_convert_dspr2
  218. * jsimd_ycc_extbgr_convert_dspr2
  219. * jsimd_ycc_extrgbx_convert_dspr2
  220. * jsimd_ycc_extbgrx_convert_dspr2
  221. * jsimd_ycc_extxbgr_convert_dspr2
  222. * jsimd_ycc_extxrgb_convert_dspr2
  223. *
  224. * Colorspace conversion YCbCr -> RGB
  225. */
  226. .macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
  227. r_offs, g_offs, b_offs, a_offs
  /*
   * Emits the function jsimd_ycc_COLORID_convert_dspr2.
   *   colorid                - name fragment pasted into the function symbol
   *   pixel_size             - bytes per output pixel (step of the output pointer)
   *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside one pixel
   *   a_offs                 - byte offset of the filler byte; only used when
   *                            pixel_size == 4
   */
  /*
   * Helper: store scratch0/1/2 as the R/G/B bytes of one pixel and advance
   * outptr.  For 4-byte pixels it also writes 0xFF at a_offs and clobbers t0
   * to do so; the three colour stores are issued first, so passing t0 as one
   * of the scratch arguments is still safe.
   */
  228. .macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
  229. sb \scratch0, \r_offs(\outptr)
  230. sb \scratch1, \g_offs(\outptr)
  231. sb \scratch2, \b_offs(\outptr)
  232. .if (\pixel_size == 4)
  233. li t0, 0xFF
  234. sb t0, \a_offs(\outptr)
  235. .endif
  236. addiu \outptr, \pixel_size
  237. .endm
  238. LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
  239. /*
  240. * a0 = cinfo->image_width
  241. * a1 = input_buf
  242. * a2 = input_row
  243. * a3 = output_buf
  244. * 16(sp) = num_rows
  245. */
  /*
   * Register roles: s1 = rows left, s0 = output pointer, s2/s3/s4 = input
   * pointers for component 0/1/2 (y/cb/cr), t9 = end of the y row,
   * t3-t8 = constants loaded below.
   * num_rows is read from 48(sp), i.e. 32 above the documented 16(sp);
   * presumably SAVE_REGS_ON_STACK 32 lowers sp by 32 (macro not visible here).
   * Both loops are tested at the bottom, so at least one row and one pixel
   * are always processed.
   * NOTE(review): instructions directly after a branch are treated as delay
   * slots below; this assumes LEAF_DSPR2 selects noreorder mode - confirm.
   */
  246. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  247. lw s1, 48(sp) /* s1 = num_rows */
  248. li t3, 0x8000 /* rounding term added before the >> 16 of the green sum */
  249. li t4, 0x166e9 /* FIX(1.40200) */
  250. li t5, 0x1c5a2 /* FIX(1.77200) */
  251. li t6, 0xffff492e /* -FIX(0.71414) */
  252. li t7, 0xffffa7e6 /* -FIX(0.34414) */
  253. repl.ph t8, 128 /* t8 = 128 replicated into both halfwords */
  /* ---- per-row setup ---- */
  254. 0:
  255. lw s0, 0(a3) /* s0 = outptr = *output_buf */
  256. lw t0, 0(a1) /* t0 = input_buf[0] */
  257. lw t1, 4(a1) /* t1 = input_buf[1] */
  258. lw t2, 8(a1) /* t2 = input_buf[2] */
  259. sll s5, a2, 2 /* s5 = input_row * 4 (byte index into the row arrays) */
  260. addiu s1, -1 /* --num_rows */
  261. lwx s2, s5(t0) /* s2 = input_buf[0][input_row] */
  262. lwx s3, s5(t1) /* s3 = input_buf[1][input_row] */
  263. lwx s4, s5(t2) /* s4 = input_buf[2][input_row] */
  264. addu t9, s2, a0 /* t9 = end of the y row */
  265. addiu a2, 1 /* ++input_row */
  /* ---- per-pixel loop ---- */
  266. 1:
  267. lbu s7, 0(s4) /* cr */
  268. lbu s6, 0(s3) /* cb */
  269. lbu s5, 0(s2) /* y */
  270. addiu s2, 1
  271. addiu s4, 1
  272. addiu s7, -128 /* s7 = cr - 128 */
  273. addiu s6, -128 /* s6 = cb - 128 */
  274. mul t2, t7, s6 /* t2 = -FIX(0.34414) * (cb - 128) */
  275. mul t0, t6, s7 /* Crgtab[cr] */
  276. sll s7, 15 /* pre-scale so mulq_rs.w yields (C * cr + ONE_HALF) >> 16 */
  277. mulq_rs.w t1, t4, s7 /* Crrtab[cr] */
  278. sll s6, 15 /* same pre-scale for cb */
  279. addu t2, t3 /* Cbgtab[cb] */
  280. addu t2, t0 /* t2 = green sum: cb term + ONE_HALF + cr term */
  281. mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */
  282. sra t2, 16 /* t2 = green offset */
  283. addu t1, s5 /* t1 = y + red offset */
  284. addu t2, s5 /* add y */
  285. ins t2, t1, 16, 16 /* pack: red in the upper halfword, green in the lower */
  /*
   * Clamp to 0..255: subtract 128, saturating shift left by 8, arithmetic
   * shift right by 8, add 128 again.  Red and green are clamped together as
   * packed halfwords in t2; blue is clamped in t0 with the word-sized
   * equivalent (shift by 24).
   */
  286. subu.ph t2, t2, t8
  287. addu t0, s5 /* t0 = y + blue offset */
  288. shll_s.ph t2, t2, 8
  289. subu t0, 128
  290. shra.ph t2, t2, 8
  291. shll_s.w t0, t0, 24
  292. addu.ph t2, t2, t8 /* clip & store */
  293. sra t0, t0, 24
  294. sra t1, t2, 16 /* t1 = clamped red (upper halfword of t2) */
  295. addiu t0, 128 /* t0 = clamped blue */
  296. STORE_YCC_TO_RGB t1, t2, t0, s0 /* sb keeps only the low byte of t2 = green */
  297. bne s2, t9, 1b /* loop until the y pointer reaches the row end */
  298. addiu s3, 1 /* delay slot: advance the cb pointer */
  299. bgtz s1, 0b /* more rows left? */
  300. addiu a3, 4 /* delay slot: ++output_buf */
  301. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  302. j ra
  303. nop
  304. END(jsimd_ycc_\colorid\()_convert_dspr2)
  /* Drop the helper so the next expansion can define it with new offsets. */
  305. .purgem STORE_YCC_TO_RGB
  306. .endm
  307. /*-------------------------------------id -- pix R G B A */
  /*
   * One function is generated per output pixel layout.  Arguments are:
   * colorid, pixel_size (bytes per pixel), then the byte offsets of R, G, B
   * and the filler byte.  The last offset is only referenced by the macro when
   * pixel_size == 4, so the value given for the 3-byte layouts is unused.
   */
  308. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
  309. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
  310. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
  311. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
  312. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
  313. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
  314. /*****************************************************************************/
  315. /*
  316. * jsimd_extrgb_gray_convert_dspr2
  317. * jsimd_extbgr_gray_convert_dspr2
  318. * jsimd_extrgbx_gray_convert_dspr2
  319. * jsimd_extbgrx_gray_convert_dspr2
  320. * jsimd_extxbgr_gray_convert_dspr2
  321. * jsimd_extxrgb_gray_convert_dspr2
  322. *
  323. * Colorspace conversion RGB -> GRAY
  324. */
  325. .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
  326. r_offs, g_offs, b_offs
  /*
   * Emits the function jsimd_COLORID_gray_convert_dspr2.
   *   colorid                - name fragment pasted into the function symbol
   *   pixel_size             - bytes per input pixel (step of the input pointer)
   *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside one pixel
   */
  /* Helper: load the R, G and B bytes of one pixel, then advance inptr. */
  327. .macro DO_RGB_TO_GRAY r, g, b, inptr
  328. lbu \r, \r_offs(\inptr)
  329. lbu \g, \g_offs(\inptr)
  330. lbu \b, \b_offs(\inptr)
  331. addiu \inptr, \pixel_size
  332. .endm
  333. LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
  334. /*
  335. * a0 = cinfo->image_width
  336. * a1 = input_buf
  337. * a2 = output_buf
  338. * a3 = output_row
  339. * 16(sp) = num_rows
  340. */
  /*
   * Register roles: s6 = rows left, t0 = input pointer, t1 = output pointer,
   * t7 = image_width & 3, t8 = end of the part handled 4 pixels at a time,
   * t9 = end of the output row, s0-s2 and s7 = constants loaded below.
   * Each output byte is (s7 + s2*b + s1*g + s0*r) >> 16.
   * num_rows is read from 48(sp), i.e. 32 above the documented 16(sp);
   * presumably SAVE_REGS_ON_STACK 32 lowers sp by 32 (macro not visible here).
   * The row loop is tested at the bottom, so at least one row is processed.
   */
  341. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  342. li s0, 0x4c8b /* s0 = FIX(0.29900) */
  343. li s1, 0x9646 /* s1 = FIX(0.58700) */
  344. li s2, 0x1d2f /* s2 = FIX(0.11400) */
  345. li s7, 0x8000 /* s7 = FIX(0.50000) */
  346. lw s6, 48(sp) /* s6 = num_rows */
  347. andi t7, a0, 3 /* t7 = image_width & 3 (pixels left after the 4x loop) */
  /* ---- per-row setup ---- */
  348. 0:
  349. addiu s6, -1 /* --num_rows */
  350. lw t0, 0(a1) /* t0 = inptr = input_buf[0] */
  351. lw t1, 0(a2) /* t1 = output_buf[0] */
  352. sll t3, a3, 2 /* t3 = output_row * 4 */
  353. lwx t1, t3(t1) /* t1 = outptr = output_buf[0][output_row] */
  354. addiu a3, 1 /* ++output_row */
  355. addu t9, t1, a0 /* t9 = end of the output row */
  356. subu t8, t9, t7 /* t8 = end of the 4-pixels-per-iteration part */
  357. beq t1, t8, 2f /* fewer than 4 pixels in the row: skip the 4x loop */
  358. nop
  /*
   * ---- 4 pixels per iteration ----
   * Accumulators ac0 and ac1 are used alternately; the results end up in
   * t6, t2, t5, t3 (pixel 0..3) and are stored to outptr[0..3].
   */
  359. 1:
  360. DO_RGB_TO_GRAY t3, t4, t5, t0 /* pixel 0 */
  361. DO_RGB_TO_GRAY s3, s4, s5, t0 /* pixel 1 */
  362. mtlo s7, $ac0
  363. maddu $ac0, s2, t5
  364. maddu $ac0, s1, t4
  365. maddu $ac0, s0, t3
  366. mtlo s7, $ac1
  367. maddu $ac1, s2, s5
  368. maddu $ac1, s1, s4
  369. maddu $ac1, s0, s3
  370. extr.w t6, $ac0, 16 /* t6 = gray of pixel 0 */
  371. DO_RGB_TO_GRAY t3, t4, t5, t0 /* pixel 2 */
  372. DO_RGB_TO_GRAY s3, s4, s5, t0 /* pixel 3 */
  373. mtlo s7, $ac0
  374. maddu $ac0, s2, t5
  375. maddu $ac0, s1, t4
  376. extr.w t2, $ac1, 16 /* t2 = gray of pixel 1; read before ac1 is reseeded */
  377. maddu $ac0, s0, t3
  378. mtlo s7, $ac1
  379. maddu $ac1, s2, s5
  380. maddu $ac1, s1, s4
  381. maddu $ac1, s0, s3
  382. extr.w t5, $ac0, 16 /* t5 = gray of pixel 2 */
  383. sb t6, 0(t1)
  384. sb t2, 1(t1)
  385. extr.w t3, $ac1, 16 /* t3 = gray of pixel 3 */
  386. addiu t1, 4 /* outptr += 4; the two stores below use negative offsets */
  387. sb t5, -2(t1)
  388. sb t3, -1(t1)
  389. bne t1, t8, 1b
  390. nop
  391. 2:
  392. beqz t7, 4f /* no leftover pixels in this row */
  393. nop
  /* ---- leftover pixels (image_width & 3), one per iteration ---- */
  394. 3:
  395. DO_RGB_TO_GRAY t3, t4, t5, t0
  396. mtlo s7, $ac0
  397. maddu $ac0, s2, t5
  398. maddu $ac0, s1, t4
  399. maddu $ac0, s0, t3
  400. extr.w t6, $ac0, 16
  401. sb t6, 0(t1)
  402. addiu t1, 1
  403. bne t1, t9, 3b /* loop until outptr reaches the row end */
  404. nop
  405. 4:
  406. bgtz s6, 0b /* more rows left? */
  407. addiu a1, 4 /* ++input_buf; NOTE(review): delay slot if noreorder - confirm */
  408. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  409. j ra
  410. nop
  411. END(jsimd_\colorid\()_gray_convert_dspr2)
  /* Drop the helper so the next expansion can define it with new offsets. */
  412. .purgem DO_RGB_TO_GRAY
  413. .endm
  414. /*-------------------------------------id -- pix R G B */
  /*
   * One function is generated per input pixel layout.  Arguments are:
   * colorid, pixel_size (bytes per pixel), then the byte offsets of R, G and
   * B inside a pixel.
   */
  415. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
  416. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
  417. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
  418. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
  419. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
  420. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
  421. /*****************************************************************************/
  422. /*
  423. * jsimd_h2v2_merged_upsample_dspr2
  424. * jsimd_h2v2_extrgb_merged_upsample_dspr2
  425. * jsimd_h2v2_extrgbx_merged_upsample_dspr2
  426. * jsimd_h2v2_extbgr_merged_upsample_dspr2
  427. * jsimd_h2v2_extbgrx_merged_upsample_dspr2
  428. * jsimd_h2v2_extxbgr_merged_upsample_dspr2
  429. * jsimd_h2v2_extxrgb_merged_upsample_dspr2
  430. *
  431. * Merged h2v2 upsample routines
  *
  * The macro below emits one function per colorid.  Each function reads one
  * Cb row, one Cr row and two Y rows, and writes two rows of packed pixels.
  * One Cb/Cr sample is shared by a 2x2 group of Y samples.
  *
  * Macro parameters:
  *   colorid      name fragment inserted into the function name
  *   pixel_size   bytes written per PAIR of pixels (outptr advance); the
  *                value 8 additionally enables the 0xFF filler-byte stores
  *   r1/g1/b1/a1  byte offsets of the first pixel's components
  *   r2/g2/b2/a2  byte offsets of the second pixel's components
  *                (a1_offs/a2_offs are only referenced when pixel_size == 8)
  432. */
  433. .macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
  434. r1_offs, g1_offs, \
  435. b1_offs, a1_offs, \
  436. r2_offs, g2_offs, \
  437. b2_offs, a2_offs
  /* Store two adjacent pixels (scratch0-2 = first, scratch3-5 = second) and
     advance outptr by pixel_size. */
  438. .macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
  439. scratch5 outptr
  440. sb \scratch0, \r1_offs(\outptr)
  441. sb \scratch1, \g1_offs(\outptr)
  442. sb \scratch2, \b1_offs(\outptr)
  443. sb \scratch3, \r2_offs(\outptr)
  444. sb \scratch4, \g2_offs(\outptr)
  445. sb \scratch5, \b2_offs(\outptr)
  446. .if (\pixel_size == 8)
  447. li \scratch0, 0xFF /* scratch0 was stored above; it is overwritten with the filler value */
  448. sb \scratch0, \a1_offs(\outptr)
  449. sb \scratch0, \a2_offs(\outptr)
  450. .endif
  451. addiu \outptr, \pixel_size
  452. .endm
  /* Store a single pixel; outptr is NOT advanced.  Overwrites t0 when
     pixel_size == 8. */
  453. .macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
  454. sb \scratch0, \r1_offs(\outptr)
  455. sb \scratch1, \g1_offs(\outptr)
  456. sb \scratch2, \b1_offs(\outptr)
  457. .if (\pixel_size == 8)
  458. li t0, 0xFF
  459. sb t0, \a1_offs(\outptr)
  460. .endif
  461. .endm
  462. LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
  463. /*
  464. * a0 = cinfo->output_width
  465. * a1 = input_buf
  466. * a2 = in_row_group_ctr
  467. * a3 = output_buf
  468. * 16(sp) = cinfo->sample_range_limit
  469. */
  470. SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra /* ra is saved because it is used as a data scratch register below */
  471. lw t9, 56(sp) /* cinfo->sample_range_limit (the 16(sp) argument, read at 56(sp) after the save) */
  472. lw v0, 0(a1) /* v0 = input_buf[0] */
  473. lw v1, 4(a1) /* v1 = input_buf[1] */
  474. lw t0, 8(a1) /* t0 = input_buf[2] */
  475. sll t1, a2, 3 /* t1 = byte offset of row pointer [in_row_group_ctr*2] */
  476. addiu t2, t1, 4 /* t2 = byte offset of row pointer [in_row_group_ctr*2 + 1] */
  477. sll t3, a2, 2 /* t3 = byte offset of row pointer [in_row_group_ctr] */
  478. lw t4, 0(a3) /* t4 = output_buf[0] */
  479. lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */
  480. lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
  481. lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */
  482. lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */
  483. lw t7, 4(a3) /* t7 = output_buf[1] */
  484. li s1, 0xe6ea
  485. addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */
  486. addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */
  487. addiu s1, zero, 0xa7e6 /* s1 = 0xffffa7e6 [-FIX(0.34414)] */
  488. xori s2, s1, 0xeec8 /* s2 = 0xffff492e [-FIX(0.71414)] */
  489. srl t3, a0, 1 /* t3 = output_width / 2 = number of column pairs */
  490. blez t3, 2f
  491. addu t0, t5, t3 /* t0 = end address */
  492. 1:
  493. lbu t3, 0(t5) /* t3 = cb */
  494. lbu s3, 0(t6) /* s3 = cr */
  495. addiu t5, t5, 1
  496. addiu t3, t3, -128 /* (cb - 128) */
  497. addiu s3, s3, -128 /* (cr - 128) */
  498. mult $ac1, s1, t3
  499. madd $ac1, s2, s3 /* $ac1 = s1*(cb - 128) + s2*(cr - 128) */
  500. sll s3, s3, 15
  501. sll t3, t3, 15
  502. mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
  503. extr_r.w s5, $ac1, 16 /* s5 = green offset: $ac1 >> 16 with rounding */
  504. mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
  505. lbu v0, 0(t1) /* v0 = y, upper row, first column */
  506. addiu t6, t6, 1
  507. addiu t1, t1, 2
  508. addu t3, v0, s4 /* y+cred */
  509. addu s3, v0, s5 /* y+cgreen */
  510. addu v1, v0, s6 /* y+cblue */
  511. addu t3, t9, t3 /* range_limit + y+cred */
  512. addu s3, t9, s3 /* range_limit + y+cgreen */
  513. addu v1, t9, v1 /* range_limit + y+cblue */
  514. lbu AT, 0(t3)
  515. lbu s7, 0(s3)
  516. lbu ra, 0(v1)
  517. lbu v0, -1(t1) /* v0 = y, upper row, second column (t1 was already advanced by 2) */
  518. addu t3, v0, s4 /* y+cred */
  519. addu s3, v0, s5 /* y+cgreen */
  520. addu v1, v0, s6 /* y+cblue */
  521. addu t3, t9, t3 /* range_limit + y+cred */
  522. addu s3, t9, s3 /* range_limit + y+cgreen */
  523. addu v1, t9, v1 /* range_limit + y+cblue */
  524. lbu t3, 0(t3)
  525. lbu s3, 0(s3)
  526. lbu v1, 0(v1)
  527. lbu v0, 0(t2) /* v0 = y, lower row, first column */
  528. STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
  529. addu t3, v0, s4 /* y+cred */
  530. addu s3, v0, s5 /* y+cgreen */
  531. addu v1, v0, s6 /* y+cblue */
  532. addu t3, t9, t3 /* range_limit + y+cred */
  533. addu s3, t9, s3 /* range_limit + y+cgreen */
  534. addu v1, t9, v1 /* range_limit + y+cblue */
  535. lbu AT, 0(t3)
  536. lbu s7, 0(s3)
  537. lbu ra, 0(v1)
  538. lbu v0, 1(t2) /* v0 = y, lower row, second column */
  539. addiu t2, t2, 2
  540. addu t3, v0, s4 /* y+cred */
  541. addu s3, v0, s5 /* y+cgreen */
  542. addu v1, v0, s6 /* y+cblue */
  543. addu t3, t9, t3 /* range_limit + y+cred */
  544. addu s3, t9, s3 /* range_limit + y+cgreen */
  545. addu v1, t9, v1 /* range_limit + y+cblue */
  546. lbu t3, 0(t3)
  547. lbu s3, 0(s3)
  548. lbu v1, 0(v1)
  549. STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
  550. bne t0, t5, 1b
  551. nop
  552. 2:
  553. andi t0, a0, 1 /* odd output_width: one more column in each output row */
  554. beqz t0, 4f
  555. lbu t3, 0(t5) /* t3 = cb */
  556. lbu s3, 0(t6) /* s3 = cr */
  557. addiu t3, t3, -128 /* (cb - 128) */
  558. addiu s3, s3, -128 /* (cr - 128) */
  559. mult $ac1, s1, t3
  560. madd $ac1, s2, s3
  561. sll s3, s3, 15
  562. sll t3, t3, 15
  563. lbu v0, 0(t1) /* v0 = y, upper row */
  564. extr_r.w s5, $ac1, 16
  565. mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
  566. mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
  567. addu t3, v0, s4 /* y+cred */
  568. addu s3, v0, s5 /* y+cgreen */
  569. addu v1, v0, s6 /* y+cblue */
  570. addu t3, t9, t3 /* range_limit + y+cred */
  571. addu s3, t9, s3 /* range_limit + y+cgreen */
  572. addu v1, t9, v1 /* range_limit + y+cblue */
  573. lbu t3, 0(t3)
  574. lbu s3, 0(s3)
  575. lbu v1, 0(v1)
  576. lbu v0, 0(t2) /* v0 = y, lower row */
  577. STORE_H2V2_1_PIXEL t3, s3, v1, t4
  578. addu t3, v0, s4 /* y+cred */
  579. addu s3, v0, s5 /* y+cgreen */
  580. addu v1, v0, s6 /* y+cblue */
  581. addu t3, t9, t3 /* range_limit + y+cred */
  582. addu s3, t9, s3 /* range_limit + y+cgreen */
  583. addu v1, t9, v1 /* range_limit + y+cblue */
  584. lbu t3, 0(t3)
  585. lbu s3, 0(s3)
  586. lbu v1, 0(v1)
  587. STORE_H2V2_1_PIXEL t3, s3, v1, t7
  588. 4:
  589. RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
  590. j ra
  591. nop
  592. END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
  593. .purgem STORE_H2V2_1_PIXEL
  594. .purgem STORE_H2V2_2_PIXELS
  595. .endm
  596. /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
  /* pix = bytes per pair of output pixels; R1..A1 / R2..A2 = byte offsets of
     the first / second pixel.  The A1 and A2 columns are only used by the
     macro when pix == 8. */
  597. GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
  598. GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
  599. GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
  600. GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
  601. GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
  602. GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
  603. /*****************************************************************************/
  604. /*
  605. * jsimd_h2v1_merged_upsample_dspr2
  606. * jsimd_h2v1_extrgb_merged_upsample_dspr2
  607. * jsimd_h2v1_extrgbx_merged_upsample_dspr2
  608. * jsimd_h2v1_extbgr_merged_upsample_dspr2
  609. * jsimd_h2v1_extbgrx_merged_upsample_dspr2
  610. * jsimd_h2v1_extxbgr_merged_upsample_dspr2
  611. * jsimd_h2v1_extxrgb_merged_upsample_dspr2
  612. *
  613. * Merged h2v1 upsample routines
  *
  * The macro below emits one function per colorid.  Each function reads one
  * Y row, one Cb row and one Cr row and writes one row of packed pixels; one
  * Cb/Cr sample is shared by two horizontally adjacent Y samples.
  *
  * Macro parameters:
  *   colorid      name fragment inserted into the function name
  *   pixel_size   bytes written per PAIR of pixels (outptr advance); the
  *                value 8 additionally enables the 0xFF filler-byte stores
  *   r1/g1/b1/a1  byte offsets of the first pixel's components
  *   r2/g2/b2/a2  byte offsets of the second pixel's components
  *                (a1_offs/a2_offs are only referenced when pixel_size == 8)
  614. */
  615. .macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
  616. r1_offs, g1_offs, \
  617. b1_offs, a1_offs, \
  618. r2_offs, g2_offs, \
  619. b2_offs, a2_offs
  /* Store two adjacent pixels and advance outptr by pixel_size.  Overwrites
     t0 when pixel_size == 8. */
  620. .macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
  621. scratch5 outptr
  622. sb \scratch0, \r1_offs(\outptr)
  623. sb \scratch1, \g1_offs(\outptr)
  624. sb \scratch2, \b1_offs(\outptr)
  625. sb \scratch3, \r2_offs(\outptr)
  626. sb \scratch4, \g2_offs(\outptr)
  627. sb \scratch5, \b2_offs(\outptr)
  628. .if (\pixel_size == 8)
  629. li t0, 0xFF
  630. sb t0, \a1_offs(\outptr)
  631. sb t0, \a2_offs(\outptr)
  632. .endif
  633. addiu \outptr, \pixel_size
  634. .endm
  /* Store a single pixel; outptr is NOT advanced.  Overwrites t0 when
     pixel_size == 8. */
  635. .macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
  636. sb \scratch0, \r1_offs(\outptr)
  637. sb \scratch1, \g1_offs(\outptr)
  638. sb \scratch2, \b1_offs(\outptr)
  639. .if (\pixel_size == 8)
  640. li t0, 0xFF
  641. sb t0, \a1_offs(\outptr)
  642. .endif
  643. .endm
  644. LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
  645. /*
  646. * a0 = cinfo->output_width
  647. * a1 = input_buf
  648. * a2 = in_row_group_ctr
  649. * a3 = output_buf
  650. * 16(sp) = range_limit
  651. */
  652. SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra /* ra is saved because it is used as a data scratch register below */
  653. li t0, 0xe6ea
  654. lw t1, 0(a1) /* t1 = input_buf[0] */
  655. lw t2, 4(a1) /* t2 = input_buf[1] */
  656. lw t3, 8(a1) /* t3 = input_buf[2] */
  657. lw t8, 56(sp) /* t8 = range_limit */
  658. addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */
  659. addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */
  660. addiu s0, t0, 0x9916 /* s0 = 0x8000 (ONE_HALF; only used by the odd-width tail at label 3) */
  661. addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
  662. xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */
  663. srl t0, a0, 1 /* t0 = output_width / 2 = number of column pairs */
  664. sll t4, a2, 2 /* t4 = byte offset of row pointer [in_row_group_ctr] */
  665. lwx s5, t4(t1) /* s5 = inptr0 */
  666. lwx s6, t4(t2) /* s6 = inptr1 */
  667. lwx s7, t4(t3) /* s7 = inptr2 */
  668. lw t7, 0(a3) /* t7 = outptr */
  669. blez t0, 2f
  670. addu t9, s6, t0 /* t9 = end address */
  671. 1:
  672. lbu t2, 0(s6) /* t2 = cb */
  673. lbu t0, 0(s7) /* t0 = cr */
  674. lbu t1, 0(s5) /* t1 = y */
  675. addiu t2, t2, -128 /* t2 = cb - 128 */
  676. addiu t0, t0, -128 /* t0 = cr - 128 */
  677. mult $ac1, s4, t2
  678. madd $ac1, s3, t0 /* $ac1 = s4*(cb - 128) + s3*(cr - 128) */
  679. sll t0, t0, 15
  680. sll t2, t2, 15
  681. mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
  682. extr_r.w t5, $ac1, 16 /* t5 = green offset: $ac1 >> 16 with rounding */
  683. mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
  684. addiu s7, s7, 1
  685. addiu s6, s6, 1
  686. addu t2, t1, t0 /* t2 = y + cred */
  687. addu t3, t1, t5 /* t3 = y + cgreen */
  688. addu t4, t1, t6 /* t4 = y + cblue */
  689. addu t2, t8, t2 /* range_limit + y + cred */
  690. addu t3, t8, t3 /* range_limit + y + cgreen */
  691. addu t4, t8, t4 /* range_limit + y + cblue */
  692. lbu t1, 1(s5) /* t1 = y of the second column */
  693. lbu v0, 0(t2)
  694. lbu v1, 0(t3)
  695. lbu ra, 0(t4)
  696. addu t2, t1, t0 /* y + cred */
  697. addu t3, t1, t5 /* y + cgreen */
  698. addu t4, t1, t6 /* y + cblue */
  699. addu t2, t8, t2
  700. addu t3, t8, t3
  701. addu t4, t8, t4
  702. lbu t2, 0(t2)
  703. lbu t3, 0(t3)
  704. lbu t4, 0(t4)
  705. STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
  706. bne t9, s6, 1b
  707. addiu s5, s5, 2
  708. 2:
  709. andi t0, a0, 1 /* odd output_width: one more column */
  710. beqz t0, 4f
  711. nop
  712. 3:
  713. lbu t2, 0(s6)
  714. lbu t0, 0(s7)
  715. lbu t1, 0(s5)
  716. addiu t2, t2, -128 /* (cb - 128) */
  717. addiu t0, t0, -128 /* (cr - 128) */
  718. mul t3, s4, t2
  719. mul t4, s3, t0
  720. sll t0, t0, 15
  721. sll t2, t2, 15
  722. mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */
  723. mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */
  724. addu t3, t3, s0 /* add ONE_HALF */
  725. addu t3, t4, t3
  726. sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
  727. addu t2, t1, t0 /* y + cred */
  728. addu t3, t1, t5 /* y + cgreen */
  729. addu t4, t1, t6 /* y + cblue */
  730. addu t2, t8, t2
  731. addu t3, t8, t3
  732. addu t4, t8, t4
  733. lbu t2, 0(t2)
  734. lbu t3, 0(t3)
  735. lbu t4, 0(t4)
  736. STORE_H2V1_1_PIXEL t2, t3, t4, t7
  737. 4:
  738. RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
  739. j ra
  740. nop
  741. END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
  742. .purgem STORE_H2V1_1_PIXEL
  743. .purgem STORE_H2V1_2_PIXELS
  744. .endm
  745. /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
  /* pix = bytes per pair of output pixels; R1..A1 / R2..A2 = byte offsets of
     the first / second pixel.  The A1 and A2 columns are only used by the
     macro when pix == 8. */
  746. GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
  747. GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
  748. GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
  749. GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
  750. GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
  751. GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
  752. /*****************************************************************************/
  753. /*
  754. * jsimd_h2v2_fancy_upsample_dspr2
  755. *
  756. * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  *
  * For every input row two output rows are produced: the first pass blends
  * the row with the one above it (pointer at -4(a2)), the second pass with
  * the one below it (pointer at 4(a2)).  Column sums are 3*current + neighbor
  * row; each output sample is (3*thiscolsum + adjacent colsum + bias) >> 4.
  757. */
  758. LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
  759. /*
  760. * a0 = cinfo->max_v_samp_factor
  761. * a1 = downsampled_width
  762. * a2 = input_data
  763. * a3 = output_data_ptr
  764. */
  765. SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
  766. li s4, 0 /* s4 = byte offset of the current output row pointer */
  767. lw s2, 0(a3) /* s2 = *output_data_ptr */
  768. 0:
  769. li t9, 2 /* t9 = output rows still to produce for this input row */
  770. lw s1, -4(a2) /* s1 = inptr1 */
  771. 1:
  772. lw s0, 0(a2) /* s0 = inptr0 */
  773. lwx s3, s4(s2) /* s3 = outptr */
  774. addiu s5, a1, -2 /* s5 = downsampled_width - 2 */
  775. srl t4, s5, 1
  776. sll t4, t4, 1 /* t4 = s5 rounded down to an even count */
  777. lbu t0, 0(s0)
  778. lbu t1, 1(s0)
  779. lbu t2, 0(s1)
  780. lbu t3, 1(s1)
  781. addiu s0, 2
  782. addiu s1, 2
  783. addu t8, s0, t4 /* t8 = end address */
  784. andi s5, s5, 1 /* s5 = residual */
  785. sll t4, t0, 1
  786. sll t6, t1, 1
  787. addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */
  788. addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */
  789. addu t7, t0, t2 /* t7 = thiscolsum */
  790. addu t6, t1, t3 /* t6 = nextcolsum */
  791. sll t0, t7, 2 /* t0 = thiscolsum * 4 */
  792. subu t1, t0, t7 /* t1 = thiscolsum * 3 */
  793. shra_r.w t0, t0, 4 /* first output: (thiscolsum * 4 + 8) >> 4 */
  794. addiu t1, 7
  795. addu t1, t1, t6
  796. srl t1, t1, 4 /* second output: (thiscolsum * 3 + nextcolsum + 7) >> 4 */
  797. sb t0, 0(s3)
  798. sb t1, 1(s3)
  799. beq t8, s0, 22f /* no full column pair left (downsampled_width is 2 or 3) */
  800. addiu s3, 2
  801. 2:
  802. lh t0, 0(s0) /* t0 = A3|A2 */
  803. lh t2, 0(s1) /* t2 = B3|B2 */
  804. addiu s0, 2
  805. addiu s1, 2
  806. preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */
  807. preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */
  808. shll.ph t1, t0, 1
  809. sll t3, t6, 1
  810. addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */
  811. addu t3, t3, t6 /* t3 = this * 3 */
  812. addu.ph t0, t0, t2 /* t0 = next2|next1 */
  813. addu t1, t3, t7 /* t1 = this*3 + last */
  814. andi t7, t0, 0xFFFF /* t7 = next1 */
  815. sll t2, t7, 1
  816. addu t2, t7, t2 /* t2 = next1*3 */
  817. addu t4, t2, t6 /* t4 = next1*3 + this */
  818. srl t6, t0, 16 /* t6 = next2 */
  819. shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */
  820. addu t0, t3, t7
  821. addiu t0, 7
  822. srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */
  823. shra_r.w t4, t4, 4 /* t4 = (next1*3 + this + 8) >> 4 */
  824. addu t2, t2, t6
  825. addiu t2, 7
  826. srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */
  827. sb t1, 0(s3)
  828. sb t0, 1(s3)
  829. sb t4, 2(s3)
  830. sb t2, 3(s3)
  831. bne t8, s0, 2b
  832. addiu s3, 4
  833. 22:
  834. beqz s5, 4f
  835. addu t8, s0, s5 /* t8 = end address of the residual column */
  836. 3:
  837. lbu t0, 0(s0)
  838. lbu t2, 0(s1)
  839. addiu s0, 1
  840. addiu s1, 1
  841. sll t3, t6, 1
  842. sll t1, t0, 1
  843. addu t1, t0, t1 /* t1 = inptr0 * 3 */
  844. addu t3, t3, t6 /* t3 = thiscolsum * 3 */
  845. addu t5, t1, t2 /* t5 = nextcolsum */
  846. addu t1, t3, t7
  847. shra_r.w t1, t1, 4 /* (thiscolsum * 3 + lastcolsum + 8) >> 4 */
  848. addu t0, t3, t5
  849. addiu t0, 7
  850. srl t0, t0, 4 /* (thiscolsum * 3 + nextcolsum + 7) >> 4 */
  851. sb t1, 0(s3)
  852. sb t0, 1(s3)
  853. addiu s3, 2
  854. move t7, t6 /* lastcolsum = thiscolsum */
  855. bne t8, s0, 3b
  856. move t6, t5 /* thiscolsum = nextcolsum */
  857. 4:
  /* Last column: t6 = thiscolsum, t7 = lastcolsum */
  858. sll t0, t6, 2 /* t0 = thiscolsum * 4 */
  859. subu t1, t0, t6 /* t1 = thiscolsum * 3 */
  860. addu t1, t1, t7
  861. addiu s4, 4 /* advance to the next output row pointer */
  862. shra_r.w t1, t1, 4
  863. addiu t0, 7
  864. srl t0, t0, 4
  865. sb t1, 0(s3)
  866. sb t0, 1(s3)
  867. addiu t9, -1
  868. addiu s3, 2
  869. bnez t9, 1b
  870. lw s1, 4(a2) /* s1 = the row after inptr0, used by the second pass */
  871. srl t0, s4, 2 /* t0 = number of output rows written so far */
  872. subu t0, a0, t0
  873. bgtz t0, 0b
  874. addiu a2, 4 /* next input row pointer */
  875. RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
  876. j ra
  877. nop
  878. END(jsimd_h2v2_fancy_upsample_dspr2)
  879. /*****************************************************************************/
  880. LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
  881. /*
  882. * a0 = cinfo->max_v_samp_factor
  883. * a1 = downsampled_width
  884. * a2 = input_data
  885. * a3 = output_data_ptr
  *
  * Doubles each row horizontally.  Apart from the first and last output
  * samples (copied from the first/last input), each output sample is
  * (3 * nearer input + further input + bias) >> 2, with bias 1 for the left
  * output and 2 for the right output of every input sample.
  886. */
  887. SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
  888. .set at
  889. beqz a0, 3f
  890. sll t0, a0, 2 /* t0 = max_v_samp_factor * 4 (bytes of row pointers) */
  891. lw s1, 0(a3) /* s1 = *output_data_ptr (current output row pointer slot) */
  892. li s3, 0x10001 /* s3 = 1 in both halfwords (bias for the left outputs) */
  893. addu s0, s1, t0 /* s0 = end of the output row pointer array */
  894. 0:
  895. addiu t8, a1, -2 /* t8 = downsampled_width - 2 */
  896. srl t9, t8, 2 /* t9 = number of 4-sample groups */
  897. lw t7, 0(a2) /* t7 = inptr */
  898. lw s2, 0(s1) /* s2 = outptr */
  899. lbu t0, 0(t7)
  900. lbu t1, 1(t7) /* t1 = inptr[1] */
  901. sll t2, t0, 1
  902. addu t2, t2, t0 /* t2 = invalue*3 */
  903. addu t2, t2, t1
  904. shra_r.w t2, t2, 2 /* t2 = (invalue*3 + inptr[1] + 2) >> 2 */
  905. sb t0, 0(s2) /* first output sample is the first input sample */
  906. sb t2, 1(s2)
  907. beqz t9, 11f
  908. addiu s2, 2
  909. 1:
  910. ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */
  911. ulw t1, 1(t7) /* t1 = |P4|P3|P2|P1| */
  912. ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */
  913. preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */
  914. preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */
  915. preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */
  916. preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */
  917. preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */
  918. shll.ph t5, t4, 1
  919. shll.ph t6, t1, 1
  920. addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */
  921. addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */
  922. addu.ph t4, t3, s3 /* t4 = |P3+1|P2+1| */
  923. addu.ph t0, t0, s3 /* t0 = |P1+1|P0+1| */
  924. addu.ph t4, t4, t5
  925. addu.ph t0, t0, t6
  926. shrl.ph t4, t4, 2 /* t4 = left outputs of P4 and P3 */
  927. shrl.ph t0, t0, 2 /* t0 = left outputs of P2 and P1 */
  928. addu.ph t2, t2, t5
  929. addu.ph t3, t3, t6
  930. shra_r.ph t2, t2, 2 /* t2 = right outputs of P4 and P3 (rounded) */
  931. shra_r.ph t3, t3, 2 /* t3 = right outputs of P2 and P1 (rounded) */
  932. shll.ph t2, t2, 8
  933. shll.ph t3, t3, 8
  934. or t2, t4, t2 /* interleave left/right outputs of P3, P4 */
  935. or t3, t3, t0 /* interleave left/right outputs of P1, P2 */
  936. addiu t9, -1
  937. usw t3, 0(s2)
  938. usw t2, 4(s2)
  939. addiu s2, 8
  940. bgtz t9, 1b
  941. addiu t7, 4
  942. 11:
  943. andi t8, 3 /* t8 = samples left over after the 4-sample groups */
  944. beqz t8, 22f
  945. addiu t7, 1
  946. 2:
  947. lbu t0, 0(t7)
  948. addiu t7, 1
  949. sll t1, t0, 1
  950. addu t2, t0, t1 /* t2 = invalue*3 */
  951. lbu t3, -2(t7) /* t3 = previous sample (t7 was already advanced) */
  952. lbu t4, 0(t7) /* t4 = next sample */
  953. addiu t3, 1
  954. addiu t4, 2
  955. addu t3, t3, t2
  956. addu t4, t4, t2
  957. srl t3, 2
  958. srl t4, 2
  959. sb t3, 0(s2)
  960. sb t4, 1(s2)
  961. addiu t8, -1
  962. bgtz t8, 2b
  963. addiu s2, 2
  964. 22:
  /* Last column */
  965. lbu t0, 0(t7)
  966. lbu t2, -1(t7)
  967. sll t1, t0, 1
  968. addu t1, t1, t0 /* t1 = invalue * 3 */
  969. addu t1, t1, t2
  970. addiu t1, 1
  971. srl t1, t1, 2
  972. sb t1, 0(s2)
  973. sb t0, 1(s2) /* last output sample is the last input sample */
  974. addiu s1, 4
  975. bne s1, s0, 0b
  976. addiu a2, 4
  977. 3:
  978. RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
  979. j ra
  980. nop
  981. END(jsimd_h2v1_fancy_upsample_dspr2)
  982. /*****************************************************************************/
  983. LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
  984. /*
  985. * a0 = cinfo->image_width
  986. * a1 = cinfo->max_v_samp_factor
  987. * a2 = compptr->v_samp_factor
  988. * a3 = compptr->width_in_blocks
  989. * 16(sp) = input_data
  990. * 20(sp) = output_data
  *
  * Averages horizontally adjacent sample pairs of each of the a2 rows, with
  * a rounding bias that alternates 0, 1, 0, 1, ... across output columns, and
  * then pads the output row on the right up to width_in_blocks * 8 samples
  * by replicating the last input sample.
  *
  * s2 holds the number of 2-byte padding stores per row.  It is computed
  * once and must stay intact for every row, so the padding loop counts down
  * in a per-row copy (t5) instead of in s2 itself.
  *
  * NOTE(review): the 4-output loop at label 1 is entered unconditionally, so
  * it runs once even when image_width / 2 < 4 -- confirm callers never pass
  * such widths.
  991. */
  992. .set at
  993. SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
  994. beqz a2, 7f
  995. lw s1, 44(sp) /* s1 = output_data */
  996. lw s0, 40(sp) /* s0 = input_data */
  997. srl s2, a0, 2
  998. andi t9, a0, 2
  999. srl t7, t9, 1
  1000. addu s2, t7, s2 /* s2 = image_width / 4 + ((image_width & 2) >> 1) */
  1001. sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
  1002. srl t7, t0, 1
  1003. subu s2, t7, s2 /* s2 = number of 2-byte padding stores per row */
  1004. 0:
  1005. andi t6, a0, 1 /* t6 = temp_index */
  1006. addiu t6, -1 /* t6 = 0 for odd widths, -1 for even widths */
  1007. lw t4, 0(s1) /* t4 = outptr */
  1008. lw t5, 0(s0) /* t5 = inptr0 */
  1009. li s3, 0 /* s3 = bias */
  1010. srl t7, a0, 1 /* t7 = image_width1 */
  1011. srl s4, t7, 2 /* s4 = number of 4-output groups */
  1012. andi t8, t7, 3 /* t8 = leftover outputs */
  1013. 1:
  1014. ulhu t0, 0(t5)
  1015. ulhu t1, 2(t5)
  1016. ulhu t2, 4(t5)
  1017. ulhu t3, 6(t5)
  1018. raddu.w.qb t0, t0 /* sum of the two bytes of each pair */
  1019. raddu.w.qb t1, t1
  1020. raddu.w.qb t2, t2
  1021. raddu.w.qb t3, t3
  1022. shra.ph t0, t0, 1 /* bias 0 */
  1023. shra_r.ph t1, t1, 1 /* bias 1 (rounding shift) */
  1024. shra.ph t2, t2, 1 /* bias 0 */
  1025. shra_r.ph t3, t3, 1 /* bias 1 (rounding shift) */
  1026. sb t0, 0(t4)
  1027. sb t1, 1(t4)
  1028. sb t2, 2(t4)
  1029. sb t3, 3(t4)
  1030. addiu s4, -1
  1031. addiu t4, 4
  1032. bgtz s4, 1b
  1033. addiu t5, 8
  1034. beqz t8, 3f
  1035. addu s4, t4, t8 /* s4 = end address of the leftover outputs */
  1036. 2:
  1037. ulhu t0, 0(t5)
  1038. raddu.w.qb t0, t0
  1039. addqh.w t0, t0, s3 /* t0 = (sum + bias) >> 1 */
  1040. xori s3, s3, 1 /* toggle bias */
  1041. sb t0, 0(t4)
  1042. addiu t4, 1
  1043. bne t4, s4, 2b
  1044. addiu t5, 2
  1045. 3:
  1046. lbux t1, t6(t5) /* t1 = last input sample of the row */
  1047. sll t1, 1
  1048. addqh.w t2, t1, s3 /* t2 = pixval1 */
  1049. xori s3, s3, 1
  1050. addqh.w t3, t1, s3 /* t3 = pixval2 */
  1051. blez s2, 5f
  1052. append t3, t2, 8 /* t3 = pixval2:pixval1 as one halfword */
  1053. move t5, s2 /* t5 = per-row padding counter (inptr0 is no longer needed) */
  1054. 4:
  1055. ush t3, 0(t4)
  1056. addiu t5, -1
  1057. bgtz t5, 4b
  1058. addiu t4, 2
  1059. 5:
  1060. beqz t9, 6f
  1061. nop
  1062. sb t2, 0(t4) /* one more padding byte when (image_width & 2) != 0 */
  1063. 6:
  1064. addiu s1, 4
  1065. addiu a2, -1
  1066. bnez a2, 0b
  1067. addiu s0, 4
  1068. 7:
  1069. RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
  1070. j ra
  1071. nop
  1072. END(jsimd_h2v1_downsample_dspr2)
  1073. /*****************************************************************************/
  1074. LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
  1075. /*
  1076. * a0 = cinfo->image_width
  1077. * a1 = cinfo->max_v_samp_factor
  1078. * a2 = compptr->v_samp_factor
  1079. * a3 = compptr->width_in_blocks
  1080. * 16(sp) = input_data
  1081. * 20(sp) = output_data
  *
  * Averages each 2x2 group of input samples into one output sample, with a
  * rounding bias that alternates 1, 2, 1, 2, ... across output columns, and
  * then pads the output row on the right up to width_in_blocks * 8 samples.
  *
  * The per-row counters s4, t8 and s2 are consumed by the loops below (s4
  * and s2 are counted down, t8 is turned into an end address), so the row
  * label 0 sits in front of their setup and they are recomputed for every
  * output row.
  *
  * NOTE(review): the 4-output loop at label 2 is entered unconditionally, so
  * it runs once even when image_width / 2 < 4 -- confirm callers never pass
  * such widths.
  1082. */
  1083. .set at
  1084. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  1085. beqz a2, 8f
  1086. lw s1, 52(sp) /* s1 = output_data */
  1087. lw s0, 48(sp) /* s0 = input_data */
  1088. 0:
  1089. andi t6, a0, 1 /* t6 = temp_index */
  1090. addiu t6, -1 /* t6 = 0 for odd widths, -1 for even widths */
  1091. srl t7, a0, 1 /* t7 = image_width1 */
  1092. srl s4, t7, 2 /* s4 = number of 4-output groups */
  1093. andi t8, t7, 3 /* t8 = leftover outputs */
  1094. andi t9, a0, 2
  1095. srl s2, a0, 2
  1096. srl t7, t9, 1
  1097. addu s2, t7, s2 /* s2 = image_width / 4 + ((image_width & 2) >> 1) */
  1098. sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
  1099. srl t7, t0, 1
  1100. subu s2, t7, s2 /* s2 = number of 2-byte padding stores for this row */
  1101. lw t4, 0(s1) /* t4 = outptr */
  1102. lw t5, 0(s0) /* t5 = inptr0 */
  1103. lw s7, 4(s0) /* s7 = inptr1 */
  1104. li s6, 1 /* s6 = bias */
  1105. 2:
  1106. ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */
  1107. ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */
  1108. ulw t2, 4(t5)
  1109. ulw t3, 4(s7)
  1110. precrq.ph.w t7, t0, t1 /* t7 = |P3|P2|Q3|Q2| */
  1111. ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */
  1112. raddu.w.qb t1, t7
  1113. raddu.w.qb t0, t0
  1114. shra_r.w t1, t1, 2 /* bias 2 (rounding shift) */
  1115. addiu t0, 1 /* bias 1 */
  1116. srl t0, 2
  1117. precrq.ph.w t7, t2, t3
  1118. ins t2, t3, 16, 16
  1119. raddu.w.qb t7, t7
  1120. raddu.w.qb t2, t2
  1121. shra_r.w t7, t7, 2 /* bias 2 (rounding shift) */
  1122. addiu t2, 1 /* bias 1 */
  1123. srl t2, 2
  1124. sb t0, 0(t4)
  1125. sb t1, 1(t4)
  1126. sb t2, 2(t4)
  1127. sb t7, 3(t4)
  1128. addiu t4, 4
  1129. addiu t5, 8
  1130. addiu s4, s4, -1
  1131. bgtz s4, 2b
  1132. addiu s7, 8
  1133. beqz t8, 4f
  1134. addu t8, t4, t8 /* t8 = end address of the leftover outputs */
  1135. 3:
  1136. ulhu t0, 0(t5)
  1137. ulhu t1, 0(s7)
  1138. ins t0, t1, 16, 16
  1139. raddu.w.qb t0, t0
  1140. addu t0, t0, s6
  1141. srl t0, 2
  1142. xori s6, s6, 3 /* toggle bias between 1 and 2 */
  1143. sb t0, 0(t4)
  1144. addiu t5, 2
  1145. addiu t4, 1
  1146. bne t8, t4, 3b
  1147. addiu s7, 2
  1148. 4:
  1149. lbux t1, t6(t5) /* last sample of the upper input row */
  1150. sll t1, 1
  1151. lbux t0, t6(s7) /* last sample of the lower input row */
  1152. sll t0, 1
  1153. addu t1, t1, t0
  1154. addu t3, t1, s6
  1155. srl t0, t3, 2 /* t0 = pixval1 */
  1156. xori s6, s6, 3
  1157. addu t2, t1, s6
  1158. srl t1, t2, 2 /* t1 = pixval2 */
  1159. blez s2, 6f
  1160. append t1, t0, 8 /* t1 = pixval2:pixval1 as one halfword */
  1161. 5:
  1162. ush t1, 0(t4)
  1163. addiu s2, -1
  1164. bgtz s2, 5b
  1165. addiu t4, 2
  1166. 6:
  1167. beqz t9, 7f
  1168. nop
  1169. sb t0, 0(t4) /* one more padding byte when (image_width & 2) != 0 */
  1170. 7:
  1171. addiu s1, 4
  1172. addiu a2, -1
  1173. bnez a2, 0b
  1174. addiu s0, 8 /* two input rows per output row */
  1175. 8:
  1176. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  1177. j ra
  1178. nop
  1179. END(jsimd_h2v2_downsample_dspr2)
  1180. /*****************************************************************************/
  1181. LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
  1182. /*
  1183. * a0 = input_data
  1184. * a1 = output_data
  1185. * a2 = compptr->v_samp_factor
  1186. * a3 = cinfo->max_v_samp_factor
  1187. * 16(sp) = cinfo->smoothing_factor
  1188. * 20(sp) = compptr->width_in_blocks
  1189. * 24(sp) = cinfo->image_width
  *
  * 2:1 horizontal and vertical downsampling with smoothing.  Each output
  * sample is
  *     (sum of its 2x2 input block) * t6 + (neighbor sum) * t7,
  * rounded and shifted right by 16, where the neighbor sum counts the 8
  * edge-adjacent samples twice and the 4 corner samples once.
  *
  * Steps:
  *   labels 0/1  pad rows input_data[-1 .. max_v_samp_factor] on the right
  *               up to 2 * output_cols samples with their last sample
  *   label 3     per output row: special-cased first column
  *   label 4     one column per iteration, (output_cols - 2) & 3 columns
  *   label 5     four columns per iteration for the remaining columns
  *   after 5     special-cased last column
  *
  * inrow (t5) advances by 2 per output row, matching the rows
  * inrow-1 .. inrow+2 that one output row reads.
  1190. */
  1191. .set at
  1192. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  1193. lw s7, 52(sp) /* compptr->width_in_blocks */
  1194. lw s0, 56(sp) /* cinfo->image_width */
  1195. lw s6, 48(sp) /* cinfo->smoothing_factor */
  1196. sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */
  1197. sll v0, s7, 1
  1198. subu v0, v0, s0 /* v0 = output_cols * 2 - image_width = columns to pad */
  1199. blez v0, 2f
  1200. move v1, zero
  1201. addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */
  1202. 0:
  1203. addiu t1, a0, -4 /* start from the row pointer in front of input_data[0] */
  1204. sll t2, v1, 2
  1205. lwx t1, t2(t1) /* t1 = input_data[v1 - 1] */
  1206. move t3, v0
  1207. addu t1, t1, s0 /* t1 = first column past image_width */
  1208. lbu t2, -1(t1) /* t2 = last real sample of the row */
  1209. 1:
  1210. addiu t3, t3, -1
  1211. sb t2, 0(t1)
  1212. bgtz t3, 1b
  1213. addiu t1, t1, 1
  1214. addiu v1, v1, 1
  1215. bne v1, t0, 0b
  1216. nop
  1217. 2:
  1218. li v0, 80
  1219. mul v0, s6, v0
  1220. li v1, 16384
  1221. move t4, zero /* t4 = outrow */
  1222. move t5, zero /* t5 = inrow */
  1223. subu t6, v1, v0 /* t6 = 16384 - tmp_smoot_f * 80 */
  1224. sll t7, s6, 4 /* t7 = tmp_smoot_f * 16 */
  1225. 3:
  1226. /* Special case for first column: pretend column -1 is same as column 0 */
  1227. sll v0, t4, 2
  1228. lwx t8, v0(a1) /* outptr = output_data[outrow] */
  1229. sll v1, t5, 2
  1230. addiu t9, v1, 4
  1231. addiu s0, v1, -4
  1232. addiu s1, v1, 8
  1233. lwx s2, v1(a0) /* inptr0 = input_data[inrow] */
  1234. lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */
  1235. lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */
  1236. lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */
  1237. lh v0, 0(s2)
  1238. lh v1, 0(t9)
  1239. lh t0, 0(s0)
  1240. lh t1, 0(s1)
  1241. ins v0, v1, 16, 16
  1242. ins t0, t1, 16, 16
  1243. raddu.w.qb t2, v0 /* t2 = sum of the 2x2 block */
  1244. raddu.w.qb s3, t0 /* s3 = the two samples above + the two samples below */
  1245. lbu v0, 0(s2)
  1246. lbu v1, 2(s2)
  1247. lbu t0, 0(t9)
  1248. lbu t1, 2(t9)
  1249. addu v0, v0, v1
  1250. mult $ac1, t2, t6
  1251. addu t0, t0, t1
  1252. lbu t2, 2(s0)
  1253. addu t0, t0, v0
  1254. lbu t3, 2(s1)
  1255. addu s3, t0, s3
  1256. lbu v0, 0(s0)
  1257. lbu t0, 0(s1)
  1258. sll s3, s3, 1 /* edge neighbors count twice */
  1259. addu v0, v0, t2
  1260. addu t0, t0, t3
  1261. addu t0, t0, v0
  1262. addu s3, t0, s3 /* s3 = neighbor sum (corners count once) */
  1263. madd $ac1, s3, t7
  1264. extr_r.w v0, $ac1, 16
  1265. addiu t8, t8, 1
  1266. addiu s2, s2, 2
  1267. addiu t9, t9, 2
  1268. addiu s0, s0, 2
  1269. addiu s1, s1, 2
  1270. sb v0, -1(t8)
  1271. addiu s4, s7, -2
  1272. and s4, s4, 3 /* s4 = (output_cols - 2) & 3 */
  1273. addu s5, s4, t8 /* end address */
  1274. 4:
  1275. lh v0, 0(s2)
  1276. lh v1, 0(t9)
  1277. lh t0, 0(s0)
  1278. lh t1, 0(s1)
  1279. ins v0, v1, 16, 16
  1280. ins t0, t1, 16, 16
  1281. raddu.w.qb t2, v0
  1282. raddu.w.qb s3, t0
  1283. lbu v0, -1(s2)
  1284. lbu v1, 2(s2)
  1285. lbu t0, -1(t9)
  1286. lbu t1, 2(t9)
  1287. addu v0, v0, v1
  1288. mult $ac1, t2, t6
  1289. addu t0, t0, t1
  1290. lbu t2, 2(s0)
  1291. addu t0, t0, v0
  1292. lbu t3, 2(s1)
  1293. addu s3, t0, s3
  1294. lbu v0, -1(s0)
  1295. lbu t0, -1(s1)
  1296. sll s3, s3, 1
  1297. addu v0, v0, t2
  1298. addu t0, t0, t3
  1299. addu t0, t0, v0
  1300. addu s3, t0, s3
  1301. madd $ac1, s3, t7
  1302. extr_r.w t2, $ac1, 16
  1303. addiu t8, t8, 1
  1304. addiu s2, s2, 2
  1305. addiu t9, t9, 2
  1306. addiu s0, s0, 2
  1307. sb t2, -1(t8)
  1308. bne s5, t8, 4b
  1309. addiu s1, s1, 2
  1310. addiu s5, s7, -2
  1311. subu s5, s5, s4
  1312. addu s5, s5, t8 /* end address */
  1313. 5:
  /* Four output columns per iteration; the loads for the next column are
     interleaved with the arithmetic of the current one. */
  1314. lh v0, 0(s2)
  1315. lh v1, 0(t9)
  1316. lh t0, 0(s0)
  1317. lh t1, 0(s1)
  1318. ins v0, v1, 16, 16
  1319. ins t0, t1, 16, 16
  1320. raddu.w.qb t2, v0
  1321. raddu.w.qb s3, t0
  1322. lbu v0, -1(s2)
  1323. lbu v1, 2(s2)
  1324. lbu t0, -1(t9)
  1325. lbu t1, 2(t9)
  1326. addu v0, v0, v1
  1327. mult $ac1, t2, t6
  1328. addu t0, t0, t1
  1329. lbu t2, 2(s0)
  1330. addu t0, t0, v0
  1331. lbu t3, 2(s1)
  1332. addu s3, t0, s3
  1333. lbu v0, -1(s0)
  1334. lbu t0, -1(s1)
  1335. sll s3, s3, 1
  1336. addu v0, v0, t2
  1337. addu t0, t0, t3
  1338. lh v1, 2(t9)
  1339. addu t0, t0, v0
  1340. lh v0, 2(s2)
  1341. addu s3, t0, s3
  1342. lh t0, 2(s0)
  1343. lh t1, 2(s1)
  1344. madd $ac1, s3, t7
  1345. extr_r.w t2, $ac1, 16
  1346. ins t0, t1, 16, 16
  1347. ins v0, v1, 16, 16
  1348. raddu.w.qb s3, t0
  1349. lbu v1, 4(s2)
  1350. lbu t0, 1(t9)
  1351. lbu t1, 4(t9)
  1352. sb t2, 0(t8)
  1353. raddu.w.qb t3, v0
  1354. lbu v0, 1(s2)
  1355. addu t0, t0, t1
  1356. mult $ac1, t3, t6
  1357. addu v0, v0, v1
  1358. lbu t2, 4(s0)
  1359. addu t0, t0, v0
  1360. lbu v0, 1(s0)
  1361. addu s3, t0, s3
  1362. lbu t0, 1(s1)
  1363. lbu t3, 4(s1)
  1364. addu v0, v0, t2
  1365. sll s3, s3, 1
  1366. addu t0, t0, t3
  1367. lh v1, 4(t9)
  1368. addu t0, t0, v0
  1369. lh v0, 4(s2)
  1370. addu s3, t0, s3
  1371. lh t0, 4(s0)
  1372. lh t1, 4(s1)
  1373. madd $ac1, s3, t7
  1374. extr_r.w t2, $ac1, 16
  1375. ins t0, t1, 16, 16
  1376. ins v0, v1, 16, 16
  1377. raddu.w.qb s3, t0
  1378. lbu v1, 6(s2)
  1379. lbu t0, 3(t9)
  1380. lbu t1, 6(t9)
  1381. sb t2, 1(t8)
  1382. raddu.w.qb t3, v0
  1383. lbu v0, 3(s2)
  1384. addu t0, t0, t1
  1385. mult $ac1, t3, t6
  1386. addu v0, v0, v1
  1387. lbu t2, 6(s0)
  1388. addu t0, t0, v0
  1389. lbu v0, 3(s0)
  1390. addu s3, t0, s3
  1391. lbu t0, 3(s1)
  1392. lbu t3, 6(s1)
  1393. addu v0, v0, t2
  1394. sll s3, s3, 1
  1395. addu t0, t0, t3
  1396. lh v1, 6(t9)
  1397. addu t0, t0, v0
  1398. lh v0, 6(s2)
  1399. addu s3, t0, s3
  1400. lh t0, 6(s0)
  1401. lh t1, 6(s1)
  1402. madd $ac1, s3, t7
  1403. extr_r.w t3, $ac1, 16
  1404. ins t0, t1, 16, 16
  1405. ins v0, v1, 16, 16
  1406. raddu.w.qb s3, t0
  1407. lbu v1, 8(s2)
  1408. lbu t0, 5(t9)
  1409. lbu t1, 8(t9)
  1410. sb t3, 2(t8)
  1411. raddu.w.qb t2, v0
  1412. lbu v0, 5(s2)
  1413. addu t0, t0, t1
  1414. mult $ac1, t2, t6
  1415. addu v0, v0, v1
  1416. lbu t2, 8(s0)
  1417. addu t0, t0, v0
  1418. lbu v0, 5(s0)
  1419. addu s3, t0, s3
  1420. lbu t0, 5(s1)
  1421. lbu t3, 8(s1)
  1422. addu v0, v0, t2
  1423. sll s3, s3, 1
  1424. addu t0, t0, t3
  1425. addiu t8, t8, 4
  1426. addu t0, t0, v0
  1427. addiu s2, s2, 8
  1428. addu s3, t0, s3
  1429. addiu t9, t9, 8
  1430. madd $ac1, s3, t7
  1431. extr_r.w t1, $ac1, 16
  1432. addiu s0, s0, 8
  1433. addiu s1, s1, 8
  1434. bne s5, t8, 5b
  1435. sb t1, -1(t8)
  1436. /* Special case for last column */
  1437. lh v0, 0(s2)
  1438. lh v1, 0(t9)
  1439. lh t0, 0(s0)
  1440. lh t1, 0(s1)
  1441. ins v0, v1, 16, 16
  1442. ins t0, t1, 16, 16
  1443. raddu.w.qb t2, v0
  1444. raddu.w.qb s3, t0
  1445. lbu v0, -1(s2)
  1446. lbu v1, 1(s2)
  1447. lbu t0, -1(t9)
  1448. lbu t1, 1(t9)
  1449. addu v0, v0, v1
  1450. mult $ac1, t2, t6
  1451. addu t0, t0, t1
  1452. lbu t2, 1(s0)
  1453. addu t0, t0, v0
  1454. lbu t3, 1(s1)
  1455. addu s3, t0, s3
  1456. lbu v0, -1(s0)
  1457. lbu t0, -1(s1)
  1458. sll s3, s3, 1
  1459. addu v0, v0, t2
  1460. addu t0, t0, t3
  1461. addu t0, t0, v0
  1462. addu s3, t0, s3
  1463. madd $ac1, s3, t7
  1464. extr_r.w t0, $ac1, 16
  1465. addiu t5, t5, 2 /* inrow += 2: each output row consumes two input rows */
  1466. sb t0, 0(t8)
  1467. addiu t4, t4, 1 /* outrow++ */
  1468. bne t4, a2, 3b
  1469. nop /* inrow was already advanced above; it must not be bumped again here */
  1470. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  1471. j ra
  1472. nop
  1473. END(jsimd_h2v2_smooth_downsample_dspr2)
  1474. /*****************************************************************************/
  1475. LEAF_DSPR2(jsimd_int_upsample_dspr2)
  1476. /*
  1477. * a0 = upsample->h_expand[compptr->component_index]
  1478. * a1 = upsample->v_expand[compptr->component_index]
  1479. * a2 = input_data
  1480. * a3 = output_data_ptr
  1481. * 16(sp) = cinfo->output_width
  1482. * 20(sp) = cinfo->max_v_samp_factor
  *
  * Replicates every input sample h_expand times horizontally (labels 1-3)
  * and then copies the finished output row to the following row(s) for
  * vertical expansion (labels 5-8).
  *
  * NOTE(review): the row bookkeeping looks correct only for the first pass
  * of the outer loop; see the notes at labels 0, 5 and 9 before relying on
  * this routine with more than one input row or v_expand > 2.
  1483. */
  1484. .set at
  1485. SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
  1486. lw s0, 0(a3) /* s0 = output_data */
  1487. lw s1, 32(sp) /* s1 = cinfo->output_width */
  1488. lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */
  1489. li t6, 0 /* t6 = inrow */
  1490. beqz s2, 10f
  1491. li s3, 0 /* s3 = outrow */
  1492. 0:
  1493. addu t0, a2, t6 /* NOTE(review): t6 steps by 1 but is used as a byte offset into the row pointer array -- presumably should step by 4; confirm */
  1494. addu t7, s0, s3 /* NOTE(review): s3 counts rows but is used as a byte offset, and s0 is modified at label 8; confirm */
  1495. lw t3, 0(t0) /* t3 = inptr */
  1496. lw t8, 0(t7) /* t8 = outptr */
  1497. beqz s1, 4f
  1498. addu t5, t8, s1 /* t5 = outend */
  1499. 1:
  1500. lb t2, 0(t3) /* t2 = invalue = *inptr++ */
  1501. addiu t3, 1
  1502. beqz a0, 3f
  1503. move t0, a0 /* t0 = h_expand */
  1504. 2:
  1505. sb t2, 0(t8) /* only the low byte is stored, so the sign extension of lb is harmless */
  1506. addiu t0, -1
  1507. bgtz t0, 2b
  1508. addiu t8, 1
  1509. 3:
  1510. bgt t5, t8, 1b /* assembler macro; relies on the .set at above */
  1511. nop
  1512. 4:
  1513. addiu t9, a1, -1 /* t9 = v_expand - 1 */
  1514. blez t9, 9f
  1515. nop
  1516. 5:
  1517. lw t3, 0(s0) /* t3 = source row for the vertical copy */
  1518. lw t4, 4(s0) /* t4 = destination row (the next row pointer) */
  1519. subu t0, s1, 0xF
  1520. blez t0, 7f /* fewer than 16 bytes: byte loop only */
  1521. addu t5, t3, s1 /* t5 = end address */
  1522. andi t7, s1, 0xF /* t7 = residual */
  1523. subu t8, t5, t7 /* t8 = end address of the 16-byte copies */
  1524. 6:
  1525. ulw t0, 0(t3)
  1526. ulw t1, 4(t3)
  1527. ulw t2, 8(t3)
  1528. usw t0, 0(t4)
  1529. ulw t0, 12(t3)
  1530. usw t1, 4(t4)
  1531. usw t2, 8(t4)
  1532. usw t0, 12(t4)
  1533. addiu t3, 16
  1534. bne t3, t8, 6b
  1535. addiu t4, 16
  1536. beqz t7, 8f
  1537. nop
  1538. 7:
  1539. lbu t0, 0(t3)
  1540. sb t0, 0(t4)
  1541. addiu t3, 1
  1542. bne t3, t5, 7b
  1543. addiu t4, 1
  1544. 8:
  1545. addiu t9, -1
  1546. bgtz t9, 5b
  1547. addiu s0, 8 /* NOTE(review): advances the base by two row pointers per copy and is never undone; with v_expand > 2 the next copy reads a row that was not filled -- confirm intent */
  1548. 9:
  1549. addu s3, s3, a1 /* outrow += v_expand */
  1550. bne s3, s2, 0b
  1551. addiu t6, 1 /* inrow++ (see the note at label 0) */
  1552. 10:
  1553. RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
  1554. j ra
  1555. nop
  1556. END(jsimd_int_upsample_dspr2)
  1557. /*****************************************************************************/
  1558. LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
  1559. /*
  1560. * a0 = cinfo->max_v_samp_factor
  1561. * a1 = cinfo->output_width
  1562. * a2 = input_data
  1563. * a3 = output_data_ptr
  *
  * Doubles every input sample horizontally for max_v_samp_factor rows.
  * The main loop turns 8 input bytes into 16 output bytes; the remaining
  * output_width & 0xf bytes are produced one input sample (two output
  * bytes) at a time.  Only temporary registers are used, so nothing is
  * saved on the stack.
  1564. */
  1565. lw t7, 0(a3) /* t7 = output_data */
  1566. andi t8, a1, 0xf /* t8 = residual */
  1567. sll t0, a0, 2
  1568. blez a0, 4f
  1569. addu t9, t7, t0 /* t9 = output_data end address */
  1570. 0:
  1571. lw t5, 0(t7) /* t5 = outptr */
  1572. lw t6, 0(a2) /* t6 = inptr */
  1573. addu t3, t5, a1 /* t3 = outptr + output_width (end address) */
  1574. subu t3, t8 /* t3 = end address - residual */
  1575. beq t5, t3, 2f /* output_width < 16: residual loop only */
  1576. move t4, t8
  1577. 1:
  1578. ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */
  1579. ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */
  1580. srl t1, t0, 16 /* t1 = |X|X|P3|P2| */
  1581. ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */
  1582. ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */
  1583. ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */
  1584. ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */
  1585. usw t0, 0(t5)
  1586. usw t1, 4(t5)
  1587. srl t0, t2, 16 /* t0 = |X|X|P7|P6| */
  1588. ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */
  1589. ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */
  1590. ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */
  1591. ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */
  1592. usw t2, 8(t5)
  1593. usw t0, 12(t5)
  1594. addiu t5, 16
  1595. bne t5, t3, 1b
  1596. addiu t6, 8
  1597. beqz t8, 3f
  1598. move t4, t8 /* t4 = residual output bytes still to write */
  1599. 2:
  1600. lbu t1, 0(t6)
  1601. sb t1, 0(t5)
  1602. sb t1, 1(t5)
  1603. addiu t4, -2 /* two output bytes per input sample */
  1604. addiu t6, 1
  1605. bgtz t4, 2b
  1606. addiu t5, 2
  1607. 3:
  1608. addiu t7, 4 /* next output row pointer */
  1609. bne t9, t7, 0b
  1610. addiu a2, 4 /* next input row pointer */
  1611. 4:
  1612. j ra
  1613. nop
  1614. END(jsimd_h2v1_upsample_dspr2)
  1615. /*****************************************************************************/
  1616. LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
  1617. /*
  * 2:1 horizontal and 2:1 vertical upsampling.  For every input row the
  * samples are doubled horizontally into one output row, and that output
  * row is then copied into the following output row.  The row counter a0 is
  * decremented by 2 per input row.
  * Only t0-t9, a0 and a2 are modified; no stack frame is used.
  *
  * NOTE(review): the doubling residual loop stores two bytes per step, so an
  * odd residual writes one byte past output_width in the first row of each
  * pair.  The byte-copy loop at label 5 is a do-while that ends on pointer
  * equality, so it only terminates as intended for output_width > 0.
  * Presumably callers guarantee padded rows and a non-zero width - confirm.
  *
  1618. * a0 = cinfo->max_v_samp_factor
  1619. * a1 = cinfo->output_width
  1620. * a2 = input_data
  1621. * a3 = output_data_ptr
  1622. */
  1623. lw t7, 0(a3) /* t7 = output_data (array of output row pointers) */
  1624. blez a0, 7f /* no rows: return */
  1625. andi t9, a1, 0xf /* t9 = residual */
  1626. 0: /* per-input-row loop */
  1627. lw t6, 0(a2) /* t6 = inptr */
  1628. lw t5, 0(t7) /* t5 = outptr */
  1629. addu t8, t5, a1 /* t8 = outptr end address */
  1630. subu t8, t9 /* t8 = end address - residual */
  1631. beq t5, t8, 2f /* output_width == residual: no full 16-byte group */
  1632. move t4, t9 /* (delay slot) t4 = residual byte counter */
  1633. 1: /* main loop: 8 input samples -> 16 output bytes per iteration */
  1634. ulw t0, 0(t6) /* first four input samples */
  1635. srl t1, t0, 16 /* t1 = upper two samples in its low half */
  1636. ins t0, t0, 16, 16 /* replicate the low halfword into the high halfword */
  1637. ins t0, t0, 8, 16 /* spread so that each of the two samples appears twice */
  1638. ins t1, t1, 16, 16
  1639. ins t1, t1, 8, 16
  1640. ulw t2, 4(t6) /* next four input samples */
  1641. usw t0, 0(t5)
  1642. usw t1, 4(t5)
  1643. srl t3, t2, 16
  1644. ins t2, t2, 16, 16
  1645. ins t2, t2, 8, 16
  1646. ins t3, t3, 16, 16
  1647. ins t3, t3, 8, 16
  1648. usw t2, 8(t5)
  1649. usw t3, 12(t5)
  1650. addiu t5, 16 /* outptr += 16 */
  1651. bne t5, t8, 1b /* repeat until the last full 16-byte group is written */
  1652. addiu t6, 8 /* (delay slot) inptr += 8 */
  1653. beqz t9, 3f /* no residual: go duplicate the row */
  1654. move t4, t9 /* (delay slot) t4 = residual byte counter */
  1655. 2: /* residual loop: one input sample -> two output bytes */
  1656. lbu t0, 0(t6)
  1657. sb t0, 0(t5)
  1658. sb t0, 1(t5)
  1659. addiu t4, -2 /* two output bytes consumed from the residual */
  1660. addiu t6, 1 /* inptr++ */
  1661. bgtz t4, 2b /* loop while residual bytes remain */
  1662. addiu t5, 2 /* (delay slot) outptr += 2 */
  1663. 3: /* copy the row just written into the next output row */
  1664. lw t6, 0(t7) /* t6 = outptr[0] */
  1665. lw t5, 4(t7) /* t5 = outptr[1] */
  1666. addu t4, t6, a1 /* t4 = new end address */
  1667. beq a1, t9, 5f /* output_width == residual: byte copy only */
  1668. subu t8, t4, t9 /* (delay slot) t8 = end address - residual */
  1669. 4: /* copy 16 bytes per iteration with unaligned word accesses */
  1670. ulw t0, 0(t6)
  1671. ulw t1, 4(t6)
  1672. ulw t2, 8(t6)
  1673. usw t0, 0(t5)
  1674. ulw t0, 12(t6)
  1675. usw t1, 4(t5)
  1676. usw t2, 8(t5)
  1677. usw t0, 12(t5)
  1678. addiu t6, 16 /* source += 16 */
  1679. bne t6, t8, 4b /* repeat until the last full 16-byte group is copied */
  1680. addiu t5, 16 /* (delay slot) destination += 16 */
  1681. beqz t9, 6f /* no residual: row pair finished */
  1682. nop /* (delay slot) */
  1683. 5: /* copy the remaining bytes one at a time up to t4 */
  1684. lbu t0, 0(t6)
  1685. sb t0, 0(t5)
  1686. addiu t6, 1 /* source++ */
  1687. bne t6, t4, 5b /* loop until the source reaches the end address */
  1688. addiu t5, 1 /* (delay slot) destination++ */
  1689. 6: /* advance to the next pair of output rows */
  1690. addiu t7, 8 /* skip two output row pointers */
  1691. addiu a0, -2 /* two output rows produced */
  1692. bgtz a0, 0b /* loop while rows remain */
  1693. addiu a2, 4 /* (delay slot) next input row pointer */
  1694. 7:
  1695. j ra
  1696. nop /* (delay slot) */
  1697. END(jsimd_h2v2_upsample_dspr2)
  1698. /*****************************************************************************/
  1699. LEAF_DSPR2(jsimd_idct_islow_dspr2)
  1700. /*
  * Slow-but-accurate integer inverse DCT of one 8x8 block, in two passes.
  *
  * Pass 1 (label 1) walks the 8 columns of coef_block, dequantizes with the
  * table at a1 and writes 32-bit results into a 64-word workspace reserved
  * on the stack (column step 4 bytes, row stride 32 bytes).
  * Pass 2 (label 4) walks the 8 rows of the workspace, looks every result
  * up in the range_limit byte table and stores 8 bytes through the row
  * pointer read from the output array (a2 advances by 4 per row).
  *
  * Both passes take a shortcut when all AC terms of the column/row are zero.
  * a0, a1, a2, v0, v1 and t0-t9 are modified; s0-s7 are saved and restored
  * by the SAVE/RESTORE macros (defined outside this section).
  *
  1701. * a0 = coef_block
  1702. * a1 = compptr->dcttable
  1703. * a2 = output
  1704. * a3 = range_limit
  1705. */
  1706. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  1707. addiu sp, sp, -256 /* reserve the 64-word (256-byte) workspace */
  1708. move v0, sp /* v0 = workspace write pointer */
  1709. addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */
  1710. 1: /* pass 1: one column per iteration */
  1711. lh s4, 32(a0) /* s4 = inptr[16] */
  1712. lh s5, 64(a0) /* s5 = inptr[32] */
  1713. lh s6, 96(a0) /* s6 = inptr[48] */
  1714. lh t1, 112(a0) /* t1 = inptr[56] */
  1715. lh t7, 16(a0) /* t7 = inptr[8] */
  1716. lh t5, 80(a0) /* t5 = inptr[40] */
  1717. lh t3, 48(a0) /* t3 = inptr[24] */
  1718. or s4, s4, t1 /* OR all seven AC terms of the column together */
  1719. or s4, s4, t3
  1720. or s4, s4, t5
  1721. or s4, s4, t7
  1722. or s4, s4, s5
  1723. or s4, s4, s6
  1724. bnez s4, 2f /* some AC term is non-zero: full column computation */
  1725. addiu v1, v1, -1 /* (delay slot, runs on both paths) column counter-- */
  1726. lh s5, 0(a1) /* quantptr[DCTSIZE*0] */
  1727. lh s6, 0(a0) /* inptr[DCTSIZE*0] */
  1728. mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */
  1729. sll s5, s5, 2 /* dcval = dequantized DC << 2 */
  1730. sw s5, 0(v0) /* store dcval in all 8 rows of this workspace column */
  1731. sw s5, 32(v0)
  1732. sw s5, 64(v0)
  1733. sw s5, 96(v0)
  1734. sw s5, 128(v0)
  1735. sw s5, 160(v0)
  1736. sw s5, 192(v0)
  1737. b 3f /* skip the full computation */
  1738. sw s5, 224(v0) /* (delay slot) last of the 8 stores */
  1739. 2: /* full column: odd part first (rows 7, 3, 5, 1) */
  1740. lh t0, 112(a1)
  1741. lh t2, 48(a1)
  1742. lh t4, 80(a1)
  1743. lh t6, 16(a1)
  1744. mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7],
  1745. quantptr[DCTSIZE*7]) */
  1746. mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3],
  1747. quantptr[DCTSIZE*3]) */
  1748. mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5],
  1749. quantptr[DCTSIZE*5]) */
  1750. mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1],
  1751. quantptr[DCTSIZE*1]) */
  1752. lh t4, 32(a1) /* preload rows 2 and 6 (quantizer and coefficient) */
  1753. lh t5, 32(a0)
  1754. lh t6, 96(a1)
  1755. lh t7, 96(a0)
  1756. addu s0, t0, t1 /* z3 = tmp0 + tmp2 */
  1757. addu s1, t1, t2 /* z2 = tmp1 + tmp2 */
  1758. addu s2, t2, t3 /* z4 = tmp1 + tmp3 */
  1759. addu s3, s0, s2 /* z3 + z4 */
  1760. addiu t9, zero, 9633 /* FIX_1_175875602 */
  1761. mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
  1762. addu t8, t0, t3 /* z1 = tmp0 + tmp3 */
  1763. addiu t9, zero, 2446 /* FIX_0_298631336 */
  1764. mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
  1765. addiu t9, zero, 16819 /* FIX_2_053119869 */
  1766. mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
  1767. addiu t9, zero, 25172 /* FIX_3_072711026 */
  1768. mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
  1769. addiu t9, zero, 12299 /* FIX_1_501321110 */
  1770. mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
  1771. addiu t9, zero, 16069 /* FIX_1_961570560 */
  1772. mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
  1773. addiu t9, zero, 3196 /* FIX_0_390180644 */
  1774. mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
  1775. addiu t9, zero, 7373 /* FIX_0_899976223 */
  1776. mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
  1777. addiu t9, zero, 20995 /* FIX_2_562915447 */
  1778. mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
  1779. subu s0, s3, s0 /* z3 += z5 */
  1780. addu t0, t0, s0 /* tmp0 += z3 */
  1781. addu t1, t1, s0 /* tmp2 += z3 */
  1782. subu s2, s3, s2 /* z4 += z5 */
  1783. addu t2, t2, s2 /* tmp1 += z4 */
  1784. addu t3, t3, s2 /* tmp3 += z4 */
  1785. subu t0, t0, t8 /* tmp0 += z1 */
  1786. subu t1, t1, s1 /* tmp2 += z2 */
  1787. subu t2, t2, s1 /* tmp1 += z2 */
  1788. subu t3, t3, t8 /* tmp3 += z1 */
  1789. mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2],
  1790. quantptr[DCTSIZE*2]) */
  1791. addiu t9, zero, 6270 /* FIX_0_765366865 */
  1792. mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6],
  1793. quantptr[DCTSIZE*6]) */
  1794. lh t4, 0(a1) /* load rows 0 and 4 (quantizer and coefficient) */
  1795. lh t5, 0(a0)
  1796. lh t6, 64(a1)
  1797. lh t7, 64(a0)
  1798. mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */
  1799. mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0],
  1800. quantptr[DCTSIZE*0]) */
  1801. mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4],
  1802. quantptr[DCTSIZE*4]) */
  1803. addiu t9, zero, 4433 /* FIX_0_541196100 */
  1804. addu s3, s0, s1 /* z2 + z3 */
  1805. mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
  1806. addiu t9, zero, 15137 /* FIX_1_847759065 */
  1807. mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */
  1808. addu t4, t5, t6 /* row 0 + row 4 */
  1809. subu t5, t5, t6 /* row 0 - row 4 */
  1810. sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */
  1811. sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */
  1812. addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
  1813. subu t6, s3, t8 /* tmp2 =
  1814. z1 + MULTIPLY(z3, -FIX_1_847759065) */
  1815. addu s0, t4, t7 /* s0 = tmp0 + tmp3 */
  1816. subu s1, t4, t7 /* s1 = tmp0 - tmp3 */
  1817. addu s2, t5, t6 /* s2 = tmp1 + tmp2 */
  1818. subu s3, t5, t6 /* s3 = tmp1 - tmp2 */
  1819. addu t4, s0, t3 /* combine even and odd parts: sums go to rows 0-3 ... */
  1820. subu s0, s0, t3 /* ... and differences to rows 7-4 */
  1821. addu t3, s2, t1
  1822. subu s2, s2, t1
  1823. addu t1, s3, t2
  1824. subu s3, s3, t2
  1825. addu t2, s1, t0
  1826. subu s1, s1, t0
  1827. shra_r.w t4, t4, 11 /* rounding arithmetic right shift by 11 on all eight results */
  1828. shra_r.w t3, t3, 11
  1829. shra_r.w t1, t1, 11
  1830. shra_r.w t2, t2, 11
  1831. shra_r.w s1, s1, 11
  1832. shra_r.w s3, s3, 11
  1833. shra_r.w s2, s2, 11
  1834. shra_r.w s0, s0, 11
  1835. sw t4, 0(v0) /* workspace rows 0..7 of this column (row stride 32 bytes) */
  1836. sw t3, 32(v0)
  1837. sw t1, 64(v0)
  1838. sw t2, 96(v0)
  1839. sw s1, 128(v0)
  1840. sw s3, 160(v0)
  1841. sw s2, 192(v0)
  1842. sw s0, 224(v0)
  1843. 3: /* advance to the next column */
  1844. addiu a1, a1, 2 /* next quantizer column (16-bit entries) */
  1845. addiu a0, a0, 2 /* next coefficient column (16-bit entries) */
  1846. bgtz v1, 1b /* loop over all 8 columns */
  1847. addiu v0, v0, 4 /* (delay slot) next workspace column (32-bit entries) */
  1848. move v0, sp /* rewind v0 to the start of the workspace */
  1849. addiu v1, zero, 8 /* v1 = row counter */
  1850. 4: /* pass 2: one workspace row per iteration */
  1851. lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */
  1852. lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */
  1853. lw t2, 0(v0) /* (JLONG)wsptr[0] */
  1854. lw t3, 16(v0) /* (JLONG)wsptr[4] */
  1855. lw s4, 4(v0) /* (JLONG)wsptr[1] */
  1856. lw s5, 12(v0) /* (JLONG)wsptr[3] */
  1857. lw s6, 20(v0) /* (JLONG)wsptr[5] */
  1858. lw s7, 28(v0) /* (JLONG)wsptr[7] */
  1859. or s4, s4, t0 /* OR wsptr[1..7] together (s4 is clobbered here) */
  1860. or s4, s4, t1
  1861. or s4, s4, t3
  1862. or s4, s4, s7
  1863. or s4, s4, s5
  1864. or s4, s4, s6
  1865. bnez s4, 5f /* some AC term is non-zero: full row computation */
  1866. addiu v1, v1, -1 /* (delay slot, runs on both paths) row counter-- */
  1867. shra_r.w s5, t2, 5 /* rounding right shift of wsptr[0] by 5 */
  1868. andi s5, s5, 0x3ff /* keep a 10-bit table index */
  1869. lbux s5, s5(a3) /* s5 = range_limit[index] */
  1870. lw s1, 0(a2) /* s1 = output row pointer */
  1871. replv.qb s5, s5 /* replicate the byte into all four byte lanes */
  1872. usw s5, 0(s1) /* fill the 8 output samples with the same value */
  1873. usw s5, 4(s1)
  1874. b 6f
  1875. nop /* (delay slot) */
  1876. 5: /* full row: even part */
  1877. addu t4, t0, t1 /* z2 + z3 */
  1878. addiu t8, zero, 4433 /* FIX_0_541196100 */
  1879. mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
  1880. addiu t8, zero, 15137 /* FIX_1_847759065 */
  1881. mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */
  1882. addiu t8, zero, 6270 /* FIX_0_765366865 */
  1883. mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */
  1884. addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
  1885. subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
  1886. sll t4, t4, 13 /* tmp0 =
  1887. (wsptr[0] + wsptr[4]) << CONST_BITS */
  1888. sll t2, t2, 13 /* tmp1 =
  1889. (wsptr[0] - wsptr[4]) << CONST_BITS */
  1890. subu t1, t5, t1 /* tmp2 =
  1891. z1 + MULTIPLY(z3, -FIX_1_847759065) */
  1892. subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */
  1893. addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */
  1894. addu t5, t5, t0 /* tmp3 =
  1895. z1 + MULTIPLY(z2, FIX_0_765366865) */
  1896. subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */
  1897. addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */
  1898. lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */
  1899. lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */
  1900. lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */
  1901. lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */
  1902. addu s0, t4, t6 /* z3 = tmp0 + tmp2 */
  1903. addiu t8, zero, 9633 /* FIX_1_175875602 */
  1904. addu s1, t5, t7 /* z4 = tmp1 + tmp3 */
  1905. addu s2, s0, s1 /* z3 + z4 */
  1906. mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
  1907. addu s3, t4, t7 /* z1 = tmp0 + tmp3 */
  1908. addu t9, t5, t6 /* z2 = tmp1 + tmp2 */
  1909. addiu t8, zero, 16069 /* FIX_1_961570560 */
  1910. mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
  1911. addiu t8, zero, 3196 /* FIX_0_390180644 */
  1912. mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
  1913. addiu t8, zero, 2446 /* FIX_0_298631336 */
  1914. mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
  1915. addiu t8, zero, 7373 /* FIX_0_899976223 */
  1916. mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
  1917. addiu t8, zero, 16819 /* FIX_2_053119869 */
  1918. mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
  1919. addiu t8, zero, 20995 /* FIX_2_562915447 */
  1920. mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
  1921. addiu t8, zero, 25172 /* FIX_3_072711026 */
  1922. mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
  1923. addiu t8, zero, 12299 /* FIX_1_501321110 */
  1924. mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
  1925. subu s0, s2, s0 /* z3 += z5 */
  1926. subu s1, s2, s1 /* z4 += z5 */
  1927. addu t4, t4, s0
  1928. subu t4, t4, s3 /* tmp0 */
  1929. addu t5, t5, s1
  1930. subu t5, t5, t9 /* tmp1 */
  1931. addu t6, t6, s0
  1932. subu t6, t6, t9 /* tmp2 */
  1933. addu t7, t7, s1
  1934. subu t7, t7, s3 /* tmp3 */
  1935. addu s0, t0, t7 /* output 0 = tmp10 + tmp3 */
  1936. subu t0, t0, t7 /* output 7 = tmp10 - tmp3 */
  1937. addu t7, t2, t6 /* output 1 = tmp11 + tmp2 */
  1938. subu t2, t2, t6 /* output 6 = tmp11 - tmp2 */
  1939. addu t6, t3, t5 /* output 2 = tmp12 + tmp1 */
  1940. subu t3, t3, t5 /* output 5 = tmp12 - tmp1 */
  1941. addu t5, t1, t4 /* output 3 = tmp13 + tmp0 */
  1942. subu t1, t1, t4 /* output 4 = tmp13 - tmp0 */
  1943. shra_r.w s0, s0, 18 /* rounding arithmetic right shift by 18 on all eight results */
  1944. shra_r.w t7, t7, 18
  1945. shra_r.w t6, t6, 18
  1946. shra_r.w t5, t5, 18
  1947. shra_r.w t1, t1, 18
  1948. shra_r.w t3, t3, 18
  1949. shra_r.w t2, t2, 18
  1950. shra_r.w t0, t0, 18
  1951. andi s0, s0, 0x3ff /* reduce each result to a 10-bit table index */
  1952. andi t7, t7, 0x3ff
  1953. andi t6, t6, 0x3ff
  1954. andi t5, t5, 0x3ff
  1955. andi t1, t1, 0x3ff
  1956. andi t3, t3, 0x3ff
  1957. andi t2, t2, 0x3ff
  1958. andi t0, t0, 0x3ff
  1959. lw s1, 0(a2) /* s1 = output row pointer */
  1960. lbux s0, s0(a3) /* look every index up in the range_limit table */
  1961. lbux t7, t7(a3)
  1962. lbux t6, t6(a3)
  1963. lbux t5, t5(a3)
  1964. lbux t1, t1(a3)
  1965. lbux t3, t3(a3)
  1966. lbux t2, t2(a3)
  1967. lbux t0, t0(a3)
  1968. sb s0, 0(s1) /* store the 8 output samples of this row */
  1969. sb t7, 1(s1)
  1970. sb t6, 2(s1)
  1971. sb t5, 3(s1)
  1972. sb t1, 4(s1)
  1973. sb t3, 5(s1)
  1974. sb t2, 6(s1)
  1975. sb t0, 7(s1)
  1976. 6: /* advance to the next row */
  1977. addiu v0, v0, 32 /* next workspace row (8 words) */
  1978. bgtz v1, 4b /* loop over all 8 rows */
  1979. addiu a2, a2, 4 /* (delay slot) next output row pointer */
  1980. addiu sp, sp, 256 /* release the workspace */
  1981. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  1982. j ra
  1983. nop /* (delay slot) */
  1984. END(jsimd_idct_islow_dspr2)
  1985. /*****************************************************************************/
  1986. LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
  1987. /*
  * Column pass of the fast integer inverse DCT.
  *
  * Each loop iteration loads one 32-bit word from each of the 8 rows, i.e. a
  * pair of adjacent 16-bit coefficients, and processes both halves in
  * parallel with paired-halfword DSP instructions.  The loop runs until a0
  * has advanced by 16 bytes (4 iterations of 4 bytes).  Row stride is 16
  * bytes for inptr, quantptr and wsptr.
  *
  * Dequantization multiplies the left and right halfwords separately
  * (muleq_s.w.phl / muleq_s.w.phr) and re-packs them with ins.
  * When every AC word of the pair of columns is zero, the dequantized DC
  * pair is simply stored to all 8 workspace rows.
  *
  * a0, a1, a2, v0, v1, AT and t0-t9 are modified; s0-s7 are saved and
  * restored by the SAVE/RESTORE macros (defined outside this section).
  *
  1988. * a0 = inptr
  1989. * a1 = quantptr
  1990. * a2 = wsptr
  1991. * a3 = mips_idct_ifast_coefs
  1992. */
  1993. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  1994. addiu t9, a0, 16 /* end address */
  1995. or AT, a3, zero /* AT = a3 (coefficient table base) */
  1996. 0: /* one pair of columns per iteration */
  1997. lw s0, 0(a1) /* quantptr[DCTSIZE*0] */
  1998. lw t0, 0(a0) /* inptr[DCTSIZE*0] */
  1999. lw t1, 16(a0) /* inptr[DCTSIZE*1] */
  2000. muleq_s.w.phl v0, t0, s0 /* tmp0 ... */
  2001. lw t2, 32(a0) /* inptr[DCTSIZE*2] */
  2002. lw t3, 48(a0) /* inptr[DCTSIZE*3] */
  2003. lw t4, 64(a0) /* inptr[DCTSIZE*4] */
  2004. lw t5, 80(a0) /* inptr[DCTSIZE*5] */
  2005. muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */
  2006. lw t6, 96(a0) /* inptr[DCTSIZE*6] */
  2007. lw t7, 112(a0) /* inptr[DCTSIZE*7] */
  2008. or s4, t1, t2 /* s4 = rows 1|2 */
  2009. or s5, t3, t4 /* s5 = rows 3|4 */
  2010. bnez s4, 1f /* AC present: full computation */
  2011. ins t0, v0, 16, 16 /* ... tmp0 */
  2012. bnez s5, 1f /* AC present: full computation */
  2013. or s6, t5, t6 /* (delay slot) s6 = rows 5|6 */
  2014. or s6, s6, t7 /* s6 |= row 7 */
  2015. bnez s6, 1f /* AC present: full computation */
  2016. sw t0, 0(a2) /* wsptr[DCTSIZE*0] */
  2017. sw t0, 16(a2) /* wsptr[DCTSIZE*1] */
  2018. sw t0, 32(a2) /* wsptr[DCTSIZE*2] */
  2019. sw t0, 48(a2) /* wsptr[DCTSIZE*3] */
  2020. sw t0, 64(a2) /* wsptr[DCTSIZE*4] */
  2021. sw t0, 80(a2) /* wsptr[DCTSIZE*5] */
  2022. sw t0, 96(a2) /* wsptr[DCTSIZE*6] */
  2023. sw t0, 112(a2) /* wsptr[DCTSIZE*7] */
  2024. addiu a0, a0, 4 /* next pair of input columns */
  2025. b 2f
  2026. addiu a1, a1, 4 /* (delay slot) next pair of quantizer columns */
  2027. 1: /* full computation; the store to 0(a2) made in the delay slot above is overwritten below */
  2028. lw s1, 32(a1) /* quantptr[DCTSIZE*2] */
  2029. lw s2, 64(a1) /* quantptr[DCTSIZE*4] */
  2030. muleq_s.w.phl v0, t2, s1 /* tmp1 ... */
  2031. muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */
  2032. lw s0, 16(a1) /* quantptr[DCTSIZE*1] */
  2033. lw s1, 48(a1) /* quantptr[DCTSIZE*3] */
  2034. lw s3, 96(a1) /* quantptr[DCTSIZE*6] */
  2035. muleq_s.w.phl v1, t4, s2 /* tmp2 ... */
  2036. muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */
  2037. lw s2, 80(a1) /* quantptr[DCTSIZE*5] */
  2038. lw t8, 4(AT) /* FIX(1.414213562) */
  2039. ins t2, v0, 16, 16 /* ... tmp1 */
  2040. muleq_s.w.phl v0, t6, s3 /* tmp3 ... */
  2041. muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */
  2042. ins t4, v1, 16, 16 /* ... tmp2 */
  2043. addq.ph s4, t0, t4 /* tmp10 */
  2044. subq.ph s5, t0, t4 /* tmp11 */
  2045. ins t6, v0, 16, 16 /* ... tmp3 */
  2046. subq.ph s6, t2, t6 /* tmp12 ... */
  2047. addq.ph s7, t2, t6 /* tmp13 */
  2048. mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
  2049. addq.ph t0, s4, s7 /* tmp0 */
  2050. subq.ph t6, s4, s7 /* tmp3 */
  2051. muleq_s.w.phl v0, t1, s0 /* tmp4 ... */
  2052. muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */
  2053. shll_s.ph s6, s6, 1 /* x2 */
  2054. lw s3, 112(a1) /* quantptr[DCTSIZE*7] */
  2055. subq.ph s6, s6, s7 /* ... tmp12 */
  2056. muleq_s.w.phl v1, t7, s3 /* tmp7 ... */
  2057. muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */
  2058. ins t1, v0, 16, 16 /* ... tmp4 */
  2059. addq.ph t2, s5, s6 /* tmp1 */
  2060. subq.ph t4, s5, s6 /* tmp2 */
  2061. muleq_s.w.phl v0, t5, s2 /* tmp6 ... */
  2062. muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */
  2063. ins t7, v1, 16, 16 /* ... tmp7 */
  2064. addq.ph s5, t1, t7 /* z11 */
  2065. subq.ph s6, t1, t7 /* z12 */
  2066. muleq_s.w.phl v1, t3, s1 /* tmp5 ... */
  2067. muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */
  2068. ins t5, v0, 16, 16 /* ... tmp6 */
  2069. ins t3, v1, 16, 16 /* ... tmp5 */
  2070. addq.ph s7, t5, t3 /* z13 */
  2071. subq.ph v0, t5, t3 /* z10 */
  2072. addq.ph t7, s5, s7 /* tmp7 */
  2073. subq.ph s5, s5, s7 /* tmp11 ... */
  2074. addq.ph v1, v0, s6 /* z5 ... */
  2075. mulq_s.ph s5, s5, t8 /* ... tmp11 */
  2076. lw t8, 8(AT) /* FIX(1.847759065) */
  2077. lw s4, 0(AT) /* FIX(1.082392200) */
  2078. addq.ph s0, t0, t7 /* tmp0 + tmp7 */
  2079. subq.ph s1, t0, t7 /* tmp0 - tmp7 */
  2080. mulq_s.ph v1, v1, t8 /* ... z5 */
  2081. shll_s.ph s5, s5, 1 /* x2 */
  2082. lw t8, 12(AT) /* FIX(-2.613125930) */
  2083. sw s0, 0(a2) /* wsptr[DCTSIZE*0] */
  2084. shll_s.ph v0, v0, 1 /* x4 */
  2085. mulq_s.ph v0, v0, t8 /* tmp12 ... */
  2086. mulq_s.ph s4, s6, s4 /* tmp10 ... */
  2087. shll_s.ph v1, v1, 1 /* x2 */
  2088. addiu a0, a0, 4 /* next pair of input columns */
  2089. addiu a1, a1, 4 /* next pair of quantizer columns */
  2090. sw s1, 112(a2) /* wsptr[DCTSIZE*7] */
  2091. shll_s.ph s6, v0, 1 /* x4 */
  2092. shll_s.ph s4, s4, 1 /* x2 */
  2093. addq.ph s6, s6, v1 /* ... tmp12 */
  2094. subq.ph t5, s6, t7 /* tmp6 */
  2095. subq.ph s4, s4, v1 /* ... tmp10 */
  2096. subq.ph t3, s5, t5 /* tmp5 */
  2097. addq.ph s2, t2, t5 /* tmp1 + tmp6 */
  2098. addq.ph t1, s4, t3 /* tmp4 */
  2099. subq.ph s3, t2, t5 /* tmp1 - tmp6 */
  2100. sw s2, 16(a2) /* wsptr[DCTSIZE*1] */
  2101. sw s3, 96(a2) /* wsptr[DCTSIZE*6] */
  2102. addq.ph v0, t4, t3 /* tmp2 + tmp5 */
  2103. subq.ph v1, t4, t3 /* tmp2 - tmp5 */
  2104. sw v0, 32(a2) /* wsptr[DCTSIZE*2] */
  2105. sw v1, 80(a2) /* wsptr[DCTSIZE*5] */
  2106. addq.ph v0, t6, t1 /* tmp3 + tmp4 */
  2107. subq.ph v1, t6, t1 /* tmp3 - tmp4 */
  2108. sw v0, 64(a2) /* wsptr[DCTSIZE*4] */
  2109. sw v1, 48(a2) /* wsptr[DCTSIZE*3] */
  2110. 2: /* common loop tail for both paths */
  2111. bne a0, t9, 0b /* loop until a0 reaches the end address */
  2112. addiu a2, a2, 4 /* (delay slot) next pair of workspace columns */
  2113. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  2114. j ra
  2115. nop /* (delay slot) */
  2116. END(jsimd_idct_ifast_cols_dspr2)
  2117. /*****************************************************************************/
  2118. LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
  2119. /*
  * Row pass of the fast integer inverse DCT.
  *
  * Each loop iteration consumes two consecutive workspace rows (32 bytes)
  * and two output row pointers (8 bytes); the loop ends when a0 has advanced
  * by 128 bytes.  Elements of the two rows are interleaved into
  * paired-halfword registers (lower-case letters = first row, upper-case =
  * second row in the existing comments) so both rows are computed at once.
  * Results are narrowed to bytes, 0x80 is added to every byte (s8 =
  * 0x80808080; presumably the CENTERJSAMPLE level shift - confirm) and 8
  * bytes are stored at output_buf[row] + output_col for each row.
  *
  * a3 is saved on the stack by SAVE_REGS_ON_STACK because a3 and AT are
  * reused as output pointers; the coefficient table pointer is reloaded into
  * AT from 36(sp) at the top of every iteration.
  *
  * a0, a1, v0, v1, AT and t0-t9 are modified; s0-s8 and a3 are saved and
  * restored by the SAVE/RESTORE macros (defined outside this section).
  *
  2120. * a0 = wsptr
  2121. * a1 = output_buf
  2122. * a2 = output_col
  2123. * a3 = mips_idct_ifast_coefs
  2124. */
  2125. SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
  2126. addiu t9, a0, 128 /* end address */
  2127. lui s8, 0x8080
  2128. ori s8, s8, 0x8080 /* s8 = 0x80808080, added to every output byte */
  2129. 0: /* two rows per iteration */
  2130. lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */
  2131. lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */
  2132. lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */
  2133. lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */
  2134. lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */
  2135. lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */
  2136. lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */
  2137. lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */
  2138. lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */
  2139. precrq.ph.w t1, s0, t0 /* B b */
  2140. ins t0, s0, 16, 16 /* A a */
  2141. bnez t1, 1f /* element 1 of either row non-zero: full computation */
  2142. or s0, t2, s2 /* (delay slot) elements 2/3 of both rows */
  2143. bnez s0, 1f
  2144. or s0, t4, s4 /* (delay slot) elements 4/5 of both rows */
  2145. bnez s0, 1f
  2146. or s0, t6, s6 /* (delay slot) elements 6/7 of both rows */
  2147. bnez s0, 1f
  2148. shll_s.ph s0, t0, 2 /* A a */
  2149. lw a3, 0(a1) /* a3 = output row pointer for the first row */
  2150. lw AT, 4(a1) /* AT = output row pointer for the second row */
  2151. precrq.ph.w t0, s0, s0 /* A A */
  2152. ins s0, s0, 16, 16 /* a a */
  2153. addu a3, a3, a2 /* add output_col */
  2154. addu AT, AT, a2 /* add output_col */
  2155. precrq.qb.ph t0, t0, t0 /* A A A A */
  2156. precrq.qb.ph s0, s0, s0 /* a a a a */
  2157. addu.qb s0, s0, s8 /* add 0x80 to every byte */
  2158. addu.qb t0, t0, s8 /* add 0x80 to every byte */
  2159. sw s0, 0(a3) /* first row: 8 identical samples */
  2160. sw s0, 4(a3)
  2161. sw t0, 0(AT) /* second row: 8 identical samples */
  2162. sw t0, 4(AT)
  2163. addiu a0, a0, 32 /* next pair of workspace rows */
  2164. bne a0, t9, 0b /* loop until the end address */
  2165. addiu a1, a1, 8 /* (delay slot) next pair of output row pointers */
  2166. b 2f /* all rows done: go to the epilogue */
  2167. nop /* (delay slot) */
  2168. 1: /* full computation; s0 was overwritten by the delay slots above and is recomputed below */
  2169. precrq.ph.w t3, s2, t2 /* element 3 of both rows */
  2170. ins t2, s2, 16, 16 /* element 2 of both rows */
  2171. precrq.ph.w t5, s4, t4 /* element 5 of both rows */
  2172. ins t4, s4, 16, 16 /* element 4 of both rows */
  2173. precrq.ph.w t7, s6, t6 /* element 7 of both rows */
  2174. ins t6, s6, 16, 16 /* element 6 of both rows */
  2175. lw t8, 4(AT) /* FIX(1.414213562) */
  2176. addq.ph s4, t0, t4 /* tmp10 */
  2177. subq.ph s5, t0, t4 /* tmp11 */
  2178. subq.ph s6, t2, t6 /* tmp12 ... */
  2179. addq.ph s7, t2, t6 /* tmp13 */
  2180. mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
  2181. addq.ph t0, s4, s7 /* tmp0 */
  2182. subq.ph t6, s4, s7 /* tmp3 */
  2183. shll_s.ph s6, s6, 1 /* x2 */
  2184. subq.ph s6, s6, s7 /* ... tmp12 */
  2185. addq.ph t2, s5, s6 /* tmp1 */
  2186. subq.ph t4, s5, s6 /* tmp2 */
  2187. addq.ph s5, t1, t7 /* z11 */
  2188. subq.ph s6, t1, t7 /* z12 */
  2189. addq.ph s7, t5, t3 /* z13 */
  2190. subq.ph v0, t5, t3 /* z10 */
  2191. addq.ph t7, s5, s7 /* tmp7 */
  2192. subq.ph s5, s5, s7 /* tmp11 ... */
  2193. addq.ph v1, v0, s6 /* z5 ... */
  2194. mulq_s.ph s5, s5, t8 /* ... tmp11 */
  2195. lw t8, 8(AT) /* FIX(1.847759065) */
  2196. lw s4, 0(AT) /* FIX(1.082392200) */
  2197. addq.ph s0, t0, t7 /* tmp0 + tmp7 */
  2198. subq.ph s7, t0, t7 /* tmp0 - tmp7 */
  2199. mulq_s.ph v1, v1, t8 /* ... z5 */
  2200. lw a3, 0(a1) /* a3 = output row pointer for the first row */
  2201. lw t8, 12(AT) /* FIX(-2.613125930) */
  2202. shll_s.ph s5, s5, 1 /* x2 */
  2203. addu a3, a3, a2 /* add output_col */
  2204. shll_s.ph v0, v0, 1 /* x4 */
  2205. mulq_s.ph v0, v0, t8 /* tmp12 ... */
  2206. mulq_s.ph s4, s6, s4 /* tmp10 ... */
  2207. shll_s.ph v1, v1, 1 /* x2 */
  2208. addiu a0, a0, 32 /* next pair of workspace rows */
  2209. addiu a1, a1, 8 /* next pair of output row pointers */
  2210. shll_s.ph s6, v0, 1 /* x4 */
  2211. shll_s.ph s4, s4, 1 /* x2 */
  2212. addq.ph s6, s6, v1 /* ... tmp12 */
  2213. shll_s.ph s0, s0, 2 /* scale output 0 by 4 (saturating) */
  2214. subq.ph t5, s6, t7 /* tmp6 */
  2215. subq.ph s4, s4, v1 /* ... tmp10 */
  2216. subq.ph t3, s5, t5 /* tmp5 */
  2217. shll_s.ph s7, s7, 2 /* scale output 7 by 4 (saturating) */
  2218. addq.ph t1, s4, t3 /* tmp4 */
  2219. addq.ph s1, t2, t5 /* tmp1 + tmp6 */
  2220. subq.ph s6, t2, t5 /* tmp1 - tmp6 */
  2221. addq.ph s2, t4, t3 /* tmp2 + tmp5 */
  2222. subq.ph s5, t4, t3 /* tmp2 - tmp5 */
  2223. addq.ph s4, t6, t1 /* tmp3 + tmp4 */
  2224. subq.ph s3, t6, t1 /* tmp3 - tmp4 */
  2225. shll_s.ph s1, s1, 2 /* scale outputs 1..6 by 4 (saturating) */
  2226. shll_s.ph s2, s2, 2
  2227. shll_s.ph s3, s3, 2
  2228. shll_s.ph s4, s4, 2
  2229. shll_s.ph s5, s5, 2
  2230. shll_s.ph s6, s6, 2
  2231. precrq.ph.w t0, s1, s0 /* B A */
  2232. ins s0, s1, 16, 16 /* b a */
  2233. precrq.ph.w t2, s3, s2 /* D C */
  2234. ins s2, s3, 16, 16 /* d c */
  2235. precrq.ph.w t4, s5, s4 /* F E */
  2236. ins s4, s5, 16, 16 /* f e */
  2237. precrq.ph.w t6, s7, s6 /* H G */
  2238. ins s6, s7, 16, 16 /* h g */
  2239. precrq.qb.ph t0, t2, t0 /* D C B A */
  2240. precrq.qb.ph s0, s2, s0 /* d c b a */
  2241. precrq.qb.ph t4, t6, t4 /* H G F E */
  2242. precrq.qb.ph s4, s6, s4 /* h g f e */
  2243. addu.qb s0, s0, s8 /* add 0x80 to every byte */
  2244. addu.qb s4, s4, s8 /* add 0x80 to every byte */
  2245. sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */
  2246. sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */
  2247. lw a3, -4(a1) /* second row pointer (a1 was already advanced by 8) */
  2248. addu.qb t0, t0, s8 /* add 0x80 to every byte */
  2249. addu a3, a3, a2 /* add output_col */
  2250. addu.qb t4, t4, s8 /* add 0x80 to every byte */
  2251. sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */
  2252. bne a0, t9, 0b /* loop until the end address */
  2253. sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */
  2254. 2: /* epilogue */
  2255. RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
  2256. j ra
  2257. nop /* (delay slot) */
  2258. END(jsimd_idct_ifast_rows_dspr2)
  2259. /*****************************************************************************/
  2260. LEAF_DSPR2(jsimd_fdct_islow_dspr2)
  2261. /*
  * Slow-but-accurate integer forward DCT of one 8x8 block of 16-bit
  * samples, computed in place in two passes.
  *
  * Pass 1 (label 1) processes the 8 rows (16 bytes each).  Samples are
  * handled as packed halfword pairs and the odd outputs are formed with
  * dot-product accumulations (dpa.w.ph) against constant pairs held in
  * t0-t9, then extracted with a rounding shift of 11.
  * Pass 2 (label 2) processes the 8 columns (stride 16 bytes, step 2 bytes)
  * with scalar mult/madd/msub on the four accumulators and a rounding shift
  * of 15 (2 for the two outputs formed by plain add/sub).
  *
  * a0-a3, v0, v1 and t0-t9 are modified, as are accumulators ac0-ac3;
  * s0-s8 are saved and restored by the SAVE/RESTORE macros (defined outside
  * this section).
  *
  2262. * a0 = data
  2263. */
  2264. SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
  2265. lui t0, 6437
  2266. ori t0, 2260 /* t0 = 6437 | 2260 (high | low halfword) */
  2267. lui t1, 9633
  2268. ori t1, 11363 /* t1 = 9633 | 11363 */
  2269. lui t2, 0xd39e
  2270. ori t2, 0xe6dc /* t2 = -11362 | -6436 */
  2271. lui t3, 0xf72d
  2272. ori t3, 9633 /* t3 = -2259 | 9633 */
  2273. lui t4, 2261
  2274. ori t4, 9633 /* t4 = 2261 | 9633 */
  2275. lui t5, 0xd39e
  2276. ori t5, 6437 /* t5 = -11362 | 6437 */
  2277. lui t6, 9633
  2278. ori t6, 0xd39d /* t6 = 9633 | -11363 */
  2279. lui t7, 0xe6dc
  2280. ori t7, 2260 /* t7 = -6436 | 2260 */
  2281. lui t8, 4433
  2282. ori t8, 10703 /* t8 = 4433 | 10703 */
  2283. lui t9, 0xd630
  2284. ori t9, 4433 /* t9 = -10704 | 4433 */
  2285. li s8, 8 /* s8 = row counter */
  2286. move a1, a0 /* a1 = current row pointer */
  2287. 1: /* pass 1: one row per iteration */
  2288. lw s0, 0(a1) /* tmp0 = 1|0 */
  2289. lw s1, 4(a1) /* tmp1 = 3|2 */
  2290. lw s2, 8(a1) /* tmp2 = 5|4 */
  2291. lw s3, 12(a1) /* tmp3 = 7|6 */
  2292. packrl.ph s1, s1, s1 /* tmp1 = 2|3 */
  2293. packrl.ph s3, s3, s3 /* tmp3 = 6|7 */
  2294. subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */
  2295. subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */
  2296. mult $0, $0 /* ac0 = 0 */
  2297. dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */
  2298. dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */
  2299. mult $ac1, $0, $0 /* ac1 = 0 */
  2300. dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */
  2301. dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */
  2302. mult $ac2, $0, $0 /* ac2 = 0 */
  2303. dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */
  2304. dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */
  2305. mult $ac3, $0, $0 /* ac3 = 0 */
  2306. dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */
  2307. dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */
  2308. addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */
  2309. addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */
  2310. extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
  2311. extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
  2312. extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */
  2313. extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */
  2314. addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
  2315. subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
  2316. sh s0, 2(a1) /* row element 1 */
  2317. sh s1, 6(a1) /* row element 3 */
  2318. sh s2, 10(a1) /* row element 5 */
  2319. sh s3, 14(a1) /* row element 7 */
  2320. mult $0, $0 /* ac0 = 0 */
  2321. dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */
  2322. mult $ac1, $0, $0 /* ac1 = 0 */
  2323. dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */
  2324. sra s4, s5, 16 /* tmp4 = t11 */
  2325. addiu a1, a1, 16 /* advance to the next row; stores below use negative offsets */
  2326. addiu s8, s8, -1 /* row counter-- */
  2327. extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
  2328. extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
  2329. addu s2, s5, s4 /* tmp2 = t10 + t11 */
  2330. subu s3, s5, s4 /* tmp3 = t10 - t11 */
  2331. sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */
  2332. sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */
  2333. sh s2, -16(a1) /* row element 0 (only the low halfword is stored) */
  2334. sh s3, -8(a1) /* row element 4 */
  2335. sh s0, -12(a1) /* row element 2 */
  2336. bgtz s8, 1b /* loop over all 8 rows */
  2337. sh s1, -4(a1) /* (delay slot) row element 6 */
  2338. li t0, 2260 /* c0 */
  2339. li t1, 11363 /* c1 */
  2340. li t2, 9633 /* c2 */
  2341. li t3, 6436 /* c3 */
  2342. li t4, 6437 /* c4 */
  2343. li t5, 2261 /* c5 */
  2344. li t6, 11362 /* c6 */
  2345. li t7, 2259 /* c7 */
  2346. li t8, 4433 /* c8 */
  2347. li t9, 10703 /* c9 */
  2348. li a1, 10704 /* c10 */
  2349. li s8, 8 /* s8 = column counter */
  2350. 2: /* pass 2: one column per iteration */
  2351. lh a2, 0(a0) /* 0 */
  2352. lh a3, 16(a0) /* 8 */
  2353. lh v0, 32(a0) /* 16 */
  2354. lh v1, 48(a0) /* 24 */
  2355. lh s4, 64(a0) /* 32 */
  2356. lh s5, 80(a0) /* 40 */
  2357. lh s6, 96(a0) /* 48 */
  2358. lh s7, 112(a0) /* 56 */
  2359. addu s2, v0, s5 /* tmp2 = 16 + 40 */
  2360. subu s5, v0, s5 /* tmp5 = 16 - 40 */
  2361. addu s3, v1, s4 /* tmp3 = 24 + 32 */
  2362. subu s4, v1, s4 /* tmp4 = 24 - 32 */
  2363. addu s0, a2, s7 /* tmp0 = 0 + 56 */
  2364. subu s7, a2, s7 /* tmp7 = 0 - 56 */
  2365. addu s1, a3, s6 /* tmp1 = 8 + 48 */
  2366. subu s6, a3, s6 /* tmp6 = 8 - 48 */
  2367. addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */
  2368. subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */
  2369. addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */
  2370. subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */
  2371. mult s7, t1 /* ac0 = tmp7 * c1 */
  2372. madd s4, t0 /* ac0 += tmp4 * c0 */
  2373. madd s5, t4 /* ac0 += tmp5 * c4 */
  2374. madd s6, t2 /* ac0 += tmp6 * c2 */
  2375. mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */
  2376. msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */
  2377. msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */
  2378. msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */
  2379. mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */
  2380. madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */
  2381. madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */
  2382. msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */
  2383. mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */
  2384. msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */
  2385. madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */
  2386. msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */
  2387. extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */
  2388. extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */
  2389. extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */
  2390. extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */
  2391. addiu s8, s8, -1 /* column counter-- */
  2392. addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */
  2393. subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */
  2394. sh s0, 16(a0) /* column element in row 1 */
  2395. sh s1, 48(a0) /* column element in row 3 */
  2396. sh s2, 80(a0) /* column element in row 5 */
  2397. sh s3, 112(a0) /* column element in row 7 */
  2398. mult v0, t8 /* ac0 = tmp12 * c8 */
  2399. madd v1, t9 /* ac0 += tmp13 * c9 */
  2400. mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */
  2401. msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */
  2402. addiu a0, a0, 2 /* advance to the next column; stores below are offset by -2 */
  2403. extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */
  2404. extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */
  2405. shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */
  2406. shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */
  2407. sh s4, -2(a0) /* column element in row 0 */
  2408. sh s5, 62(a0) /* column element in row 4 */
  2409. sh s6, 30(a0) /* column element in row 2 */
  2410. bgtz s8, 2b /* loop over all 8 columns */
  2411. sh s7, 94(a0) /* (delay slot) column element in row 6 */
  2412. RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
  2413. jr ra
  2414. nop /* (delay slot) */
  2415. END(jsimd_fdct_islow_dspr2)
  2416. /**************************************************************************/
  2417. LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
  2418. /*
  2419. * a0 = data
  2420. */
  2421. .set at
  2422. SAVE_REGS_ON_STACK 8, s0, s1
  2423. li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) |
  2424. (334 & 0xffff) */
  2425. li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) |
  2426. (139 & 0xffff) */
  2427. li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) |
  2428. (98 & 0xffff) */
  2429. li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) |
  2430. (181 & 0xffff) */
  2431. move v0, a0
  2432. addiu v1, v0, 128 /* end address */
  2433. 0:
  2434. lw t0, 0(v0) /* tmp0 = 1|0 */
  2435. lw t1, 4(v0) /* tmp1 = 3|2 */
  2436. lw t2, 8(v0) /* tmp2 = 5|4 */
  2437. lw t3, 12(v0) /* tmp3 = 7|6 */
  2438. packrl.ph t1, t1, t1 /* tmp1 = 2|3 */
  2439. packrl.ph t3, t3, t3 /* tmp3 = 6|7 */
  2440. subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */
  2441. subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */
  2442. addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */
  2443. addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */
  2444. addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
  2445. subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
  2446. sra t4, t8, 16 /* tmp4 = t11 */
  2447. mult $0, $0 /* ac0 = 0 */
  2448. dpa.w.ph $ac0, t9, s1
  2449. mult $ac1, $0, $0 /* ac1 = 0 */
  2450. dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */
  2451. dpsx.w.ph $ac1, t5, a3 /* ac1 += t6*98 + t7*98 */
  2452. mult $ac2, $0, $0 /* ac2 = 0 */
  2453. dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */
  2454. mult $ac3, $0, $0 /* ac3 = 0 */
  2455. dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */
  2456. precrq.ph.w t0, t5, t7 /* t0 = t5|t6 */
  2457. addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */
  2458. subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */
  2459. extr.w t4, $ac0, 8
  2460. mult $0, $0 /* ac0 = 0 */
  2461. dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */
  2462. extr.w t0, $ac1, 8 /* t0 = z5 */
  2463. extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */
  2464. extr.w t7, $ac3, 8 /* t2 = MULTIPLY(tmp12, 334) */
  2465. extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */
  2466. add t6, t1, t0 /* t6 = z2 */
  2467. add t7, t7, t0 /* t7 = z4 */
  2468. subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */
  2469. addq.ph t8, t5, t8 /* t9 = z11 = tmp7 + z3 */
  2470. addq.ph t1, t0, t6 /* t1 = z13 + z2 */
  2471. subq.ph t6, t0, t6 /* t6 = z13 - z2 */
  2472. addq.ph t0, t8, t7 /* t0 = z11 + z4 */
  2473. subq.ph t7, t8, t7 /* t7 = z11 - z4 */
  2474. addq.ph t5, t4, t9
  2475. subq.ph t4, t9, t4
  2476. sh t2, 0(v0)
  2477. sh t5, 4(v0)
  2478. sh t3, 8(v0)
  2479. sh t4, 12(v0)
  2480. sh t1, 10(v0)
  2481. sh t6, 6(v0)
  2482. sh t0, 2(v0)
  2483. sh t7, 14(v0)
  2484. addiu v0, 16
  2485. bne v1, v0, 0b
  2486. nop
  2487. move v0, a0
  2488. addiu v1, v0, 16
  2489. 1:
  2490. lh t0, 0(v0) /* 0 */
  2491. lh t1, 16(v0) /* 8 */
  2492. lh t2, 32(v0) /* 16 */
  2493. lh t3, 48(v0) /* 24 */
  2494. lh t4, 64(v0) /* 32 */
  2495. lh t5, 80(v0) /* 40 */
  2496. lh t6, 96(v0) /* 48 */
  2497. lh t7, 112(v0) /* 56 */
  2498. add t8, t0, t7 /* t8 = tmp0 */
  2499. sub t7, t0, t7 /* t7 = tmp7 */
  2500. add t0, t1, t6 /* t0 = tmp1 */
  2501. sub t1, t1, t6 /* t1 = tmp6 */
  2502. add t6, t2, t5 /* t6 = tmp2 */
  2503. sub t5, t2, t5 /* t5 = tmp5 */
  2504. add t2, t3, t4 /* t2 = tmp3 */
  2505. sub t3, t3, t4 /* t3 = tmp4 */
  2506. add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */
  2507. sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */
  2508. sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */
  2509. ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */
  2510. add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */
  2511. mult $0, $0 /* ac0 = 0 */
  2512. dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */
  2513. add s0, t4, t2 /* t8 = tmp10+tmp11 */
  2514. sub t4, t4, t2 /* t4 = tmp10-tmp11 */
  2515. sh s0, 0(v0)
  2516. sh t4, 64(v0)
  2517. extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13,
  2518. FIX_0_707106781) */
  2519. addq.ph t4, t8, t2 /* t9 = tmp13 + z1 */
  2520. subq.ph t8, t8, t2 /* t2 = tmp13 - z1 */
  2521. sh t4, 32(v0)
  2522. sh t8, 96(v0)
  2523. add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */
  2524. add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */
  2525. add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */
  2526. andi t4, a1, 0xffff
  2527. mul s0, t1, t4
  2528. sra s0, s0, 8 /* s0 = z4 =
  2529. MULTIPLY(tmp12, FIX_1_306562965) */
  2530. ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */
  2531. mult $0, $0 /* ac0 = 0 */
  2532. mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */
  2533. extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12,
  2534. FIX_0_382683433) */
  2535. add t2, t7, t8 /* t2 = tmp7 + z5 */
  2536. sub t7, t7, t8 /* t7 = tmp7 - z5 */
  2537. andi t4, a2, 0xffff
  2538. mul t8, t3, t4
  2539. sra t8, t8, 8 /* t8 = z2 =
  2540. MULTIPLY(tmp10, FIX_0_541196100) */
  2541. andi t4, s1, 0xffff
  2542. mul t6, t0, t4
  2543. sra t6, t6, 8 /* t6 = z3 =
  2544. MULTIPLY(tmp11, FIX_0_707106781) */
  2545. add t0, t6, t8 /* t0 = z3 + z2 */
  2546. sub t1, t6, t8 /* t1 = z3 - z2 */
  2547. add t3, t6, s0 /* t3 = z3 + z4 */
  2548. sub t4, t6, s0 /* t4 = z3 - z4 */
  2549. sub t5, t2, t1 /* t5 = dataptr[5] */
  2550. sub t6, t7, t0 /* t6 = dataptr[3] */
  2551. add t3, t2, t3 /* t3 = dataptr[1] */
  2552. add t4, t7, t4 /* t4 = dataptr[7] */
  2553. sh t5, 80(v0)
  2554. sh t6, 48(v0)
  2555. sh t3, 16(v0)
  2556. sh t4, 112(v0)
  2557. addiu v0, 2
  2558. bne v0, v1, 1b
  2559. nop
  2560. RESTORE_REGS_FROM_STACK 8, s0, s1
  2561. j ra
  2562. nop
  2563. END(jsimd_fdct_ifast_dspr2)
  2564. /*****************************************************************************/
  2565. LEAF_DSPR2(jsimd_quantize_dspr2)
  2566. /*
  2567. * a0 = coef_block
  2568. * a1 = divisors
  2569. * a2 = workspace
  *
  * Quantizes the 64 halfwords of workspace into coef_block, two samples per
  * pass through the code below (31 loop iterations plus one unrolled tail).
  * For the sample x at halfword index i the instructions compute:
  *   sign = ((x >> 15) << 1) + 1              (+1 or -1)
  *   t    = ((x * sign + d[i + 64]) & 0xffff) * (d[i] & 0xffff)
  *   out  = (t >> (d[i + 192] + 16)) * sign   (arithmetic shift, srav)
  * where d[] is the divisors array viewed as halfwords.
  * NOTE(review): d[i], d[i + 64] and d[i + 192] are presumably the
  * reciprocal, correction and shift tables; confirm against the code that
  * builds the divisors.
  2570. */
  2571. .set at
  2572. SAVE_REGS_ON_STACK 16, s0, s1, s2
  /* Loop exit address: 31 pairs * 4 bytes; the 32nd pair is handled after
     the loop. */
  2573. addiu v0, a2, 124 /* v0 = workspace_end */
  /* Prologue: preload the operands of the first pair. */
  2574. lh t0, 0(a2) /* x0 = workspace[0] */
  2575. lh t1, 0(a1) /* d[0] */
  2576. lh t2, 128(a1) /* d[64] */
  2577. sra t3, t0, 15 /* 0 if x0 >= 0, -1 if x0 < 0 */
  2578. sll t3, t3, 1
  2579. addiu t3, t3, 1 /* t3 = sign of x0 (+1 or -1) */
  2580. mul t0, t0, t3 /* t0 = abs(x0) */
  2581. lh t4, 384(a1) /* d[192] */
  2582. lh t5, 130(a1) /* d[65] */
  2583. lh t6, 2(a2) /* x1 = workspace[1] */
  2584. lh t7, 2(a1) /* d[1] */
  2585. lh t8, 386(a1) /* d[193] */
  2586. 1:
  2587. andi t1, 0xffff /* treat d[i] as unsigned 16 bits */
  2588. add t9, t0, t2 /* abs(x0) + d[i + 64] */
  2589. andi t9, 0xffff
  2590. mul v1, t9, t1
  2591. sra s0, t6, 15
  2592. sll s0, s0, 1
  2593. addiu s0, s0, 1 /* s0 = sign of x1 (+1 or -1) */
  2594. addiu t9, t4, 16 /* shift amount = d[i + 192] + 16 */
  2595. srav v1, v1, t9
  2596. mul v1, v1, t3 /* restore the sign of x0 */
  2597. mul t6, t6, s0 /* t6 = abs(x1) */
  2598. andi t7, 0xffff
  2599. addiu a2, a2, 4 /* advance workspace by one pair */
  2600. addiu a1, a1, 4 /* advance divisors by one pair */
  2601. add s1, t6, t5 /* abs(x1) + d[i + 65] */
  2602. andi s1, 0xffff
  2603. sh v1, 0(a0) /* coef_block[i] */
  2604. mul s2, s1, t7
  2605. addiu s1, t8, 16 /* shift amount = d[i + 193] + 16 */
  2606. srav s2, s2, s1
  2607. mul s2, s2, s0 /* restore the sign of x1 */
  /* Preload the operands of the next pair (a1 and a2 already advanced). */
  2608. lh t0, 0(a2)
  2609. lh t1, 0(a1)
  2610. sra t3, t0, 15
  2611. sll t3, t3, 1
  2612. addiu t3, t3, 1
  2613. mul t0, t0, t3
  2614. lh t2, 128(a1)
  2615. lh t4, 384(a1)
  2616. lh t5, 130(a1)
  2617. lh t8, 386(a1)
  2618. lh t6, 2(a2)
  2619. lh t7, 2(a1)
  2620. sh s2, 2(a0) /* coef_block[i + 1] */
  /* NOTE(review): the next five instructions repeat the load / sign / abs
     sequence issued just above. The only intervening memory write is the
     sh to 2(a0), so the result is the same unless coef_block overlaps
     workspace; confirm whether the reload is intentional. */
  2621. lh t0, 0(a2)
  2622. sra t3, t0, 15
  2623. sll t3, t3, 1
  2624. addiu t3, t3, 1
  2625. mul t0, t0, t3
  2626. bne a2, v0, 1b
  2627. addiu a0, a0, 4 /* presumably a branch delay slot (noreorder directive not visible here): the tail stores rely on a0 having advanced */
  /* Tail: quantize the last preloaded pair. */
  2628. andi t1, 0xffff
  2629. add t9, t0, t2
  2630. andi t9, 0xffff
  2631. mul v1, t9, t1
  2632. sra s0, t6, 15
  2633. sll s0, s0, 1
  2634. addiu s0, s0, 1
  2635. addiu t9, t4, 16
  2636. srav v1, v1, t9
  2637. mul v1, v1, t3
  2638. mul t6, t6, s0
  2639. andi t7, 0xffff
  2640. sh v1, 0(a0)
  2641. add s1, t6, t5
  2642. andi s1, 0xffff
  2643. mul s2, s1, t7
  2644. addiu s1, t8, 16
  2645. addiu a2, a2, 4 /* a2 is not read again before the return */
  2646. addiu a1, a1, 4 /* a1 is not read again before the return */
  2647. srav s2, s2, s1
  2648. mul s2, s2, s0
  2649. sh s2, 2(a0)
  2650. RESTORE_REGS_FROM_STACK 16, s0, s1, s2
  2651. j ra
  2652. nop
  2653. END(jsimd_quantize_dspr2)
  2654. #ifndef __mips_soft_float
  2655. /*****************************************************************************/
  2656. LEAF_DSPR2(jsimd_quantize_float_dspr2)
  2657. /*
  2658. * a0 = coef_block
  2659. * a1 = divisors
  2660. * a2 = workspace
  *
  * Converts 64 single-precision products workspace[i] * divisors[i] to
  * halfwords in coef_block, eight per loop iteration (two groups of four).
  * Each product has 16384.5 added by madd.s, is truncated to an integer by
  * trunc.w.s, and then has 16384 subtracted, which yields round-to-nearest
  * for products above -16384.5.
  2661. */
  2662. .set at
  2663. li t1, 0x46800100 /* integer representation 16384.5 */
  2664. mtc1 t1, f0 /* f0 = 16384.5 (rounding bias) */
  2665. li t0, 63 /* loop counter: 63, 55, ..., 7 gives 8 iterations */
  2666. 0:
  /* Group A: elements 0..3 of this iteration. */
  2667. lwc1 f2, 0(a2)
  2668. lwc1 f10, 0(a1)
  2669. lwc1 f4, 4(a2)
  2670. lwc1 f12, 4(a1)
  2671. lwc1 f6, 8(a2)
  2672. lwc1 f14, 8(a1)
  2673. lwc1 f8, 12(a2)
  2674. lwc1 f16, 12(a1)
  2675. madd.s f2, f0, f2, f10 /* f2 = workspace[0] * divisors[0] + 16384.5 */
  2676. madd.s f4, f0, f4, f12
  2677. madd.s f6, f0, f6, f14
  2678. madd.s f8, f0, f8, f16
  2679. lwc1 f10, 16(a1) /* start loading group B divisors (elements 4..7) */
  2680. lwc1 f12, 20(a1)
  2681. trunc.w.s f2, f2 /* float to 32-bit integer, truncating */
  2682. trunc.w.s f4, f4
  2683. trunc.w.s f6, f6
  2684. trunc.w.s f8, f8
  2685. lwc1 f14, 24(a1)
  2686. lwc1 f16, 28(a1)
  2687. mfc1 t1, f2 /* move group A integers to GPRs */
  2688. mfc1 t2, f4
  2689. mfc1 t3, f6
  2690. mfc1 t4, f8
  /* Group B: elements 4..7, overlapped with the group A stores. */
  2691. lwc1 f2, 16(a2)
  2692. lwc1 f4, 20(a2)
  2693. lwc1 f6, 24(a2)
  2694. lwc1 f8, 28(a2)
  2695. madd.s f2, f0, f2, f10
  2696. madd.s f4, f0, f4, f12
  2697. madd.s f6, f0, f6, f14
  2698. madd.s f8, f0, f8, f16
  2699. addiu t1, t1, -16384 /* remove the integer part of the bias */
  2700. addiu t2, t2, -16384
  2701. addiu t3, t3, -16384
  2702. addiu t4, t4, -16384
  2703. trunc.w.s f2, f2
  2704. trunc.w.s f4, f4
  2705. trunc.w.s f6, f6
  2706. trunc.w.s f8, f8
  2707. sh t1, 0(a0) /* coef_block[0..3] of this iteration */
  2708. sh t2, 2(a0)
  2709. sh t3, 4(a0)
  2710. sh t4, 6(a0)
  2711. mfc1 t1, f2
  2712. mfc1 t2, f4
  2713. mfc1 t3, f6
  2714. mfc1 t4, f8
  2715. addiu t0, t0, -8 /* eight elements consumed */
  2716. addiu a2, a2, 32 /* advance workspace by 8 floats */
  2717. addiu a1, a1, 32 /* advance divisors by 8 floats */
  2718. addiu t1, t1, -16384
  2719. addiu t2, t2, -16384
  2720. addiu t3, t3, -16384
  2721. addiu t4, t4, -16384
  2722. sh t1, 8(a0) /* coef_block[4..7] of this iteration */
  2723. sh t2, 10(a0)
  2724. sh t3, 12(a0)
  2725. sh t4, 14(a0)
  2726. bgez t0, 0b /* loop while the counter has not gone negative */
  2727. addiu a0, a0, 16 /* advance coef_block by 8 halfwords; presumably a branch delay slot (noreorder directive not visible here) */
  2728. j ra
  2729. nop
  2730. END(jsimd_quantize_float_dspr2)
  2731. #endif
  2732. /*****************************************************************************/
  2733. LEAF_DSPR2(jsimd_idct_2x2_dspr2)
  2734. /*
  2735. * a0 = compptr->dct_table
  2736. * a1 = coef_block
  2737. * a2 = output_buf
  2738. * a3 = output_col
  *
  * Pass 1 reads columns 0, 1, 3, 5 and 7 (rows 0, 1, 3, 5 and 7 of each),
  * multiplies every coefficient by its dct_table entry, and stores two
  * 32-bit results per column in a 10-word workspace on the stack:
  * words 0..4 hold (dc + odd) and words 5..9 hold (dc - odd), where
  *   dc  = row0 << 15
  *   odd = row1 * 29692 + row3 * -10426 + row5 * 6967 + row7 * -5906
  * Pass 2 applies the same formula across each group of five words and
  * writes two range-limited bytes to output_buf[0] + output_col and two to
  * output_buf[1] + output_col.
  2739. */
  2740. .set at
  2741. SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
  2742. addiu sp, sp, -40 /* reserve the 10-word workspace */
  2743. move v0, sp /* v0 = workspace base */
  /* Scalar copies of the four multipliers, used by mult/madd in pass 2. */
  2744. addiu s2, zero, 29692
  2745. addiu s3, zero, -10426
  2746. addiu s4, zero, 6967
  2747. addiu s5, zero, -5906
  /* Pass 1, column 0. */
  2748. lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */
  2749. lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */
  2750. lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */
  2751. lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */
  2752. mul t4, t5, t0 /* t4 = dequantized row 0 */
  2753. lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */
  2754. lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */
  2755. mul t6, t6, t1 /* t6 = dequantized row 3 */
  2756. mul t5, t5, t0 /* t5 = dequantized row 1 */
  2757. lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */
  2758. lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */
  2759. lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */
  2760. lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */
  2761. mul t7, t7, t2 /* t7 = dequantized row 5 */
  2762. mult zero, zero /* ac0 = 0 */
  /* NOTE(review): the MIPS32 ISA describes HI/LO as unpredictable after
     mul, and the mul below sits between the ac0 clear and the first
     dpa.w.ph; confirm this is safe on the targeted cores. */
  2763. mul t8, t8, t3 /* t8 = dequantized row 7 */
  2764. li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */
  2765. li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */
  2766. ins t6, t5, 16, 16 /* t6 = t5|t6 */
  2767. sll t4, t4, 15 /* dc = row0 << 15 */
  2768. dpa.w.ph $ac0, t6, s0 /* ac0 += row1 * 29692 + row3 * -10426 */
  /* Column 1 loads begin here, interleaved with the end of column 0. */
  2769. lh t1, 2(a1)
  2770. lh t6, 2(a0)
  2771. ins t8, t7, 16, 16 /* t8 = t7|t8 */
  2772. dpa.w.ph $ac0, t8, s1 /* ac0 += row5 * 6967 + row7 * -5906 */
  2773. mflo t0, $ac0 /* t0 = odd part of column 0 */
  2774. mul t5, t6, t1 /* column 1: dequantized row 0 */
  2775. lh t1, 18(a1)
  2776. lh t6, 18(a0)
  2777. lh t2, 50(a1)
  2778. lh t7, 50(a0)
  2779. mul t6, t6, t1 /* column 1: dequantized row 1 */
  2780. subu t8, t4, t0 /* column 0: dc - odd */
  2781. mul t7, t7, t2 /* column 1: dequantized row 3 */
  2782. addu t0, t4, t0 /* column 0: dc + odd */
  2783. shra_r.w t0, t0, 13 /* rounding arithmetic shift right by 13 */
  2784. lh t1, 82(a1)
  2785. lh t2, 82(a0)
  2786. lh t3, 114(a1)
  2787. lh t4, 114(a0)
  2788. shra_r.w t8, t8, 13
  2789. mul t1, t1, t2 /* column 1: dequantized row 5 */
  2790. mul t3, t3, t4 /* column 1: dequantized row 7 */
  2791. sw t0, 0(v0) /* workspace word 0 */
  2792. sw t8, 20(v0) /* workspace word 5 */
  2793. sll t4, t5, 15 /* column 1: dc */
  2794. ins t7, t6, 16, 16 /* t7 = row1|row3 */
  2795. mult zero, zero /* ac0 = 0 */
  2796. dpa.w.ph $ac0, t7, s0
  2797. ins t3, t1, 16, 16 /* t3 = row5|row7 */
  /* Column 3 loads begin here (byte offset 6). */
  2798. lh t1, 6(a1)
  2799. lh t6, 6(a0)
  2800. dpa.w.ph $ac0, t3, s1
  2801. mflo t0, $ac0 /* t0 = odd part of column 1 */
  2802. mul t5, t6, t1 /* column 3: dequantized row 0 */
  2803. lh t1, 22(a1)
  2804. lh t6, 22(a0)
  2805. lh t2, 54(a1)
  2806. lh t7, 54(a0)
  2807. mul t6, t6, t1
  2808. subu t8, t4, t0 /* column 1: dc - odd */
  2809. mul t7, t7, t2
  2810. addu t0, t4, t0 /* column 1: dc + odd */
  2811. shra_r.w t0, t0, 13
  2812. lh t1, 86(a1)
  2813. lh t2, 86(a0)
  2814. lh t3, 118(a1)
  2815. lh t4, 118(a0)
  2816. shra_r.w t8, t8, 13
  2817. mul t1, t1, t2
  2818. mul t3, t3, t4
  2819. sw t0, 4(v0) /* workspace word 1 */
  2820. sw t8, 24(v0) /* workspace word 6 */
  2821. sll t4, t5, 15 /* column 3: dc */
  2822. ins t7, t6, 16, 16
  2823. mult zero, zero /* ac0 = 0 */
  2824. dpa.w.ph $ac0, t7, s0
  2825. ins t3, t1, 16, 16
  /* Column 5 loads begin here (byte offset 10). */
  2826. lh t1, 10(a1)
  2827. lh t6, 10(a0)
  2828. dpa.w.ph $ac0, t3, s1
  2829. mflo t0, $ac0 /* t0 = odd part of column 3 */
  2830. mul t5, t6, t1 /* column 5: dequantized row 0 */
  2831. lh t1, 26(a1)
  2832. lh t6, 26(a0)
  2833. lh t2, 58(a1)
  2834. lh t7, 58(a0)
  2835. mul t6, t6, t1
  2836. subu t8, t4, t0 /* column 3: dc - odd */
  2837. mul t7, t7, t2
  2838. addu t0, t4, t0 /* column 3: dc + odd */
  2839. shra_r.w t0, t0, 13
  2840. lh t1, 90(a1)
  2841. lh t2, 90(a0)
  2842. lh t3, 122(a1)
  2843. lh t4, 122(a0)
  2844. shra_r.w t8, t8, 13
  2845. mul t1, t1, t2
  2846. mul t3, t3, t4
  2847. sw t0, 8(v0) /* workspace word 2 */
  2848. sw t8, 28(v0) /* workspace word 7 */
  2849. sll t4, t5, 15 /* column 5: dc */
  2850. ins t7, t6, 16, 16
  2851. mult zero, zero /* ac0 = 0 */
  2852. dpa.w.ph $ac0, t7, s0
  2853. ins t3, t1, 16, 16
  /* Column 7 loads begin here (byte offset 14). */
  2854. lh t1, 14(a1)
  2855. lh t6, 14(a0)
  2856. dpa.w.ph $ac0, t3, s1
  2857. mflo t0, $ac0 /* t0 = odd part of column 5 */
  2858. mul t5, t6, t1 /* column 7: dequantized row 0 */
  2859. lh t1, 30(a1)
  2860. lh t6, 30(a0)
  2861. lh t2, 62(a1)
  2862. lh t7, 62(a0)
  2863. mul t6, t6, t1
  2864. subu t8, t4, t0 /* column 5: dc - odd */
  2865. mul t7, t7, t2
  2866. addu t0, t4, t0 /* column 5: dc + odd */
  2867. shra_r.w t0, t0, 13
  2868. lh t1, 94(a1)
  2869. lh t2, 94(a0)
  2870. lh t3, 126(a1)
  2871. lh t4, 126(a0)
  2872. shra_r.w t8, t8, 13
  2873. mul t1, t1, t2
  2874. mul t3, t3, t4
  2875. sw t0, 12(v0) /* workspace word 3 */
  2876. sw t8, 32(v0) /* workspace word 8 */
  2877. sll t4, t5, 15 /* column 7: dc */
  2878. ins t7, t6, 16, 16
  2879. mult zero, zero /* ac0 = 0 */
  2880. dpa.w.ph $ac0, t7, s0
  2881. ins t3, t1, 16, 16
  2882. dpa.w.ph $ac0, t3, s1
  2883. mflo t0, $ac0 /* t0 = odd part of column 7 */
  /* Pass 2 loads begin here, interleaved with the end of column 7. */
  2884. lw t9, 0(a2) /* output_buf[0] */
  2885. lw t3, 0(v0) /* workspace word 0 */
  2886. lw t7, 4(v0) /* workspace word 1 */
  2887. lw t1, 8(v0) /* workspace word 2 */
  2888. addu t9, t9, a3 /* outptr = output_buf[0] + output_col */
  2889. sll t3, t3, 15 /* row 0: dc = word 0 << 15 */
  2890. subu t8, t4, t0 /* column 7: dc - odd */
  2891. addu t0, t4, t0 /* column 7: dc + odd */
  2892. shra_r.w t0, t0, 13
  2893. shra_r.w t8, t8, 13
  2894. sw t0, 16(v0) /* workspace word 4 */
  2895. sw t8, 36(v0) /* workspace word 9 */
  2896. lw t5, 12(v0) /* workspace word 3 */
  2897. lw t6, 16(v0) /* workspace word 4 */
  /* Row 0 odd part accumulated in the default accumulator. */
  2898. mult t7, s2 /* word 1 * 29692 */
  2899. madd t1, s3 /* + word 2 * -10426 */
  2900. madd t5, s4 /* + word 3 * 6967 */
  2901. madd t6, s5 /* + word 4 * -5906 */
  2902. lw t5, 24(v0) /* workspace word 6 */
  2903. lw t7, 28(v0) /* workspace word 7 */
  2904. mflo t0, $ac0 /* t0 = odd part of output row 0 */
  2905. lw t8, 32(v0) /* workspace word 8 */
  2906. lw t2, 36(v0) /* workspace word 9 */
  /* Row 1 odd part accumulated in ac1. */
  2907. mult $ac1, t5, s2
  2908. madd $ac1, t7, s3
  2909. madd $ac1, t8, s4
  2910. madd $ac1, t2, s5
  2911. addu t1, t3, t0 /* row 0: dc + odd */
  2912. subu t6, t3, t0 /* row 0: dc - odd */
  2913. shra_r.w t1, t1, 20 /* rounding arithmetic shift right by 20 */
  2914. shra_r.w t6, t6, 20
  2915. mflo t4, $ac1 /* t4 = odd part of output row 1 */
  /* A saturating shift left by 24 followed by an arithmetic shift right by
     24 clamps the value to -128..127; adding 128 maps it to 0..255. */
  2916. shll_s.w t1, t1, 24
  2917. shll_s.w t6, t6, 24
  2918. sra t1, t1, 24
  2919. sra t6, t6, 24
  2920. addiu t1, t1, 128
  2921. addiu t6, t6, 128
  2922. lw t0, 20(v0) /* workspace word 5 */
  2923. sb t1, 0(t9) /* output row 0, sample 0 */
  2924. sb t6, 1(t9) /* output row 0, sample 1 */
  2925. sll t0, t0, 15 /* row 1: dc = word 5 << 15 */
  2926. lw t9, 4(a2) /* output_buf[1] */
  2927. addu t1, t0, t4 /* row 1: dc + odd */
  2928. subu t6, t0, t4 /* row 1: dc - odd */
  2929. addu t9, t9, a3 /* outptr = output_buf[1] + output_col */
  2930. shra_r.w t1, t1, 20
  2931. shra_r.w t6, t6, 20
  2932. shll_s.w t1, t1, 24
  2933. shll_s.w t6, t6, 24
  2934. sra t1, t1, 24
  2935. sra t6, t6, 24
  2936. addiu t1, t1, 128
  2937. addiu t6, t6, 128
  2938. sb t1, 0(t9) /* output row 1, sample 0 */
  2939. sb t6, 1(t9) /* output row 1, sample 1 */
  2940. addiu sp, sp, 40 /* release the workspace */
  2941. RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
  2942. j ra
  2943. nop
  2944. END(jsimd_idct_2x2_dspr2)
  2945. /*****************************************************************************/
  2946. LEAF_DSPR2(jsimd_idct_4x4_dspr2)
  2947. /*
  2948. * a0 = compptr->dct_table
  2949. * a1 = coef_block
  2950. * a2 = output_buf
  2951. * a3 = output_col
  2952. * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes)
  *
  * Pass 1 (labels 0 and 1) processes columns 0..3 and 5..7 of coef_block;
  * column 4 and row 4 are never read. Each column yields four 32-bit values
  * that are stored in workspace rows 0..3 (row stride 32 bytes).
  * Pass 2 is unrolled four times, one copy per workspace row, and writes
  * four range-limited bytes to output_buf[row] + output_col.
  *
  * Packed signed halfword constants (high | low) consumed by dpa.w.ph:
  *   s0 = 11893 | -1730     (applied to row5 | row7)
  *   s1 =  8697 | -17799    (applied to row1 | row3)
  *   s2 = -4926 | -4176     (applied to row5 | row7)
  *   s3 = 20995 |  7373     (applied to row1 | row3)
  2953. */
  2954. .set at
  2955. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  /* 48 = 16 + 32, consistent with the macro above having lowered sp by 32
     (macro body is not visible in this part of the file). */
  2956. lw v1, 48(sp) /* v1 = workspace */
  2957. move t0, a1 /* t0 = inptr */
  2958. move t1, v1 /* t1 = wsptr */
  2959. li t9, 4 /* first loop handles columns 0..3 */
  2960. li s0, 0x2e75f93e
  2961. li s1, 0x21f9ba79
  2962. li s2, 0xecc2efb0
  2963. li s3, 0x52031ccd
  2964. 0:
  2965. lh s6, 32(t0) /* inptr[DCTSIZE*2] */
  2966. lh t6, 32(a0) /* quantptr[DCTSIZE*2] */
  2967. lh s7, 96(t0) /* inptr[DCTSIZE*6] */
  2968. lh t7, 96(a0) /* quantptr[DCTSIZE*6] */
  2969. mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
  2970. quantptr[DCTSIZE*2]) */
  2971. lh s4, 0(t0) /* inptr[DCTSIZE*0] */
  2972. mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
  2973. quantptr[DCTSIZE*6]) */
  2974. lh s5, 0(a0) /* quantptr[0] */
  2975. li s6, 15137
  2976. li s7, 6270
  2977. mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
  2978. mul t6, s6, t6 /* t6 = z2 * 15137
  2979. (s6 now holds the constant) */
  2980. lh t5, 112(t0) /* inptr[DCTSIZE*7] */
  2981. mul t7, s7, t7 /* t7 = z3 * 6270
  2982. (s7 now holds the constant) */
  2983. lh s4, 112(a0) /* quantptr[DCTSIZE*7] */
  2984. lh v0, 80(t0) /* inptr[DCTSIZE*5] */
  2985. lh s5, 80(a0) /* quantptr[DCTSIZE*5] */
  2986. lh s6, 48(a0) /* quantptr[DCTSIZE*3] */
  2987. sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
  2988. lh s7, 16(a0) /* quantptr[DCTSIZE*1] */
  2989. lh t8, 16(t0) /* inptr[DCTSIZE*1] */
  2990. subu t6, t6, t7 /* tmp2 =
  2991. MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
  2992. lh t7, 48(t0) /* inptr[DCTSIZE*3] */
  2993. mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
  2994. quantptr[DCTSIZE*7]) */
  2995. mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] *
  2996. quantptr[DCTSIZE*5]) */
  2997. mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
  2998. quantptr[DCTSIZE*3]) */
  2999. mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
  3000. quantptr[DCTSIZE*1]) */
  3001. addu t3, t2, t6 /* tmp10 = tmp0 + tmp2 */
  3002. subu t4, t2, t6 /* tmp12 = tmp0 - tmp2 */
  3003. mult $ac0, zero, zero /* ac0 = 0 */
  3004. mult $ac1, zero, zero /* ac1 = 0 */
  3005. ins t5, v0, 16, 16 /* t5 = row5|row7 */
  3006. ins t7, t8, 16, 16 /* t7 = row1|row3 */
  3007. addiu t9, t9, -1
  3008. dpa.w.ph $ac0, t5, s0
  3009. dpa.w.ph $ac0, t7, s1
  3010. dpa.w.ph $ac1, t5, s2
  3011. dpa.w.ph $ac1, t7, s3
  3012. mflo s4, $ac0 /* s4 = temp1 */
  3013. mflo s5, $ac1 /* s5 = temp2 */
  3014. addiu a0, a0, 2 /* next quantptr column */
  3015. addiu t1, t1, 4 /* next wsptr column; stores below compensate with offsets reduced by 4 */
  3016. addiu t0, t0, 2 /* next inptr column */
  3017. addu t6, t4, s4
  3018. subu t5, t4, s4
  3019. addu s6, t3, s5
  3020. subu s7, t3, s5
  3021. shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */
  3022. shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */
  3023. shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
  3024. shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
  3025. sw t6, 28(t1) /* workspace row 1 */
  3026. sw t5, 60(t1) /* workspace row 2 */
  3027. sw s6, -4(t1) /* workspace row 0 */
  3028. bgtz t9, 0b
  3029. sw s7, 92(t1) /* workspace row 3; presumably a branch delay slot (noreorder directive not visible here) */
  3030. /* second loop three pass */
  /* Column 4 is skipped: the pointers keep their post-loop values and every
     offset below is 2 bytes (inptr, quantptr) further on; wsptr is advanced
     by 4 before the stores, so the results land in columns 5, 6 and 7. */
  3031. li t9, 3
  3032. 1:
  3033. lh s6, 34(t0) /* inptr[DCTSIZE*2] */
  3034. lh t6, 34(a0) /* quantptr[DCTSIZE*2] */
  3035. lh s7, 98(t0) /* inptr[DCTSIZE*6] */
  3036. lh t7, 98(a0) /* quantptr[DCTSIZE*6] */
  3037. mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
  3038. quantptr[DCTSIZE*2]) */
  3039. lh s4, 2(t0) /* inptr[DCTSIZE*0] */
  3040. mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
  3041. quantptr[DCTSIZE*6]) */
  3042. lh s5, 2(a0) /* quantptr[DCTSIZE*0] */
  3043. li s6, 15137
  3044. li s7, 6270
  3045. mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
  3046. mul v0, s6, t6 /* v0 = z2 * 15137
  3047. (s6 now holds the constant) */
  3048. lh t5, 114(t0) /* inptr[DCTSIZE*7] */
  3049. mul t7, s7, t7 /* t7 = z3 * 6270
  3050. (s7 now holds the constant) */
  3051. lh s4, 114(a0) /* quantptr[DCTSIZE*7] */
  3052. lh s5, 82(a0) /* quantptr[DCTSIZE*5] */
  3053. lh t6, 82(t0) /* inptr[DCTSIZE*5] */
  3054. sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
  3055. lh s6, 50(a0) /* quantptr[DCTSIZE*3] */
  3056. lh t8, 18(t0) /* inptr[DCTSIZE*1] */
  3057. subu v0, v0, t7 /* tmp2 =
  3058. MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
  3059. lh t7, 50(t0) /* inptr[DCTSIZE*3] */
  3060. lh s7, 18(a0) /* quantptr[DCTSIZE*1] */
  3061. mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
  3062. quantptr[DCTSIZE*7]) */
  3063. mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] *
  3064. quantptr[DCTSIZE*5]) */
  3065. mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
  3066. quantptr[DCTSIZE*3]) */
  3067. mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
  3068. quantptr[DCTSIZE*1]) */
  3069. addu t3, t2, v0 /* tmp10 = tmp0 + tmp2 */
  3070. subu t4, t2, v0 /* tmp12 = tmp0 - tmp2 */
  3071. mult $ac0, zero, zero /* ac0 = 0 */
  3072. mult $ac1, zero, zero /* ac1 = 0 */
  3073. ins t5, t6, 16, 16 /* t5 = row5|row7 */
  3074. ins t7, t8, 16, 16 /* t7 = row1|row3 */
  3075. dpa.w.ph $ac0, t5, s0
  3076. dpa.w.ph $ac0, t7, s1
  3077. dpa.w.ph $ac1, t5, s2
  3078. dpa.w.ph $ac1, t7, s3
  3079. mflo t5, $ac0 /* t5 = temp1 */
  3080. mflo t6, $ac1 /* t6 = temp2 */
  3081. addiu t9, t9, -1
  3082. addiu t0, t0, 2
  3083. addiu a0, a0, 2
  3084. addiu t1, t1, 4
  3085. addu s5, t4, t5
  3086. subu s4, t4, t5
  3087. addu s6, t3, t6
  3088. subu s7, t3, t6
  3089. shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */
  3090. shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */
  3091. shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
  3092. shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
  3093. sw s5, 32(t1) /* workspace row 1 */
  3094. sw s4, 64(t1) /* workspace row 2 */
  3095. sw s6, 0(t1) /* workspace row 0 */
  3096. bgtz t9, 1b
  3097. sw s7, 96(t1) /* workspace row 3; presumably a branch delay slot (noreorder directive not visible here) */
  /* Pass 2, workspace row 0 (byte offsets 0..28). */
  /* NOTE(review): the lh loads below read 16 bits from workspace slots that
     pass 1 wrote with sw; at these addresses that is the low half of the
     word only on a little-endian target. Confirm the endianness assumption. */
  3098. move t1, v1 /* t1 = wsptr = workspace */
  3099. li s4, 15137
  3100. lw s6, 8(t1) /* wsptr[2] */
  3101. li s5, 6270
  3102. lw s7, 24(t1) /* wsptr[6] */
  3103. mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
  3104. FIX_1_847759065) */
  3105. lw t2, 0(t1) /* wsptr[0] */
  3106. mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
  3107. -FIX_0_765366865) */
  3108. lh t5, 28(t1) /* wsptr[7] */
  3109. lh t6, 20(t1) /* wsptr[5] */
  3110. lh t7, 12(t1) /* wsptr[3] */
  3111. lh t8, 4(t1) /* wsptr[1] */
  3112. ins t5, t6, 16, 16 /* t5 = wsptr[5]|wsptr[7] */
  3113. ins t7, t8, 16, 16 /* t7 = wsptr[1]|wsptr[3] */
  3114. mult $ac0, zero, zero
  3115. dpa.w.ph $ac0, t5, s0
  3116. dpa.w.ph $ac0, t7, s1
  3117. mult $ac1, zero, zero
  3118. dpa.w.ph $ac1, t5, s2
  3119. dpa.w.ph $ac1, t7, s3
  3120. sll t2, t2, 14 /* tmp0 =
  3121. ((JLONG)wsptr[0]) << (CONST_BITS+1) */
  3122. mflo s6, $ac0 /* s6 = temp1 */
  3123. /* MULTIPLY(wsptr[2], FIX_1_847759065) +
  3124. MULTIPLY(wsptr[6], -FIX_0_765366865) */
  3125. subu s4, s4, s5
  3126. addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
  3127. mflo s7, $ac1 /* s7 = temp2 */
  3128. subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
  3129. addu t7, t4, s6
  3130. subu t8, t4, s6
  3131. addu t5, t3, s7
  3132. subu t6, t3, s7
  3133. shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
  3134. shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
  3135. shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
  3136. shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
  3137. sll s4, t9, 2 /* NOTE(review): result is never read; s4 is reloaded by the next li s4 */
  3138. lw v0, 0(a2) /* output_buf[ctr] */
  /* A saturating shift left by 24 followed by an arithmetic shift right by
     24 clamps each value to -128..127; adding 128 maps it to 0..255. */
  3139. shll_s.w t5, t5, 24
  3140. shll_s.w t6, t6, 24
  3141. shll_s.w t7, t7, 24
  3142. shll_s.w t8, t8, 24
  3143. sra t5, t5, 24
  3144. sra t6, t6, 24
  3145. sra t7, t7, 24
  3146. sra t8, t8, 24
  3147. addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
  3148. addiu t5, t5, 128
  3149. addiu t6, t6, 128
  3150. addiu t7, t7, 128
  3151. addiu t8, t8, 128
  3152. sb t5, 0(v0) /* tmp10 + temp2 */
  3153. sb t7, 1(v0) /* tmp12 + temp1 */
  3154. sb t8, 2(v0) /* tmp12 - temp1 */
  3155. sb t6, 3(v0) /* tmp10 - temp2 */
  3156. /* 2 */
  /* Pass 2, workspace row 1 (byte offsets 32..60); same sequence as row 0. */
  3157. li s4, 15137
  3158. lw s6, 40(t1) /* wsptr[2] */
  3159. li s5, 6270
  3160. lw s7, 56(t1) /* wsptr[6] */
  3161. mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
  3162. FIX_1_847759065) */
  3163. lw t2, 32(t1) /* wsptr[0] */
  3164. mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
  3165. -FIX_0_765366865) */
  3166. lh t5, 60(t1) /* wsptr[7] */
  3167. lh t6, 52(t1) /* wsptr[5] */
  3168. lh t7, 44(t1) /* wsptr[3] */
  3169. lh t8, 36(t1) /* wsptr[1] */
  3170. ins t5, t6, 16, 16
  3171. ins t7, t8, 16, 16
  3172. mult $ac0, zero, zero
  3173. dpa.w.ph $ac0, t5, s0
  3174. dpa.w.ph $ac0, t7, s1
  3175. mult $ac1, zero, zero
  3176. dpa.w.ph $ac1, t5, s2
  3177. dpa.w.ph $ac1, t7, s3
  3178. sll t2, t2, 14 /* tmp0 =
  3179. ((JLONG)wsptr[0]) << (CONST_BITS+1) */
  3180. mflo s6, $ac0
  3181. /* MULTIPLY(wsptr[2], FIX_1_847759065) +
  3182. MULTIPLY(wsptr[6], -FIX_0_765366865) */
  3183. subu s4, s4, s5
  3184. addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
  3185. mflo s7, $ac1
  3186. subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
  3187. addu t7, t4, s6
  3188. subu t8, t4, s6
  3189. addu t5, t3, s7
  3190. subu t6, t3, s7
  3191. shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2,
  3192. CONST_BITS-PASS1_BITS+1) */
  3193. shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2,
  3194. CONST_BITS-PASS1_BITS+1) */
  3195. shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1,
  3196. CONST_BITS-PASS1_BITS+1) */
  3197. shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1,
  3198. CONST_BITS-PASS1_BITS+1) */
  3199. sll s4, t9, 2 /* NOTE(review): result is never read; s4 is reloaded by the next li s4 */
  3200. lw v0, 4(a2) /* output_buf[ctr] */
  3201. shll_s.w t5, t5, 24
  3202. shll_s.w t6, t6, 24
  3203. shll_s.w t7, t7, 24
  3204. shll_s.w t8, t8, 24
  3205. sra t5, t5, 24
  3206. sra t6, t6, 24
  3207. sra t7, t7, 24
  3208. sra t8, t8, 24
  3209. addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
  3210. addiu t5, t5, 128
  3211. addiu t6, t6, 128
  3212. addiu t7, t7, 128
  3213. addiu t8, t8, 128
  3214. sb t5, 0(v0)
  3215. sb t7, 1(v0)
  3216. sb t8, 2(v0)
  3217. sb t6, 3(v0)
  3218. /* 3 */
  /* Pass 2, workspace row 2 (byte offsets 64..92); same sequence as row 0. */
  3219. li s4, 15137
  3220. lw s6, 72(t1) /* wsptr[2] */
  3221. li s5, 6270
  3222. lw s7, 88(t1) /* wsptr[6] */
  3223. mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
  3224. FIX_1_847759065) */
  3225. lw t2, 64(t1) /* wsptr[0] */
  3226. mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
  3227. -FIX_0_765366865) */
  3228. lh t5, 92(t1) /* wsptr[7] */
  3229. lh t6, 84(t1) /* wsptr[5] */
  3230. lh t7, 76(t1) /* wsptr[3] */
  3231. lh t8, 68(t1) /* wsptr[1] */
  3232. ins t5, t6, 16, 16
  3233. ins t7, t8, 16, 16
  3234. mult $ac0, zero, zero
  3235. dpa.w.ph $ac0, t5, s0
  3236. dpa.w.ph $ac0, t7, s1
  3237. mult $ac1, zero, zero
  3238. dpa.w.ph $ac1, t5, s2
  3239. dpa.w.ph $ac1, t7, s3
  3240. sll t2, t2, 14 /* tmp0 =
  3241. ((JLONG)wsptr[0]) << (CONST_BITS+1) */
  3242. mflo s6, $ac0
  3243. /* MULTIPLY(wsptr[2], FIX_1_847759065) +
  3244. MULTIPLY(wsptr[6], -FIX_0_765366865) */
  3245. subu s4, s4, s5
  3246. addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
  3247. mflo s7, $ac1
  3248. subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
  3249. addu t7, t4, s6
  3250. subu t8, t4, s6
  3251. addu t5, t3, s7
  3252. subu t6, t3, s7
  3253. shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
  3254. shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
  3255. shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
  3256. shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
  3257. sll s4, t9, 2 /* NOTE(review): result is never read; s4 is reloaded by the next li s4 */
  3258. lw v0, 8(a2) /* output_buf[ctr] */
  3259. shll_s.w t5, t5, 24
  3260. shll_s.w t6, t6, 24
  3261. shll_s.w t7, t7, 24
  3262. shll_s.w t8, t8, 24
  3263. sra t5, t5, 24
  3264. sra t6, t6, 24
  3265. sra t7, t7, 24
  3266. sra t8, t8, 24
  3267. addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
  3268. addiu t5, t5, 128
  3269. addiu t6, t6, 128
  3270. addiu t7, t7, 128
  3271. addiu t8, t8, 128
  3272. sb t5, 0(v0)
  3273. sb t7, 1(v0)
  3274. sb t8, 2(v0)
  3275. sb t6, 3(v0)
  /* Pass 2, workspace row 3 (byte offsets 96..124); same sequence as row 0. */
  3276. li s4, 15137
  3277. lw s6, 104(t1) /* wsptr[2] */
  3278. li s5, 6270
  3279. lw s7, 120(t1) /* wsptr[6] */
  3280. mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
  3281. FIX_1_847759065) */
  3282. lw t2, 96(t1) /* wsptr[0] */
  3283. mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
  3284. -FIX_0_765366865) */
  3285. lh t5, 124(t1) /* wsptr[7] */
  3286. lh t6, 116(t1) /* wsptr[5] */
  3287. lh t7, 108(t1) /* wsptr[3] */
  3288. lh t8, 100(t1) /* wsptr[1] */
  3289. ins t5, t6, 16, 16
  3290. ins t7, t8, 16, 16
  3291. mult $ac0, zero, zero
  3292. dpa.w.ph $ac0, t5, s0
  3293. dpa.w.ph $ac0, t7, s1
  3294. mult $ac1, zero, zero
  3295. dpa.w.ph $ac1, t5, s2
  3296. dpa.w.ph $ac1, t7, s3
  3297. sll t2, t2, 14 /* tmp0 =
  3298. ((JLONG)wsptr[0]) << (CONST_BITS+1) */
  3299. mflo s6, $ac0
  3300. /* MULTIPLY(wsptr[2], FIX_1_847759065) +
  3301. MULTIPLY(wsptr[6], -FIX_0_765366865) */
  3302. subu s4, s4, s5
  3303. addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
  3304. mflo s7, $ac1
  3305. subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
  3306. addu t7, t4, s6
  3307. subu t8, t4, s6
  3308. addu t5, t3, s7
  3309. subu t6, t3, s7
  3310. shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
  3311. shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
  3312. shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
  3313. shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
  3314. sll s4, t9, 2 /* NOTE(review): result is not read by any later instruction in this function */
  3315. lw v0, 12(a2) /* output_buf[ctr] */
  3316. shll_s.w t5, t5, 24
  3317. shll_s.w t6, t6, 24
  3318. shll_s.w t7, t7, 24
  3319. shll_s.w t8, t8, 24
  3320. sra t5, t5, 24
  3321. sra t6, t6, 24
  3322. sra t7, t7, 24
  3323. sra t8, t8, 24
  3324. addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
  3325. addiu t5, t5, 128
  3326. addiu t6, t6, 128
  3327. addiu t7, t7, 128
  3328. addiu t8, t8, 128
  3329. sb t5, 0(v0)
  3330. sb t7, 1(v0)
  3331. sb t8, 2(v0)
  3332. sb t6, 3(v0)
  3333. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  3334. j ra
  3335. nop
  3336. END(jsimd_idct_4x4_dspr2)
  3337. /*****************************************************************************/
  3338. LEAF_DSPR2(jsimd_idct_6x6_dspr2)
  3339. /*
  3340. * a0 = compptr->dct_table
  3341. * a1 = coef_block
  3342. * a2 = output_buf
  3343. * a3 = output_col
  *
  * Pass 1 (label 1) processes the first six columns of coef_block, reading
  * rows 0..5 of each, and stores six 32-bit results per column in a 6x6
  * word workspace on the stack (row stride 24 bytes).
  * Pass 2 (label 2) processes the six workspace rows and writes six
  * range-limited bytes per row to output_buf[row] + output_col.
  * Multipliers kept in registers for both passes:
  *   t9 = 5793, s0 = 10033, s1 = 2998
  3344. */
  3345. .set at
  3346. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  3347. addiu sp, sp, -144 /* reserve the 36-word workspace */
  3348. move v0, sp /* v0 = wsptr */
  3349. addiu v1, v0, 24 /* pass 1 ends after 6 columns of 4 bytes */
  3350. addiu t9, zero, 5793
  3351. addiu s0, zero, 10033
  3352. addiu s1, zero, 2998
  3353. 1:
  3354. lh s2, 0(a0) /* q0 = quantptr[ 0] */
  3355. lh s3, 32(a0) /* q1 = quantptr[16] */
  3356. lh s4, 64(a0) /* q2 = quantptr[32] */
  3357. lh t2, 64(a1) /* tmp2 = inptr[32] */
  3358. lh t1, 32(a1) /* tmp1 = inptr[16] */
  3359. lh t0, 0(a1) /* tmp0 = inptr[ 0] */
  3360. mul t2, t2, s4 /* tmp2 = tmp2 * q2 */
  3361. mul t1, t1, s3 /* tmp1 = tmp1 * q1 */
  3362. mul t0, t0, s2 /* tmp0 = tmp0 * q0 */
  3363. lh t6, 16(a1) /* z1 = inptr[ 8] */
  3364. lh t8, 80(a1) /* z3 = inptr[40] */
  3365. lh t7, 48(a1) /* z2 = inptr[24] */
  3366. lh s2, 16(a0) /* q0 = quantptr[ 8] */
  3367. lh s4, 80(a0) /* q2 = quantptr[40] */
  3368. lh s3, 48(a0) /* q1 = quantptr[24] */
  3369. mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */
  3370. mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */
  3371. sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
  3372. mul t6, t6, s2 /* z1 = z1 * q0 */
  3373. mul t8, t8, s4 /* z3 = z3 * q2 */
  3374. mul t7, t7, s3 /* z2 = z2 * q1 */
  3375. addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */
  3376. sll t2, t2, 1 /* tmp2 = tmp2 << 1 */
  3377. subu t4, t0, t2 /* tmp11 = tmp0 - tmp2; */
  3378. subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */
  3379. addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */
  3380. addu t1, t6, t8 /* tmp1 = z1 + z3 */
  3381. mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */
  3382. shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
  3383. subu t2, t6, t8 /* tmp2 = z1 - z3 */
  3384. subu t2, t2, t7 /* tmp2 = tmp2 - z2 */
  3385. sll t2, t2, 2 /* tmp2 = tmp2 << 2 */
  3386. addu t0, t6, t7 /* tmp0 = z1 + z2 */
  3387. sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
  3388. subu s2, t8, t7 /* q0 = z3 - z2 */
  3389. sll s2, s2, 13 /* q0 = q0 << 13 */
  3390. addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */
  3391. addu t1, s2, t1 /* tmp1 = q0 + tmp1 */
  3392. addu s2, t4, t2 /* q0 = tmp11 + tmp2 */
  3393. subu s3, t4, t2 /* q1 = tmp11 - tmp2 */
  3394. addu t6, t3, t0 /* z1 = tmp10 + tmp0 */
  3395. subu t7, t3, t0 /* z2 = tmp10 - tmp0 */
  3396. addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */
  3397. subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */
  3398. shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */
  3399. shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */
  3400. shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
  3401. shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */
  3402. sw s2, 24(v0) /* workspace row 1 */
  3403. sw s3, 96(v0) /* workspace row 4 */
  3404. sw t6, 0(v0) /* workspace row 0 */
  3405. sw t7, 120(v0) /* workspace row 5 */
  3406. sw t4, 48(v0) /* workspace row 2 */
  3407. sw t5, 72(v0) /* workspace row 3 */
  3408. addiu v0, v0, 4 /* next workspace column */
  3409. addiu a1, a1, 2 /* next inptr column */
  3410. bne v0, v1, 1b
  3411. addiu a0, a0, 2 /* next quantptr column; presumably a branch delay slot (noreorder directive not visible here) */
  3412. /* Pass 2: process 6 rows from work array, store into output array. */
  3413. move v0, sp /* v0 = wsptr, rewound to the workspace base */
  3414. addiu v1, v0, 144 /* pass 2 ends after 6 rows of 24 bytes */
  3415. 2:
  3416. lw t0, 0(v0) /* wsptr[0] */
  3417. lw t2, 16(v0) /* wsptr[4] */
  3418. lw s5, 0(a2) /* output_buf[row] */
  3419. addiu t0, t0, 16 /* rounding term: 16 << 13 is half of 1 << 18 */
  3420. sll t0, t0, 13 /* tmp0 = (wsptr[0] + 16) << 13 */
  3421. mul t3, t2, t9 /* tmp2 = wsptr[4] * 5793 */
  3422. lw t6, 4(v0) /* z1 = wsptr[1] */
  3423. lw t8, 20(v0) /* z3 = wsptr[5] */
  3424. lw t7, 12(v0) /* z2 = wsptr[3] */
  3425. addu s5, s5, a3 /* outptr = output_buf[row] + output_col */
  3426. addu s6, t6, t8 /* z1 + z3 */
  3427. mul s6, s6, s1 /* s6 = (z1 + z3) * 2998 */
  3428. addu t1, t0, t3 /* t1 = tmp0 + tmp2 */
  3429. subu t4, t0, t3
  3430. subu t4, t4, t3 /* tmp11 = tmp0 - 2 * tmp2 */
  3431. lw t3, 8(v0) /* wsptr[2] */
  3432. mul t0, t3, s0 /* tmp1 = wsptr[2] * 10033 */
  3433. addu s7, t6, t7 /* z1 + z2 */
  3434. sll s7, s7, 13
  3435. addu s7, s6, s7 /* s7 = ((z1 + z2) << 13) + s6 */
  3436. subu t2, t8, t7 /* z3 - z2 */
  3437. sll t2, t2, 13
  3438. addu t2, s6, t2 /* t2 = ((z3 - z2) << 13) + s6 */
  3439. subu s6, t6, t7 /* z1 - z2 */
  3440. subu s6, s6, t8 /* z1 - z2 - z3 */
  3441. sll s6, s6, 13 /* s6 = (z1 - z2 - z3) << 13 */
  3442. addu t3, t1, t0 /* tmp10 = tmp0 + tmp2 + tmp1 */
  3443. subu t5, t1, t0 /* tmp12 = tmp0 + tmp2 - tmp1 */
  3444. addu t6, t3, s7 /* output sample 0 */
  3445. subu t3, t3, s7 /* output sample 5 */
  3446. addu t7, t4, s6 /* output sample 1 */
  3447. subu t4, t4, s6 /* output sample 4 */
  3448. addu t8, t5, t2 /* output sample 2 */
  3449. subu t5, t5, t2 /* output sample 3 */
  /* A saturating shift left by 6 followed by an arithmetic shift right by
     24 is a net right shift by 18 clamped to -128..127; adding 128 maps the
     result to 0..255. */
  3450. shll_s.w t6, t6, 6
  3451. shll_s.w t3, t3, 6
  3452. shll_s.w t7, t7, 6
  3453. shll_s.w t4, t4, 6
  3454. shll_s.w t8, t8, 6
  3455. shll_s.w t5, t5, 6
  3456. sra t6, t6, 24
  3457. addiu t6, t6, 128
  3458. sra t3, t3, 24
  3459. addiu t3, t3, 128
  3460. sb t6, 0(s5)
  3461. sra t7, t7, 24
  3462. addiu t7, t7, 128
  3463. sb t3, 5(s5)
  3464. sra t4, t4, 24
  3465. addiu t4, t4, 128
  3466. sb t7, 1(s5)
  3467. sra t8, t8, 24
  3468. addiu t8, t8, 128
  3469. sb t4, 4(s5)
  3470. addiu v0, v0, 24 /* next workspace row */
  3471. sra t5, t5, 24
  3472. addiu t5, t5, 128
  3473. sb t8, 2(s5)
  3474. addiu a2, a2, 4 /* next output_buf row pointer */
  3475. bne v0, v1, 2b
  3476. sb t5, 3(s5) /* presumably a branch delay slot (noreorder directive not visible here) */
  3477. addiu sp, sp, 144 /* release the workspace */
  3478. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  3479. j ra
  3480. nop
  3481. END(jsimd_idct_6x6_dspr2)
  3482. /*****************************************************************************/
  3483. LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
  3484. /*
  3485. * a0 = compptr->dct_table
  3486. * a1 = coef_block
  3487. * a2 = workspace
  *
  * First pass of the 12x12 inverse DCT. The loop below runs 8 times
  * (counter in a3). Each iteration reads eight halfword coefficients from
  * a1 and eight halfword multipliers from a0 (byte offsets 0, 16, ..., 112),
  * multiplies them pairwise, evaluates the odd and even parts named in the
  * in-line comments, shifts each of the twelve results right arithmetically
  * by 11 and stores them as words to a2 at byte offsets 0, 32, ..., 352.
  * a0 and a1 then advance by 2 bytes and a2 by 4 bytes.
  *
  * Registers written: v0, a0-a3, t0-t9 and s0-s3; s0-s3 are handed to
  * SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK around the body.
  *
  * NOTE(review): the instruction following each branch or jump is
  * presumably a delay-slot instruction (this assumes LEAF_DSPR2 selects
  * .set noreorder -- confirm), so the statement order must not be changed.
  3488. */
  3489. SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
  3490. li a3, 8 /* a3 = loop counter (8 iterations) */
  3491. 1:
  3492. /* odd part */
  3493. lh t0, 48(a1) /* inptr[DCTSIZE*3] */
  3494. lh t1, 48(a0) /* quantptr[DCTSIZE*3] */
  3495. lh t2, 16(a1) /* inptr[DCTSIZE*1] */
  3496. lh t3, 16(a0) /* quantptr[DCTSIZE*1] */
  3497. lh t4, 80(a1) /* inptr[DCTSIZE*5] */
  3498. lh t5, 80(a0) /* quantptr[DCTSIZE*5] */
  3499. lh t6, 112(a1) /* inptr[DCTSIZE*7] */
  3500. lh t7, 112(a0) /* quantptr[DCTSIZE*7] */
  3501. mul t0, t0, t1 /* z2 */
  3502. mul t1, t2, t3 /* z1 */
  3503. mul t2, t4, t5 /* z3 */
  3504. mul t3, t6, t7 /* z4 */
  3505. li t4, 10703 /* FIX(1.306562965) */
  3506. li t5, 4433 /* FIX_0_541196100 */
  3507. li t6, 7053 /* FIX(0.860918669) */
  3508. mul t4, t0, t4 /* tmp11 */
  3509. mul t5, t0, t5 /* -tmp14 */
  3510. addu t7, t1, t2 /* tmp10 */
  3511. addu t8, t7, t3 /* tmp10 + z4 */
  3512. mul t6, t6, t8 /* tmp15 */
  3513. li t8, 2139 /* FIX(0.261052384) */
  3514. mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */
  3515. li t7, 2295 /* FIX(0.280143716) */
  3516. mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */
  3517. addu t9, t2, t3 /* z3 + z4 */
  3518. li s0, 8565 /* FIX(1.045510580) */
  3519. mul t9, t9, s0 /* -tmp13 */
  3520. li s0, 12112 /* FIX(1.478575242) */
  3521. mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
  3522. li s1, 12998 /* FIX(1.586706681) */
  3523. mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
  3524. li s2, 5540 /* FIX(0.676326758) */
  3525. mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
  3526. li s3, 16244 /* FIX(1.982889723) */
  3527. mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
  3528. subu t1, t1, t3 /* z1-=z4 */
  3529. subu t0, t0, t2 /* z2-=z3 */
  3530. addu t2, t0, t1 /* z1+z2 */
  3531. li t3, 4433 /* FIX_0_541196100 */
  3532. mul t2, t2, t3 /* z3 */
  3533. li t3, 6270 /* FIX_0_765366865 */
  3534. mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
  3535. li t3, 15137 /* FIX_1_847759065 */
  3536. mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
  3537. addu t8, t6, t8 /* tmp12 */
  3538. addu t3, t8, t4 /* tmp12 + tmp11 */
  3539. addu t3, t3, t7 /* tmp10 */
  3540. subu t8, t8, t9 /* tmp12 + tmp13 */
  3541. addu s0, t5, s0 /* -tmp14 + MULTIPLY(z3, FIX(1.478575242)) */
  3542. subu t8, t8, s0 /* tmp12 */
  3543. subu t9, t6, t9 /* tmp15 + tmp13 */
  3544. subu s1, s1, t4 /* MULTIPLY(z4, FIX(1.586706681)) - tmp11 */
  3545. addu t9, t9, s1 /* tmp13 */
  3546. subu t6, t6, t5 /* tmp15 + tmp14 */
  3547. subu t6, t6, s2
  3548. subu t6, t6, s3 /* tmp15 */
  3549. /* even part start */
  3550. lh t4, 64(a1)
  3551. lh t5, 64(a0)
  3552. lh t7, 32(a1)
  3553. lh s0, 32(a0)
  3554. lh s1, 0(a1)
  3555. lh s2, 0(a0)
  3556. lh s3, 96(a1)
  3557. lh v0, 96(a0)
  3558. mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4],
  3559. quantptr[DCTSIZE*4]) */
  3560. mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2],
  3561. quantptr[DCTSIZE*2]) */
  3562. mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0],
  3563. quantptr[DCTSIZE*0]) */
  3564. mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6],
  3565. quantptr[DCTSIZE*6]) */
  3566. /* odd part end */
  3567. addu t1, t2, t1 /* tmp11 */
  3568. subu t0, t2, t0 /* tmp14 */
  3569. /* update counter and pointers */
  3570. addiu a3, a3, -1
  3571. addiu a0, a0, 2
  3572. addiu a1, a1, 2
  3573. /* even part rest */
  3574. li s1, 10033 /* FIX(1.224744871) */
  3575. li s2, 11190 /* FIX(1.366025404) */
  3576. mul t4, t4, s1 /* z4 */
  3577. mul s1, t5, s2 /* z4 */
  3578. sll t5, t5, 13 /* z1 */
  3579. sll t7, t7, 13
  3580. addiu t7, t7, 1024 /* z3 (1024 = rounding bias for the >> 11 below) */
  3581. sll s0, s0, 13 /* z2 */
  3582. addu s2, t7, t4 /* tmp10 */
  3583. subu t4, t7, t4 /* tmp11 */
  3584. subu s3, t5, s0 /* tmp12 */
  3585. addu t2, t7, s3 /* tmp21 */
  3586. subu s3, t7, s3 /* tmp24 */
  3587. addu t7, s1, s0 /* tmp12 */
  3588. addu v0, s2, t7 /* tmp20 */
  3589. subu s2, s2, t7 /* tmp25 */
  3590. subu s1, s1, t5 /* z4 - z1 */
  3591. subu s1, s1, s0 /* tmp12 */
  3592. addu s0, t4, s1 /* tmp22 */
  3593. subu t4, t4, s1 /* tmp23 */
  3594. /* final output stage */
  3595. addu t5, v0, t3 /* tmp20 + tmp10 */
  3596. subu v0, v0, t3 /* tmp20 - tmp10 */
  3597. addu t3, t2, t1 /* tmp21 + tmp11 */
  3598. subu t2, t2, t1 /* tmp21 - tmp11 */
  3599. addu t1, s0, t8 /* tmp22 + tmp12 */
  3600. subu s0, s0, t8 /* tmp22 - tmp12 */
  3601. addu t8, t4, t9 /* tmp23 + tmp13 */
  3602. subu t4, t4, t9 /* tmp23 - tmp13 */
  3603. addu t9, s3, t0 /* tmp24 + tmp14 */
  3604. subu s3, s3, t0 /* tmp24 - tmp14 */
  3605. addu t0, s2, t6 /* tmp25 + tmp15 */
  3606. subu s2, s2, t6 /* tmp25 - tmp15 */
  /* descale every result by an arithmetic right shift of 11 */
  3607. sra t5, t5, 11
  3608. sra t3, t3, 11
  3609. sra t1, t1, 11
  3610. sra t8, t8, 11
  3611. sra t9, t9, 11
  3612. sra t0, t0, 11
  3613. sra s2, s2, 11
  3614. sra s3, s3, 11
  3615. sra t4, t4, 11
  3616. sra s0, s0, 11
  3617. sra t2, t2, 11
  3618. sra v0, v0, 11
  /* store the twelve results, 32 bytes apart */
  3619. sw t5, 0(a2)
  3620. sw t3, 32(a2)
  3621. sw t1, 64(a2)
  3622. sw t8, 96(a2)
  3623. sw t9, 128(a2)
  3624. sw t0, 160(a2)
  3625. sw s2, 192(a2)
  3626. sw s3, 224(a2)
  3627. sw t4, 256(a2)
  3628. sw s0, 288(a2)
  3629. sw t2, 320(a2)
  3630. sw v0, 352(a2)
  3631. bgtz a3, 1b
  3632. addiu a2, a2, 4 /* next workspace word; presumably the branch delay slot */
  3633. RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
  3634. j ra
  3635. nop
  3636. END(jsimd_idct_12x12_pass1_dspr2)
  3637. /*****************************************************************************/
  3638. LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
  3639. /*
  3640. * a0 = workspace
  3641. * a1 = output
  *
  * Second pass of the 12x12 inverse DCT. The loop runs 12 times (counter
  * in a3). Each iteration reads eight words from a0 (byte offsets 0..28),
  * evaluates the odd and even parts named in the in-line comments, and
  * writes twelve bytes through the pointer loaded from 0(a1). a0 advances
  * by 32 bytes and a1 by 4 bytes per iteration.
  *
  * Output scaling: each sum is shifted left by 4, then by 2 more with
  * shll_s.w (the saturating DSP shift), its top byte is extracted with
  * srl 24, 0x80 is added and sb stores the low byte.
  *
  * Registers written: v0, a0, a1, a3, t0-t9 and s0-s3; s0-s3 are handed to
  * SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK around the body.
  *
  * NOTE(review): the instruction following each branch or jump is
  * presumably a delay-slot instruction (this assumes LEAF_DSPR2 selects
  * .set noreorder -- confirm), so the statement order must not be changed.
  3642. */
  3643. SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
  3644. li a3, 12 /* a3 = loop counter (12 iterations) */
  3645. 1:
  3646. /* Odd part */
  3647. lw t0, 12(a0) /* z2 */
  3648. lw t1, 4(a0) /* z1 */
  3649. lw t2, 20(a0) /* z3 */
  3650. lw t3, 28(a0) /* z4 */
  3651. li t4, 10703 /* FIX(1.306562965) */
  3652. li t5, 4433 /* FIX_0_541196100 */
  3653. mul t4, t0, t4 /* tmp11 */
  3654. mul t5, t0, t5 /* -tmp14 */
  3655. addu t6, t1, t2 /* tmp10 */
  3656. li t7, 2139 /* FIX(0.261052384) */
  3657. mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */
  3658. addu t6, t6, t3 /* tmp10 + z4 */
  3659. li t8, 7053 /* FIX(0.860918669) */
  3660. mul t6, t6, t8 /* tmp15 */
  3661. li t8, 2295 /* FIX(0.280143716) */
  3662. mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */
  3663. addu t9, t2, t3 /* z3 + z4 */
  3664. li s0, 8565 /* FIX(1.045510580) */
  3665. mul t9, t9, s0 /* -tmp13 */
  3666. li s0, 12112 /* FIX(1.478575242) */
  3667. mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
  3668. li s1, 12998 /* FIX(1.586706681) */
  3669. mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
  3670. li s2, 5540 /* FIX(0.676326758) */
  3671. mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
  3672. li s3, 16244 /* FIX(1.982889723) */
  3673. mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
  3674. subu t1, t1, t3 /* z1 -= z4 */
  3675. subu t0, t0, t2 /* z2 -= z3 */
  3676. addu t2, t1, t0 /* z1 + z2 */
  3677. li t3, 4433 /* FIX_0_541196100 */
  3678. mul t2, t2, t3 /* z3 */
  3679. li t3, 6270 /* FIX_0_765366865 */
  3680. mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
  3681. li t3, 15137 /* FIX_1_847759065 */
  3682. mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
  3683. addu t3, t6, t7 /* tmp12 */
  3684. addu t7, t3, t4 /* tmp12 + tmp11 */
  3685. addu t7, t7, t8 /* tmp10 */
  3686. subu t3, t3, t9 /* tmp12 + tmp13 */
  3687. subu t3, t3, t5 /* ... + tmp14 */
  3688. subu t3, t3, s0 /* tmp12 */
  3689. subu t9, t6, t9 /* tmp15 + tmp13 */
  3690. subu t9, t9, t4 /* ... - tmp11 */
  3691. addu t9, t9, s1 /* tmp13 */
  3692. subu t6, t6, t5 /* tmp15 + tmp14 */
  3693. subu t6, t6, s2
  3694. subu t6, t6, s3 /* tmp15 */
  3695. addu t1, t2, t1 /* tmp11 */
  3696. subu t0, t2, t0 /* tmp14 */
  3697. /* even part */
  3698. lw t2, 16(a0) /* z4 */
  3699. lw t4, 8(a0) /* z1 */
  3700. lw t5, 0(a0) /* z3 */
  3701. lw t8, 24(a0) /* z2 */
  3702. li s0, 10033 /* FIX(1.224744871) */
  3703. li s1, 11190 /* FIX(1.366025404) */
  3704. mul t2, t2, s0 /* z4 */
  3705. mul s0, t4, s1 /* z4 */
  3706. addiu t5, t5, 0x10 /* rounding bias: 0x10 << 13 == 1 << 17 */
  3707. sll t5, t5, 13 /* z3 */
  3708. sll t4, t4, 13 /* z1 */
  3709. sll t8, t8, 13 /* z2 */
  3710. subu s1, t4, t8 /* tmp12 */
  3711. addu s2, t5, t2 /* tmp10 */
  3712. subu t2, t5, t2 /* tmp11 */
  3713. addu s3, t5, s1 /* tmp21 */
  3714. subu s1, t5, s1 /* tmp24 */
  3715. addu t5, s0, t8 /* tmp12 */
  3716. addu v0, s2, t5 /* tmp20 */
  3717. subu t5, s2, t5 /* tmp25 */
  3718. subu t4, s0, t4 /* z4 - z1 */
  3719. subu t4, t4, t8 /* tmp12 */
  3720. addu t8, t2, t4 /* tmp22 */
  3721. subu t2, t2, t4 /* tmp23 */
  3722. /* increment counter and pointers */
  3723. addiu a3, a3, -1
  3724. addiu a0, a0, 32
  3725. /* Final stage */
  3726. addu t4, v0, t7 /* tmp20 + tmp10 */
  3727. subu v0, v0, t7 /* tmp20 - tmp10 */
  3728. addu t7, s3, t1 /* tmp21 + tmp11 */
  3729. subu s3, s3, t1 /* tmp21 - tmp11 */
  3730. addu t1, t8, t3 /* tmp22 + tmp12 */
  3731. subu t8, t8, t3 /* tmp22 - tmp12 */
  3732. addu t3, t2, t9 /* tmp23 + tmp13 */
  3733. subu t2, t2, t9 /* tmp23 - tmp13 */
  3734. addu t9, s1, t0 /* tmp24 + tmp14 */
  3735. subu s1, s1, t0 /* tmp24 - tmp14 */
  3736. addu t0, t5, t6 /* tmp25 + tmp15 */
  3737. subu t5, t5, t6 /* tmp25 - tmp15 */
  /* plain left shift by 4 ... */
  3738. sll t4, t4, 4
  3739. sll t7, t7, 4
  3740. sll t1, t1, 4
  3741. sll t3, t3, 4
  3742. sll t9, t9, 4
  3743. sll t0, t0, 4
  3744. sll t5, t5, 4
  3745. sll s1, s1, 4
  3746. sll t2, t2, 4
  3747. sll t8, t8, 4
  3748. sll s3, s3, 4
  3749. sll v0, v0, 4
  /* ... followed by a saturating left shift by 2 */
  3750. shll_s.w t4, t4, 2
  3751. shll_s.w t7, t7, 2
  3752. shll_s.w t1, t1, 2
  3753. shll_s.w t3, t3, 2
  3754. shll_s.w t9, t9, 2
  3755. shll_s.w t0, t0, 2
  3756. shll_s.w t5, t5, 2
  3757. shll_s.w s1, s1, 2
  3758. shll_s.w t2, t2, 2
  3759. shll_s.w t8, t8, 2
  3760. shll_s.w s3, s3, 2
  3761. shll_s.w v0, v0, 2
  /* move the top byte of each word down to bits 7..0 */
  3762. srl t4, t4, 24
  3763. srl t7, t7, 24
  3764. srl t1, t1, 24
  3765. srl t3, t3, 24
  3766. srl t9, t9, 24
  3767. srl t0, t0, 24
  3768. srl t5, t5, 24
  3769. srl s1, s1, 24
  3770. srl t2, t2, 24
  3771. srl t8, t8, 24
  3772. srl s3, s3, 24
  3773. srl v0, v0, 24
  3774. lw t6, 0(a1) /* t6 = destination pointer for this iteration's 12 bytes */
  /* add 0x80; only the low byte of each sum is stored below */
  3775. addiu t4, t4, 0x80
  3776. addiu t7, t7, 0x80
  3777. addiu t1, t1, 0x80
  3778. addiu t3, t3, 0x80
  3779. addiu t9, t9, 0x80
  3780. addiu t0, t0, 0x80
  3781. addiu t5, t5, 0x80
  3782. addiu s1, s1, 0x80
  3783. addiu t2, t2, 0x80
  3784. addiu t8, t8, 0x80
  3785. addiu s3, s3, 0x80
  3786. addiu v0, v0, 0x80
  3787. sb t4, 0(t6)
  3788. sb t7, 1(t6)
  3789. sb t1, 2(t6)
  3790. sb t3, 3(t6)
  3791. sb t9, 4(t6)
  3792. sb t0, 5(t6)
  3793. sb t5, 6(t6)
  3794. sb s1, 7(t6)
  3795. sb t2, 8(t6)
  3796. sb t8, 9(t6)
  3797. sb s3, 10(t6)
  3798. sb v0, 11(t6)
  3799. bgtz a3, 1b
  3800. addiu a1, a1, 4 /* next output pointer slot; presumably the branch delay slot */
  3801. RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
  3802. jr ra
  3803. nop
  3804. END(jsimd_idct_12x12_pass2_dspr2)
  3805. /*****************************************************************************/
  3806. LEAF_DSPR2(jsimd_convsamp_dspr2)
  3807. /*
  3808. * a0 = sample_data
  3809. * a1 = start_col
  3810. * a2 = workspace
  *
  * For each of the eight pointers read from a0 (byte offsets 0, 4, ..., 28),
  * adds a1 to the pointer, loads 8 bytes with two unaligned word loads
  * (ulw), widens every byte to a halfword (preceu.ph.qbr for the two
  * right-hand bytes of a word, preceu.ph.qbl for the two left-hand ones),
  * adds 0xff80 to each halfword (addu.ph with t7 = 0xff80ff80, i.e.
  * subtracts 128 modulo 2^16) and stores the resulting 16 bytes to a2 with
  * usw. Row r lands at byte offsets 16 * r .. 16 * r + 15 of the workspace.
  *
  * The code is fully unrolled, and the loads for the next row are
  * interleaved with the arithmetic and stores of the current one, so the
  * statement order matters. Only t0-t7 are written; nothing is saved on
  * the stack.
  3811. */
  3812. lw t0, 0(a0) /* t0 = pointer for row 0 */
  3813. li t7, 0xff80ff80 /* per-halfword bias: 0xff80 == -128 (mod 2^16) */
  3814. addu t0, t0, a1
  3815. ulw t1, 0(t0)
  3816. ulw t2, 4(t0)
  3817. preceu.ph.qbr t3, t1
  3818. preceu.ph.qbl t4, t1
  3819. lw t0, 4(a0) /* t0 = pointer for row 1 */
  3820. preceu.ph.qbr t5, t2
  3821. preceu.ph.qbl t6, t2
  3822. addu t0, t0, a1
  3823. addu.ph t3, t3, t7
  3824. addu.ph t4, t4, t7
  3825. ulw t1, 0(t0)
  3826. ulw t2, 4(t0)
  3827. addu.ph t5, t5, t7
  3828. addu.ph t6, t6, t7
  3829. usw t3, 0(a2) /* row 0 -> workspace bytes 0..15 */
  3830. usw t4, 4(a2)
  3831. preceu.ph.qbr t3, t1
  3832. preceu.ph.qbl t4, t1
  3833. usw t5, 8(a2)
  3834. usw t6, 12(a2)
  3835. lw t0, 8(a0) /* t0 = pointer for row 2 */
  3836. preceu.ph.qbr t5, t2
  3837. preceu.ph.qbl t6, t2
  3838. addu t0, t0, a1
  3839. addu.ph t3, t3, t7
  3840. addu.ph t4, t4, t7
  3841. ulw t1, 0(t0)
  3842. ulw t2, 4(t0)
  3843. addu.ph t5, t5, t7
  3844. addu.ph t6, t6, t7
  3845. usw t3, 16(a2) /* row 1 -> workspace bytes 16..31 */
  3846. usw t4, 20(a2)
  3847. preceu.ph.qbr t3, t1
  3848. preceu.ph.qbl t4, t1
  3849. usw t5, 24(a2)
  3850. usw t6, 28(a2)
  3851. lw t0, 12(a0) /* t0 = pointer for row 3 */
  3852. preceu.ph.qbr t5, t2
  3853. preceu.ph.qbl t6, t2
  3854. addu t0, t0, a1
  3855. addu.ph t3, t3, t7
  3856. addu.ph t4, t4, t7
  3857. ulw t1, 0(t0)
  3858. ulw t2, 4(t0)
  3859. addu.ph t5, t5, t7
  3860. addu.ph t6, t6, t7
  3861. usw t3, 32(a2) /* row 2 -> workspace bytes 32..47 */
  3862. usw t4, 36(a2)
  3863. preceu.ph.qbr t3, t1
  3864. preceu.ph.qbl t4, t1
  3865. usw t5, 40(a2)
  3866. usw t6, 44(a2)
  3867. lw t0, 16(a0) /* t0 = pointer for row 4 */
  3868. preceu.ph.qbr t5, t2
  3869. preceu.ph.qbl t6, t2
  3870. addu t0, t0, a1
  3871. addu.ph t3, t3, t7
  3872. addu.ph t4, t4, t7
  3873. ulw t1, 0(t0)
  3874. ulw t2, 4(t0)
  3875. addu.ph t5, t5, t7
  3876. addu.ph t6, t6, t7
  3877. usw t3, 48(a2) /* row 3 -> workspace bytes 48..63 */
  3878. usw t4, 52(a2)
  3879. preceu.ph.qbr t3, t1
  3880. preceu.ph.qbl t4, t1
  3881. usw t5, 56(a2)
  3882. usw t6, 60(a2)
  3883. lw t0, 20(a0) /* t0 = pointer for row 5 */
  3884. preceu.ph.qbr t5, t2
  3885. preceu.ph.qbl t6, t2
  3886. addu t0, t0, a1
  3887. addu.ph t3, t3, t7
  3888. addu.ph t4, t4, t7
  3889. ulw t1, 0(t0)
  3890. ulw t2, 4(t0)
  3891. addu.ph t5, t5, t7
  3892. addu.ph t6, t6, t7
  3893. usw t3, 64(a2) /* row 4 -> workspace bytes 64..79 */
  3894. usw t4, 68(a2)
  3895. preceu.ph.qbr t3, t1
  3896. preceu.ph.qbl t4, t1
  3897. usw t5, 72(a2)
  3898. usw t6, 76(a2)
  3899. lw t0, 24(a0) /* t0 = pointer for row 6 */
  3900. preceu.ph.qbr t5, t2
  3901. preceu.ph.qbl t6, t2
  3902. addu t0, t0, a1
  3903. addu.ph t3, t3, t7
  3904. addu.ph t4, t4, t7
  3905. ulw t1, 0(t0)
  3906. ulw t2, 4(t0)
  3907. addu.ph t5, t5, t7
  3908. addu.ph t6, t6, t7
  3909. usw t3, 80(a2) /* row 5 -> workspace bytes 80..95 */
  3910. usw t4, 84(a2)
  3911. preceu.ph.qbr t3, t1
  3912. preceu.ph.qbl t4, t1
  3913. usw t5, 88(a2)
  3914. usw t6, 92(a2)
  3915. lw t0, 28(a0) /* t0 = pointer for row 7 */
  3916. preceu.ph.qbr t5, t2
  3917. preceu.ph.qbl t6, t2
  3918. addu t0, t0, a1
  3919. addu.ph t3, t3, t7
  3920. addu.ph t4, t4, t7
  3921. ulw t1, 0(t0)
  3922. ulw t2, 4(t0)
  3923. addu.ph t5, t5, t7
  3924. addu.ph t6, t6, t7
  3925. usw t3, 96(a2) /* row 6 -> workspace bytes 96..111 */
  3926. usw t4, 100(a2)
  3927. preceu.ph.qbr t3, t1
  3928. preceu.ph.qbl t4, t1
  3929. usw t5, 104(a2)
  3930. usw t6, 108(a2)
  3931. preceu.ph.qbr t5, t2
  3932. preceu.ph.qbl t6, t2
  3933. addu.ph t3, t3, t7
  3934. addu.ph t4, t4, t7
  3935. addu.ph t5, t5, t7
  3936. addu.ph t6, t6, t7
  3937. usw t3, 112(a2) /* row 7 -> workspace bytes 112..127 */
  3938. usw t4, 116(a2)
  3939. usw t5, 120(a2)
  3940. usw t6, 124(a2)
  3941. j ra
  3942. nop
  3943. END(jsimd_convsamp_dspr2)
  3944. #ifndef __mips_soft_float
  3945. /*****************************************************************************/
  3946. LEAF_DSPR2(jsimd_convsamp_float_dspr2)
  3947. /*
  3948. * a0 = sample_data
  3949. * a1 = start_col
  3950. * a2 = workspace
  *
  * For each of the eight pointers read from a0 (byte offsets 0, 4, ..., 28),
  * adds a1 to the pointer, loads 8 bytes with lbu, subtracts 128 from each,
  * moves the integers into f2, f4, ..., f16 (mtc1), converts them to single
  * precision (cvt.s.w) and stores them to a2 with swc1. Row r occupies
  * byte offsets 32 * r .. 32 * r + 31 of the workspace.
  *
  * The code is fully unrolled; the pointer for the next row is fetched
  * while the current row is being stored. Only t0-t8 and the even-numbered
  * FP registers f2-f16 are written; nothing is saved on the stack.
  3951. */
  3952. .set at /* let the assembler use $at -- NOTE(review): reason not visible here */
  3953. lw t0, 0(a0) /* t0 = pointer for row 0 */
  3954. addu t0, t0, a1
  /* row 0 */
  3955. lbu t1, 0(t0)
  3956. lbu t2, 1(t0)
  3957. lbu t3, 2(t0)
  3958. lbu t4, 3(t0)
  3959. lbu t5, 4(t0)
  3960. lbu t6, 5(t0)
  3961. lbu t7, 6(t0)
  3962. lbu t8, 7(t0)
  3963. addiu t1, t1, -128
  3964. addiu t2, t2, -128
  3965. addiu t3, t3, -128
  3966. addiu t4, t4, -128
  3967. addiu t5, t5, -128
  3968. addiu t6, t6, -128
  3969. addiu t7, t7, -128
  3970. addiu t8, t8, -128
  3971. mtc1 t1, f2
  3972. mtc1 t2, f4
  3973. mtc1 t3, f6
  3974. mtc1 t4, f8
  3975. mtc1 t5, f10
  3976. mtc1 t6, f12
  3977. mtc1 t7, f14
  3978. mtc1 t8, f16
  3979. cvt.s.w f2, f2
  3980. cvt.s.w f4, f4
  3981. cvt.s.w f6, f6
  3982. cvt.s.w f8, f8
  3983. cvt.s.w f10, f10
  3984. cvt.s.w f12, f12
  3985. cvt.s.w f14, f14
  3986. cvt.s.w f16, f16
  3987. lw t0, 4(a0) /* t0 = pointer for row 1 */
  3988. swc1 f2, 0(a2)
  3989. swc1 f4, 4(a2)
  3990. swc1 f6, 8(a2)
  3991. addu t0, t0, a1
  3992. swc1 f8, 12(a2)
  3993. swc1 f10, 16(a2)
  3994. swc1 f12, 20(a2)
  3995. swc1 f14, 24(a2)
  3996. swc1 f16, 28(a2)
  3997. /* row 1 */
  3998. lbu t1, 0(t0)
  3999. lbu t2, 1(t0)
  4000. lbu t3, 2(t0)
  4001. lbu t4, 3(t0)
  4002. lbu t5, 4(t0)
  4003. lbu t6, 5(t0)
  4004. lbu t7, 6(t0)
  4005. lbu t8, 7(t0)
  4006. addiu t1, t1, -128
  4007. addiu t2, t2, -128
  4008. addiu t3, t3, -128
  4009. addiu t4, t4, -128
  4010. addiu t5, t5, -128
  4011. addiu t6, t6, -128
  4012. addiu t7, t7, -128
  4013. addiu t8, t8, -128
  4014. mtc1 t1, f2
  4015. mtc1 t2, f4
  4016. mtc1 t3, f6
  4017. mtc1 t4, f8
  4018. mtc1 t5, f10
  4019. mtc1 t6, f12
  4020. mtc1 t7, f14
  4021. mtc1 t8, f16
  4022. cvt.s.w f2, f2
  4023. cvt.s.w f4, f4
  4024. cvt.s.w f6, f6
  4025. cvt.s.w f8, f8
  4026. cvt.s.w f10, f10
  4027. cvt.s.w f12, f12
  4028. cvt.s.w f14, f14
  4029. cvt.s.w f16, f16
  4030. lw t0, 8(a0) /* t0 = pointer for row 2 */
  4031. swc1 f2, 32(a2)
  4032. swc1 f4, 36(a2)
  4033. swc1 f6, 40(a2)
  4034. addu t0, t0, a1
  4035. swc1 f8, 44(a2)
  4036. swc1 f10, 48(a2)
  4037. swc1 f12, 52(a2)
  4038. swc1 f14, 56(a2)
  4039. swc1 f16, 60(a2)
  4040. /* row 2 */
  4041. lbu t1, 0(t0)
  4042. lbu t2, 1(t0)
  4043. lbu t3, 2(t0)
  4044. lbu t4, 3(t0)
  4045. lbu t5, 4(t0)
  4046. lbu t6, 5(t0)
  4047. lbu t7, 6(t0)
  4048. lbu t8, 7(t0)
  4049. addiu t1, t1, -128
  4050. addiu t2, t2, -128
  4051. addiu t3, t3, -128
  4052. addiu t4, t4, -128
  4053. addiu t5, t5, -128
  4054. addiu t6, t6, -128
  4055. addiu t7, t7, -128
  4056. addiu t8, t8, -128
  4057. mtc1 t1, f2
  4058. mtc1 t2, f4
  4059. mtc1 t3, f6
  4060. mtc1 t4, f8
  4061. mtc1 t5, f10
  4062. mtc1 t6, f12
  4063. mtc1 t7, f14
  4064. mtc1 t8, f16
  4065. cvt.s.w f2, f2
  4066. cvt.s.w f4, f4
  4067. cvt.s.w f6, f6
  4068. cvt.s.w f8, f8
  4069. cvt.s.w f10, f10
  4070. cvt.s.w f12, f12
  4071. cvt.s.w f14, f14
  4072. cvt.s.w f16, f16
  4073. lw t0, 12(a0) /* t0 = pointer for row 3 */
  4074. swc1 f2, 64(a2)
  4075. swc1 f4, 68(a2)
  4076. swc1 f6, 72(a2)
  4077. addu t0, t0, a1
  4078. swc1 f8, 76(a2)
  4079. swc1 f10, 80(a2)
  4080. swc1 f12, 84(a2)
  4081. swc1 f14, 88(a2)
  4082. swc1 f16, 92(a2)
  4083. /* row 3 */
  4084. lbu t1, 0(t0)
  4085. lbu t2, 1(t0)
  4086. lbu t3, 2(t0)
  4087. lbu t4, 3(t0)
  4088. lbu t5, 4(t0)
  4089. lbu t6, 5(t0)
  4090. lbu t7, 6(t0)
  4091. lbu t8, 7(t0)
  4092. addiu t1, t1, -128
  4093. addiu t2, t2, -128
  4094. addiu t3, t3, -128
  4095. addiu t4, t4, -128
  4096. addiu t5, t5, -128
  4097. addiu t6, t6, -128
  4098. addiu t7, t7, -128
  4099. addiu t8, t8, -128
  4100. mtc1 t1, f2
  4101. mtc1 t2, f4
  4102. mtc1 t3, f6
  4103. mtc1 t4, f8
  4104. mtc1 t5, f10
  4105. mtc1 t6, f12
  4106. mtc1 t7, f14
  4107. mtc1 t8, f16
  4108. cvt.s.w f2, f2
  4109. cvt.s.w f4, f4
  4110. cvt.s.w f6, f6
  4111. cvt.s.w f8, f8
  4112. cvt.s.w f10, f10
  4113. cvt.s.w f12, f12
  4114. cvt.s.w f14, f14
  4115. cvt.s.w f16, f16
  4116. lw t0, 16(a0) /* t0 = pointer for row 4 */
  4117. swc1 f2, 96(a2)
  4118. swc1 f4, 100(a2)
  4119. swc1 f6, 104(a2)
  4120. addu t0, t0, a1
  4121. swc1 f8, 108(a2)
  4122. swc1 f10, 112(a2)
  4123. swc1 f12, 116(a2)
  4124. swc1 f14, 120(a2)
  4125. swc1 f16, 124(a2)
  4126. /* row 4 */
  4127. lbu t1, 0(t0)
  4128. lbu t2, 1(t0)
  4129. lbu t3, 2(t0)
  4130. lbu t4, 3(t0)
  4131. lbu t5, 4(t0)
  4132. lbu t6, 5(t0)
  4133. lbu t7, 6(t0)
  4134. lbu t8, 7(t0)
  4135. addiu t1, t1, -128
  4136. addiu t2, t2, -128
  4137. addiu t3, t3, -128
  4138. addiu t4, t4, -128
  4139. addiu t5, t5, -128
  4140. addiu t6, t6, -128
  4141. addiu t7, t7, -128
  4142. addiu t8, t8, -128
  4143. mtc1 t1, f2
  4144. mtc1 t2, f4
  4145. mtc1 t3, f6
  4146. mtc1 t4, f8
  4147. mtc1 t5, f10
  4148. mtc1 t6, f12
  4149. mtc1 t7, f14
  4150. mtc1 t8, f16
  4151. cvt.s.w f2, f2
  4152. cvt.s.w f4, f4
  4153. cvt.s.w f6, f6
  4154. cvt.s.w f8, f8
  4155. cvt.s.w f10, f10
  4156. cvt.s.w f12, f12
  4157. cvt.s.w f14, f14
  4158. cvt.s.w f16, f16
  4159. lw t0, 20(a0) /* t0 = pointer for row 5 */
  4160. swc1 f2, 128(a2)
  4161. swc1 f4, 132(a2)
  4162. swc1 f6, 136(a2)
  4163. addu t0, t0, a1
  4164. swc1 f8, 140(a2)
  4165. swc1 f10, 144(a2)
  4166. swc1 f12, 148(a2)
  4167. swc1 f14, 152(a2)
  4168. swc1 f16, 156(a2)
  4169. /* row 5 */
  4170. lbu t1, 0(t0)
  4171. lbu t2, 1(t0)
  4172. lbu t3, 2(t0)
  4173. lbu t4, 3(t0)
  4174. lbu t5, 4(t0)
  4175. lbu t6, 5(t0)
  4176. lbu t7, 6(t0)
  4177. lbu t8, 7(t0)
  4178. addiu t1, t1, -128
  4179. addiu t2, t2, -128
  4180. addiu t3, t3, -128
  4181. addiu t4, t4, -128
  4182. addiu t5, t5, -128
  4183. addiu t6, t6, -128
  4184. addiu t7, t7, -128
  4185. addiu t8, t8, -128
  4186. mtc1 t1, f2
  4187. mtc1 t2, f4
  4188. mtc1 t3, f6
  4189. mtc1 t4, f8
  4190. mtc1 t5, f10
  4191. mtc1 t6, f12
  4192. mtc1 t7, f14
  4193. mtc1 t8, f16
  4194. cvt.s.w f2, f2
  4195. cvt.s.w f4, f4
  4196. cvt.s.w f6, f6
  4197. cvt.s.w f8, f8
  4198. cvt.s.w f10, f10
  4199. cvt.s.w f12, f12
  4200. cvt.s.w f14, f14
  4201. cvt.s.w f16, f16
  4202. lw t0, 24(a0) /* t0 = pointer for row 6 */
  4203. swc1 f2, 160(a2)
  4204. swc1 f4, 164(a2)
  4205. swc1 f6, 168(a2)
  4206. addu t0, t0, a1
  4207. swc1 f8, 172(a2)
  4208. swc1 f10, 176(a2)
  4209. swc1 f12, 180(a2)
  4210. swc1 f14, 184(a2)
  4211. swc1 f16, 188(a2)
  4212. /* row 6 */
  4213. lbu t1, 0(t0)
  4214. lbu t2, 1(t0)
  4215. lbu t3, 2(t0)
  4216. lbu t4, 3(t0)
  4217. lbu t5, 4(t0)
  4218. lbu t6, 5(t0)
  4219. lbu t7, 6(t0)
  4220. lbu t8, 7(t0)
  4221. addiu t1, t1, -128
  4222. addiu t2, t2, -128
  4223. addiu t3, t3, -128
  4224. addiu t4, t4, -128
  4225. addiu t5, t5, -128
  4226. addiu t6, t6, -128
  4227. addiu t7, t7, -128
  4228. addiu t8, t8, -128
  4229. mtc1 t1, f2
  4230. mtc1 t2, f4
  4231. mtc1 t3, f6
  4232. mtc1 t4, f8
  4233. mtc1 t5, f10
  4234. mtc1 t6, f12
  4235. mtc1 t7, f14
  4236. mtc1 t8, f16
  4237. cvt.s.w f2, f2
  4238. cvt.s.w f4, f4
  4239. cvt.s.w f6, f6
  4240. cvt.s.w f8, f8
  4241. cvt.s.w f10, f10
  4242. cvt.s.w f12, f12
  4243. cvt.s.w f14, f14
  4244. cvt.s.w f16, f16
  4245. lw t0, 28(a0) /* t0 = pointer for row 7 */
  4246. swc1 f2, 192(a2)
  4247. swc1 f4, 196(a2)
  4248. swc1 f6, 200(a2)
  4249. addu t0, t0, a1
  4250. swc1 f8, 204(a2)
  4251. swc1 f10, 208(a2)
  4252. swc1 f12, 212(a2)
  4253. swc1 f14, 216(a2)
  4254. swc1 f16, 220(a2)
  4255. /* row 7 */
  4256. lbu t1, 0(t0)
  4257. lbu t2, 1(t0)
  4258. lbu t3, 2(t0)
  4259. lbu t4, 3(t0)
  4260. lbu t5, 4(t0)
  4261. lbu t6, 5(t0)
  4262. lbu t7, 6(t0)
  4263. lbu t8, 7(t0)
  4264. addiu t1, t1, -128
  4265. addiu t2, t2, -128
  4266. addiu t3, t3, -128
  4267. addiu t4, t4, -128
  4268. addiu t5, t5, -128
  4269. addiu t6, t6, -128
  4270. addiu t7, t7, -128
  4271. addiu t8, t8, -128
  4272. mtc1 t1, f2
  4273. mtc1 t2, f4
  4274. mtc1 t3, f6
  4275. mtc1 t4, f8
  4276. mtc1 t5, f10
  4277. mtc1 t6, f12
  4278. mtc1 t7, f14
  4279. mtc1 t8, f16
  4280. cvt.s.w f2, f2
  4281. cvt.s.w f4, f4
  4282. cvt.s.w f6, f6
  4283. cvt.s.w f8, f8
  4284. cvt.s.w f10, f10
  4285. cvt.s.w f12, f12
  4286. cvt.s.w f14, f14
  4287. cvt.s.w f16, f16
  4288. swc1 f2, 224(a2)
  4289. swc1 f4, 228(a2)
  4290. swc1 f6, 232(a2)
  4291. swc1 f8, 236(a2)
  4292. swc1 f10, 240(a2)
  4293. swc1 f12, 244(a2)
  4294. swc1 f14, 248(a2)
  4295. swc1 f16, 252(a2)
  4296. j ra
  4297. nop
  4298. END(jsimd_convsamp_float_dspr2)
  4299. #endif
  4300. /*****************************************************************************/