| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862 |
- import itertools
- import math
- import warnings
- import threading
- from collections import namedtuple
- import numpy as np
- from numpy import (isscalar, log, around, zeros,
- arange, sort, amin, amax, sqrt, array,
- pi, exp, ravel, count_nonzero)
- from scipy import optimize, special, interpolate, stats
- from scipy._lib._bunch import _make_tuple_bunch
- from scipy._lib._util import _rename_parameter, _contains_nan, _get_nan
- from scipy._lib.deprecation import _NoValue
- import scipy._lib.array_api_extra as xpx
- from scipy._lib._array_api import (
- array_namespace,
- is_marray,
- xp_capabilities,
- is_numpy,
- is_jax,
- is_dask,
- xp_size,
- xp_vector_norm,
- xp_promote,
- xp_result_type,
- xp_device,
- xp_ravel,
- _length_nonmasked,
- )
- from ._ansari_swilk_statistics import gscale, swilk
- from . import _stats_py, _wilcoxon
- from ._fit import FitResult
- from ._stats_py import (_get_pvalue, SignificanceResult, # noqa:F401
- _SimpleNormal, _SimpleChi2, _SimpleF)
- from .contingency import chi2_contingency
- from . import distributions
- from ._distn_infrastructure import rv_generic
- from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
- __all__ = ['mvsdist',
- 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
- 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
- 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene',
- 'fligner', 'mood', 'wilcoxon', 'median_test',
- 'circmean', 'circvar', 'circstd', 'anderson_ksamp',
- 'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
- 'yeojohnson_normplot', 'directional_stats',
- 'false_discovery_control'
- ]
- Mean = namedtuple('Mean', ('statistic', 'minmax'))
- Variance = namedtuple('Variance', ('statistic', 'minmax'))
- Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))
- @xp_capabilities(np_only=True)
- def bayes_mvs(data, alpha=0.90):
- r"""
- Bayesian confidence intervals for the mean, var, and std.
- Parameters
- ----------
- data : array_like
- Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`.
- Requires 2 or more data points.
- alpha : float, optional
- Probability that the returned confidence interval contains
- the true parameter.
- Returns
- -------
- mean_cntr, var_cntr, std_cntr : tuple
- The three results are for the mean, variance and standard deviation,
- respectively. Each result is a tuple of the form::
- (center, (lower, upper))
- with ``center`` the mean of the conditional pdf of the value given the
- data, and ``(lower, upper)`` a confidence interval, centered on the
- median, containing the estimate to a probability ``alpha``.
- See Also
- --------
- mvsdist
- Notes
- -----
- Each tuple of mean, variance, and standard deviation estimates represent
- the (center, (lower, upper)) with center the mean of the conditional pdf
- of the value given the data and (lower, upper) is a confidence interval
- centered on the median, containing the estimate to a probability
- ``alpha``.
- Converts data to 1-D and assumes all data has the same mean and variance.
- Uses Jeffrey's prior for variance and std.
- Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(dat))``
- References
- ----------
- T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
- standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
- 2006.
- Examples
- --------
- First a basic example to demonstrate the outputs:
- >>> from scipy import stats
- >>> data = [6, 9, 12, 7, 8, 8, 13]
- >>> mean, var, std = stats.bayes_mvs(data)
- >>> mean
- Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
- >>> var
- Variance(statistic=10.0, minmax=(3.176724206, 24.45910382))
- >>> std
- Std_dev(statistic=2.9724954732045084,
- minmax=(1.7823367265645143, 4.945614605014631))
- Now we generate some normally distributed random data, and get estimates of
- mean and standard deviation with 95% confidence intervals for those
- estimates:
- >>> n_samples = 100000
- >>> data = stats.norm.rvs(size=n_samples)
- >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)
- >>> import matplotlib.pyplot as plt
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
- >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
- >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r',
- ... alpha=0.2, label=r'Estimated mean (95% limits)')
- >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
- >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2,
- ... label=r'Estimated scale (95% limits)')
- >>> ax.legend(fontsize=10)
- >>> ax.set_xlim([-4, 4])
- >>> ax.set_ylim([0, 0.5])
- >>> plt.show()
- """
- m, v, s = mvsdist(data)
- if alpha >= 1 or alpha <= 0:
- raise ValueError(f"0 < alpha < 1 is required, but {alpha=} was given.")
- m_res = Mean(m.mean(), m.interval(alpha))
- v_res = Variance(v.mean(), v.interval(alpha))
- s_res = Std_dev(s.mean(), s.interval(alpha))
- return m_res, v_res, s_res
- @xp_capabilities(np_only=True)
- def mvsdist(data):
- """
- 'Frozen' distributions for mean, variance, and standard deviation of data.
- Parameters
- ----------
- data : array_like
- Input array. Converted to 1-D using ravel.
- Requires 2 or more data-points.
- Returns
- -------
- mdist : "frozen" distribution object
- Distribution object representing the mean of the data.
- vdist : "frozen" distribution object
- Distribution object representing the variance of the data.
- sdist : "frozen" distribution object
- Distribution object representing the standard deviation of the data.
- See Also
- --------
- bayes_mvs
- Notes
- -----
- The return values from ``bayes_mvs(data)`` is equivalent to
- ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.
- In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
- on the three distribution objects returned from this function will give
- the same results that are returned from `bayes_mvs`.
- References
- ----------
- T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
- standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
- 2006.
- Examples
- --------
- >>> from scipy import stats
- >>> data = [6, 9, 12, 7, 8, 8, 13]
- >>> mean, var, std = stats.mvsdist(data)
- We now have frozen distribution objects "mean", "var" and "std" that we can
- examine:
- >>> mean.mean()
- 9.0
- >>> mean.interval(0.95)
- (6.6120585482655692, 11.387941451734431)
- >>> mean.std()
- 1.1952286093343936
- """
- x = ravel(data)
- n = len(x)
- if n < 2:
- raise ValueError("Need at least 2 data-points.")
- xbar = x.mean()
- C = x.var()
- if n > 1000: # gaussian approximations for large n
- mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
- sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
- vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
- else:
- nm1 = n - 1
- fac = n * C / 2.
- val = nm1 / 2.
- mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
- sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
- vdist = distributions.invgamma(val, scale=fac)
- return mdist, vdist, sdist
- @xp_capabilities()
- @_axis_nan_policy_factory(
- lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1, default_axis=None
- )
- def kstat(data, n=2, *, axis=None):
- r"""
- Return the `n` th k-statistic ( ``1<=n<=4`` so far).
- The `n` th k-statistic ``k_n`` is the unique symmetric unbiased estimator of the
- `n` th cumulant :math:`\kappa_n` [1]_ [2]_.
- Parameters
- ----------
- data : array_like
- Input array.
- n : int, {1, 2, 3, 4}, optional
- Default is equal to 2.
- axis : int or None, default: None
- If an int, the axis of the input along which to compute the statistic.
- The statistic of each axis-slice (e.g. row) of the input will appear
- in a corresponding element of the output. If ``None``, the input will
- be raveled before computing the statistic.
- Returns
- -------
- kstat : float
- The `n` th k-statistic.
- See Also
- --------
- kstatvar : Returns an unbiased estimator of the variance of the k-statistic
- moment : Returns the n-th central moment about the mean for a sample.
- Notes
- -----
- For a sample size :math:`n`, the first few k-statistics are given by
- .. math::
- k_1 &= \frac{S_1}{n}, \\
- k_2 &= \frac{nS_2 - S_1^2}{n(n-1)}, \\
- k_3 &= \frac{2S_1^3 - 3nS_1S_2 + n^2S_3}{n(n-1)(n-2)}, \\
- k_4 &= \frac{-6S_1^4 + 12nS_1^2S_2 - 3n(n-1)S_2^2 - 4n(n+1)S_1S_3
- + n^2(n+1)S_4}{n (n-1)(n-2)(n-3)},
- where
- .. math::
- S_r \equiv \sum_{i=1}^n X_i^r,
- and :math:`X_i` is the :math:`i` th data point.
- References
- ----------
- .. [1] http://mathworld.wolfram.com/k-Statistic.html
- .. [2] http://mathworld.wolfram.com/Cumulant.html
- Examples
- --------
- >>> from scipy import stats
- >>> from numpy.random import default_rng
- >>> rng = default_rng()
- As sample size increases, `n`-th moment and `n`-th k-statistic converge to the
- same number (although they aren't identical). In the case of the normal
- distribution, they converge to zero.
- >>> for i in range(2,8):
- ... x = rng.normal(size=10**i)
- ... m, k = stats.moment(x, 3), stats.kstat(x, 3)
- ... print(f"{i=}: {m=:.3g}, {k=:.3g}, {(m-k)=:.3g}")
- i=2: m=-0.631, k=-0.651, (m-k)=0.0194 # random
- i=3: m=0.0282, k=0.0283, (m-k)=-8.49e-05
- i=4: m=-0.0454, k=-0.0454, (m-k)=1.36e-05
- i=6: m=7.53e-05, k=7.53e-05, (m-k)=-2.26e-09
- i=7: m=0.00166, k=0.00166, (m-k)=-4.99e-09
- i=8: m=-2.88e-06 k=-2.88e-06, (m-k)=8.63e-13
- """
- xp = array_namespace(data)
- data = xp.asarray(data)
- if n > 4 or n < 1:
- raise ValueError("k-statistics only supported for 1<=n<=4")
- n = int(n)
- if axis is None:
- data = xp.reshape(data, (-1,))
- axis = 0
- N = _length_nonmasked(data, axis, xp=xp)
- S = [None] + [xp.sum(data**k, axis=axis) for k in range(1, n + 1)]
- if n == 1:
- return S[1] * 1.0/N
- elif n == 2:
- return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
- elif n == 3:
- return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
- elif n == 4:
- return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
- 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
- (N*(N-1.0)*(N-2.0)*(N-3.0)))
- else:
- raise ValueError("Should not be here.")
- @xp_capabilities()
- @_axis_nan_policy_factory(
- lambda x: x, result_to_tuple=lambda x, _: (x,), n_outputs=1, default_axis=None
- )
- def kstatvar(data, n=2, *, axis=None):
- r"""Return an unbiased estimator of the variance of the k-statistic.
- See `kstat` and [1]_ for more details about the k-statistic.
- Parameters
- ----------
- data : array_like
- Input array.
- n : int, {1, 2}, optional
- Default is equal to 2.
- axis : int or None, default: None
- If an int, the axis of the input along which to compute the statistic.
- The statistic of each axis-slice (e.g. row) of the input will appear
- in a corresponding element of the output. If ``None``, the input will
- be raveled before computing the statistic.
- Returns
- -------
- kstatvar : float
- The `n` th k-statistic variance.
- See Also
- --------
- kstat : Returns the n-th k-statistic.
- moment : Returns the n-th central moment about the mean for a sample.
- Notes
- -----
- Unbiased estimators of the variances of the first two k-statistics are given by
- .. math::
- \mathrm{var}(k_1) &= \frac{k_2}{n}, \\
- \mathrm{var}(k_2) &= \frac{2k_2^2n + (n-1)k_4}{n(n + 1)}.
- References
- ----------
- .. [1] http://mathworld.wolfram.com/k-Statistic.html
- """ # noqa: E501
- xp = array_namespace(data)
- data = xp.asarray(data)
- if axis is None:
- data = xp.reshape(data, (-1,))
- axis = 0
- N = _length_nonmasked(data, axis, xp=xp)
- if n == 1:
- return kstat(data, n=2, axis=axis, _no_deco=True) * 1.0/N
- elif n == 2:
- k2 = kstat(data, n=2, axis=axis, _no_deco=True)
- k4 = kstat(data, n=4, axis=axis, _no_deco=True)
- return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
- else:
- raise ValueError("Only n=1 or n=2 supported.")
- def _calc_uniform_order_statistic_medians(n):
- """Approximations of uniform order statistic medians.
- Parameters
- ----------
- n : int
- Sample size.
- Returns
- -------
- v : 1d float array
- Approximations of the order statistic medians.
- References
- ----------
- .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
- Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
- Examples
- --------
- Order statistics of the uniform distribution on the unit interval
- are marginally distributed according to beta distributions.
- The expectations of these order statistic are evenly spaced across
- the interval, but the distributions are skewed in a way that
- pushes the medians slightly towards the endpoints of the unit interval:
- >>> import numpy as np
- >>> n = 4
- >>> k = np.arange(1, n+1)
- >>> from scipy.stats import beta
- >>> a = k
- >>> b = n-k+1
- >>> beta.mean(a, b)
- array([0.2, 0.4, 0.6, 0.8])
- >>> beta.median(a, b)
- array([0.15910358, 0.38572757, 0.61427243, 0.84089642])
- The Filliben approximation uses the exact medians of the smallest
- and greatest order statistics, and the remaining medians are approximated
- by points spread evenly across a sub-interval of the unit interval:
- >>> from scipy.stats._morestats import _calc_uniform_order_statistic_medians
- >>> _calc_uniform_order_statistic_medians(n)
- array([0.15910358, 0.38545246, 0.61454754, 0.84089642])
- This plot shows the skewed distributions of the order statistics
- of a sample of size four from a uniform distribution on the unit interval:
- >>> import matplotlib.pyplot as plt
- >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
- >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
- >>> plt.figure()
- >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])
- """
- v = np.empty(n, dtype=np.float64)
- v[-1] = 0.5**(1.0 / n)
- v[0] = 1 - v[-1]
- i = np.arange(2, n)
- v[1:-1] = (i - 0.3175) / (n + 0.365)
- return v
- def _parse_dist_kw(dist, enforce_subclass=True):
- """Parse `dist` keyword.
- Parameters
- ----------
- dist : str or stats.distributions instance.
- Several functions take `dist` as a keyword, hence this utility
- function.
- enforce_subclass : bool, optional
- If True (default), `dist` needs to be a
- `_distn_infrastructure.rv_generic` instance.
- It can sometimes be useful to set this keyword to False, if a function
- wants to accept objects that just look somewhat like such an instance
- (for example, they have a ``ppf`` method).
- """
- if isinstance(dist, rv_generic):
- pass
- elif isinstance(dist, str):
- try:
- dist = getattr(distributions, dist)
- except AttributeError as e:
- raise ValueError(f"{dist} is not a valid distribution name") from e
- elif enforce_subclass:
- msg = ("`dist` should be a stats.distributions instance or a string "
- "with the name of such a distribution.")
- raise ValueError(msg)
- return dist
- def _add_axis_labels_title(plot, xlabel, ylabel, title):
- """Helper function to add axes labels and a title to stats plots."""
- try:
- if hasattr(plot, 'set_title'):
- # Matplotlib Axes instance or something that looks like it
- plot.set_title(title)
- plot.set_xlabel(xlabel)
- plot.set_ylabel(ylabel)
- else:
- # matplotlib.pyplot module
- plot.title(title)
- plot.xlabel(xlabel)
- plot.ylabel(ylabel)
- except Exception:
- # Not an MPL object or something that looks (enough) like it.
- # Don't crash on adding labels or title
- pass
- @xp_capabilities(np_only=True)
- def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
- """
- Calculate quantiles for a probability plot, and optionally show the plot.
- Generates a probability plot of sample data against the quantiles of a
- specified theoretical distribution (the normal distribution by default).
- `probplot` optionally calculates a best-fit line for the data and plots the
- results using Matplotlib or a given plot function.
- Parameters
- ----------
- x : array_like
- Sample/response data from which `probplot` creates the plot.
- sparams : tuple, optional
- Distribution-specific shape parameters (shape parameters plus location
- and scale).
- dist : str or stats.distributions instance, optional
- Distribution or distribution function name. The default is 'norm' for a
- normal probability plot. Objects that look enough like a
- stats.distributions instance (i.e. they have a ``ppf`` method) are also
- accepted.
- fit : bool, optional
- Fit a least-squares regression (best-fit) line to the sample data if
- True (default).
- plot : object, optional
- If given, plots the quantiles.
- If given and `fit` is True, also plots the least squares fit.
- `plot` is an object that has to have methods "plot" and "text".
- The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
- or a custom object with the same methods.
- Default is None, which means that no plot is created.
- rvalue : bool, optional
- If `plot` is provided and `fit` is True, setting `rvalue` to True
- includes the coefficient of determination on the plot.
- Default is False.
- Returns
- -------
- (osm, osr) : tuple of ndarrays
- Tuple of theoretical quantiles (osm, or order statistic medians) and
- ordered responses (osr). `osr` is simply sorted input `x`.
- For details on how `osm` is calculated see the Notes section.
- (slope, intercept, r) : tuple of floats, optional
- Tuple containing the result of the least-squares fit, if that is
- performed by `probplot`. `r` is the square root of the coefficient of
- determination. If ``fit=False`` and ``plot=None``, this tuple is not
- returned.
- Notes
- -----
- Even if `plot` is given, the figure is not shown or saved by `probplot`;
- ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
- calling `probplot`.
- `probplot` generates a probability plot, which should not be confused with
- a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this
- type, see ``statsmodels.api.ProbPlot``.
- The formula used for the theoretical quantiles (horizontal axis of the
- probability plot) is Filliben's estimate::
- quantiles = dist.ppf(val), for
- 0.5**(1/n), for i = n
- val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1
- 1 - 0.5**(1/n), for i = 1
- where ``i`` indicates the i-th ordered value and ``n`` is the total number
- of values.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- >>> nsample = 100
- >>> rng = np.random.default_rng()
- A t distribution with small degrees of freedom:
- >>> ax1 = plt.subplot(221)
- >>> x = stats.t.rvs(3, size=nsample, random_state=rng)
- >>> res = stats.probplot(x, plot=plt)
- A t distribution with larger degrees of freedom:
- >>> ax2 = plt.subplot(222)
- >>> x = stats.t.rvs(25, size=nsample, random_state=rng)
- >>> res = stats.probplot(x, plot=plt)
- A mixture of two normal distributions with broadcasting:
- >>> ax3 = plt.subplot(223)
- >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
- ... size=(nsample//2,2), random_state=rng).ravel()
- >>> res = stats.probplot(x, plot=plt)
- A standard normal distribution:
- >>> ax4 = plt.subplot(224)
- >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample, random_state=rng)
- >>> res = stats.probplot(x, plot=plt)
- Produce a new figure with a loggamma distribution, using the ``dist`` and
- ``sparams`` keywords:
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> x = stats.loggamma.rvs(c=2.5, size=500, random_state=rng)
- >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
- >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")
- Show the results with Matplotlib:
- >>> plt.show()
- """
- x = np.asarray(x)
- if x.size == 0:
- if fit:
- return (x, x), (np.nan, np.nan, 0.0)
- else:
- return x, x
- osm_uniform = _calc_uniform_order_statistic_medians(len(x))
- dist = _parse_dist_kw(dist, enforce_subclass=False)
- if sparams is None:
- sparams = ()
- if isscalar(sparams):
- sparams = (sparams,)
- if not isinstance(sparams, tuple):
- sparams = tuple(sparams)
- osm = dist.ppf(osm_uniform, *sparams)
- osr = sort(x)
- if fit:
- # perform a linear least squares fit.
- slope, intercept, r, prob, _ = _stats_py.linregress(osm, osr)
- if plot is not None:
- plot.plot(osm, osr, 'bo')
- if fit:
- plot.plot(osm, slope*osm + intercept, 'r-')
- _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
- ylabel='Ordered Values',
- title='Probability Plot')
- # Add R^2 value to the plot as text
- if fit and rvalue:
- xmin = amin(osm)
- xmax = amax(osm)
- ymin = amin(x)
- ymax = amax(x)
- posx = xmin + 0.70 * (xmax - xmin)
- posy = ymin + 0.01 * (ymax - ymin)
- plot.text(posx, posy, f"$R^2={r ** 2:1.4f}$")
- if fit:
- return (osm, osr), (slope, intercept, r)
- else:
- return osm, osr
- @xp_capabilities(np_only=True)
- def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
- """Calculate the shape parameter that maximizes the PPCC.
- The probability plot correlation coefficient (PPCC) plot can be used
- to determine the optimal shape parameter for a one-parameter family
- of distributions. ``ppcc_max`` returns the shape parameter that would
- maximize the probability plot correlation coefficient for the given
- data to a one-parameter family of distributions.
- Parameters
- ----------
- x : array_like
- Input array.
- brack : tuple, optional
- Triple (a,b,c) where (a<b<c). If bracket consists of two numbers (a, c)
- then they are assumed to be a starting interval for a downhill bracket
- search (see `scipy.optimize.brent`).
- dist : str or stats.distributions instance, optional
- Distribution or distribution function name. Objects that look enough
- like a stats.distributions instance (i.e. they have a ``ppf`` method)
- are also accepted. The default is ``'tukeylambda'``.
- Returns
- -------
- shape_value : float
- The shape parameter at which the probability plot correlation
- coefficient reaches its max value.
- See Also
- --------
- ppcc_plot, probplot, boxcox
- Notes
- -----
- The brack keyword serves as a starting point which is useful in corner
- cases. One can use a plot to obtain a rough visual estimate of the location
- for the maximum to start the search near it.
- References
- ----------
- .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test
- for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
- .. [2] Engineering Statistics Handbook, NIST/SEMATEC,
- https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm
- Examples
- --------
- First we generate some random data from a Weibull distribution
- with shape parameter 2.5:
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- >>> rng = np.random.default_rng()
- >>> c = 2.5
- >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
- Generate the PPCC plot for this data with the Weibull distribution.
- >>> fig, ax = plt.subplots(figsize=(8, 6))
- >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax)
- We calculate the value where the shape should reach its maximum and a
- red line is drawn there. The line should coincide with the highest
- point in the PPCC graph.
- >>> cmax = stats.ppcc_max(x, brack=(c/2, 2*c), dist='weibull_min')
- >>> ax.axvline(cmax, color='r')
- >>> plt.show()
- """
- dist = _parse_dist_kw(dist)
- osm_uniform = _calc_uniform_order_statistic_medians(len(x))
- osr = sort(x)
- # this function computes the x-axis values of the probability plot
- # and computes a linear regression (including the correlation)
- # and returns 1-r so that a minimization function maximizes the
- # correlation
- def tempfunc(shape, mi, yvals, func):
- xvals = func(mi, shape)
- r, prob = _stats_py.pearsonr(xvals, yvals)
- return 1 - r
- return optimize.brent(tempfunc, brack=brack,
- args=(osm_uniform, osr, dist.ppf))
- @xp_capabilities(np_only=True)
- def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
- """Calculate and optionally plot probability plot correlation coefficient.
- The probability plot correlation coefficient (PPCC) plot can be used to
- determine the optimal shape parameter for a one-parameter family of
- distributions. It cannot be used for distributions without shape
- parameters
- (like the normal distribution) or with multiple shape parameters.
- By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A
- Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
- distributions via an approximately normal one, and is therefore
- particularly useful in practice.
- Parameters
- ----------
- x : array_like
- Input array.
- a, b : scalar
- Lower and upper bounds of the shape parameter to use.
- dist : str or stats.distributions instance, optional
- Distribution or distribution function name. Objects that look enough
- like a stats.distributions instance (i.e. they have a ``ppf`` method)
- are also accepted. The default is ``'tukeylambda'``.
- plot : object, optional
- If given, plots PPCC against the shape parameter.
- `plot` is an object that has to have methods "plot" and "text".
- The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
- or a custom object with the same methods.
- Default is None, which means that no plot is created.
- N : int, optional
- Number of points on the horizontal axis (equally distributed from
- `a` to `b`).
- Returns
- -------
- svals : ndarray
- The shape values for which `ppcc` was calculated.
- ppcc : ndarray
- The calculated probability plot correlation coefficient values.
- See Also
- --------
- ppcc_max, probplot, boxcox_normplot, tukeylambda
- References
- ----------
- J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
- Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
- Examples
- --------
- First we generate some random data from a Weibull distribution
- with shape parameter 2.5, and plot the histogram of the data:
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- >>> rng = np.random.default_rng()
- >>> c = 2.5
- >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
- Take a look at the histogram of the data.
- >>> fig1, ax = plt.subplots(figsize=(9, 4))
- >>> ax.hist(x, bins=50)
- >>> ax.set_title('Histogram of x')
- >>> plt.show()
- Now we explore this data with a PPCC plot as well as the related
- probability plot and Box-Cox normplot. A red line is drawn where we
- expect the PPCC value to be maximal (at the shape parameter ``c``
- used above):
- >>> fig2 = plt.figure(figsize=(12, 4))
- >>> ax1 = fig2.add_subplot(1, 3, 1)
- >>> ax2 = fig2.add_subplot(1, 3, 2)
- >>> ax3 = fig2.add_subplot(1, 3, 3)
- >>> res = stats.probplot(x, plot=ax1)
- >>> res = stats.boxcox_normplot(x, -4, 4, plot=ax2)
- >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax3)
- >>> ax3.axvline(c, color='r')
- >>> plt.show()
- """
- if b <= a:
- raise ValueError("`b` has to be larger than `a`.")
- svals = np.linspace(a, b, num=N)
- ppcc = np.empty_like(svals)
- for k, sval in enumerate(svals):
- _, r2 = probplot(x, sval, dist=dist, fit=True)
- ppcc[k] = r2[-1]
- if plot is not None:
- plot.plot(svals, ppcc, 'x')
- _add_axis_labels_title(plot, xlabel='Shape Values',
- ylabel='Prob Plot Corr. Coef.',
- title=f'({dist}) PPCC Plot')
- return svals, ppcc
- def _log_mean(logx, axis):
- # compute log of mean of x from log(x)
- return (
- special.logsumexp(logx, axis=axis, keepdims=True)
- - math.log(logx.shape[axis])
- )
- def _log_var(logx, xp, axis):
- # compute log of variance of x from log(x)
- logmean = xp.broadcast_to(_log_mean(logx, axis=axis), logx.shape)
- ones = xp.ones_like(logx)
- logxmu, _ = special.logsumexp(xp.stack((logx, logmean), axis=0), axis=0,
- b=xp.stack((ones, -ones), axis=0), return_sign=True)
- return special.logsumexp(2 * logxmu, axis=axis) - math.log(logx.shape[axis])
- @xp_capabilities()
- def boxcox_llf(lmb, data, *, axis=0, keepdims=False, nan_policy='propagate'):
- r"""The boxcox log-likelihood function.
- Parameters
- ----------
- lmb : scalar
- Parameter for Box-Cox transformation. See `boxcox` for details.
- data : array_like
- Data to calculate Box-Cox log-likelihood for. If `data` is
- multi-dimensional, the log-likelihood is calculated along the first
- axis.
- axis : int, default: 0
- If an int, the axis of the input along which to compute the statistic.
- The statistic of each axis-slice (e.g. row) of the input will appear in a
- corresponding element of the output.
- If ``None``, the input will be raveled before computing the statistic.
- nan_policy : {'propagate', 'omit', 'raise'
- Defines how to handle input NaNs.
- - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
- which the statistic is computed, the corresponding entry of the output
- will be NaN.
- - ``omit``: NaNs will be omitted when performing the calculation.
- If insufficient data remains in the axis slice along which the
- statistic is computed, the corresponding entry of the output will be
- NaN.
- - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
- keepdims : bool, default: False
- If this is set to True, the axes which are reduced are left
- in the result as dimensions with size one. With this option,
- the result will broadcast correctly against the input array.
- Returns
- -------
- llf : float or ndarray
- Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`,
- an array otherwise.
- See Also
- --------
- boxcox, probplot, boxcox_normplot, boxcox_normmax
- Notes
- -----
- The Box-Cox log-likelihood function :math:`l` is defined here as
- .. math::
- l = (\lambda - 1) \sum_i^N \log(x_i) -
- \frac{N}{2} \log\left(\sum_i^N (y_i - \bar{y})^2 / N\right),
- where :math:`N` is the number of data points ``data`` and :math:`y` is the Box-Cox
- transformed input data.
- This corresponds to the *profile log-likelihood* of the original data :math:`x`
- with some constant terms dropped.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
- Generate some random variates and calculate Box-Cox log-likelihood values
- for them for a range of ``lmbda`` values:
- >>> rng = np.random.default_rng()
- >>> x = stats.loggamma.rvs(5, loc=10, size=1000, random_state=rng)
- >>> lmbdas = np.linspace(-2, 10)
- >>> llf = np.zeros(lmbdas.shape, dtype=float)
- >>> for ii, lmbda in enumerate(lmbdas):
- ... llf[ii] = stats.boxcox_llf(lmbda, x)
- Also find the optimal lmbda value with `boxcox`:
- >>> x_most_normal, lmbda_optimal = stats.boxcox(x)
- Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
- horizontal line to check that that's really the optimum:
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> ax.plot(lmbdas, llf, 'b.-')
- >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
- >>> ax.set_xlabel('lmbda parameter')
- >>> ax.set_ylabel('Box-Cox log-likelihood')
- Now add some probability plots to show that where the log-likelihood is
- maximized the data transformed with `boxcox` looks closest to normal:
- >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
- >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
- ... xt = stats.boxcox(x, lmbda=lmbda)
- ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
- ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
- ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
- ... ax_inset.set_xticklabels([])
- ... ax_inset.set_yticklabels([])
- ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
- >>> plt.show()
- """
- # _axis_nan_policy decorator does not currently support these for lazy arrays.
- # We want to run tests with lazy backends, so don't pass the arguments explicitly
- # unless necessary.
- kwargs = {}
- if keepdims is not False:
- kwargs['keepdims'] = keepdims
- if nan_policy != 'propagate':
- kwargs['nan_policy'] = nan_policy
- return _boxcox_llf(data, lmb=lmb, axis=axis, **kwargs)
- @_axis_nan_policy_factory(lambda x: x, n_outputs=1, default_axis=0,
- result_to_tuple=lambda x, _: (x,))
- def _boxcox_llf(data, axis=0, *, lmb):
- xp = array_namespace(data)
- dtype = xp_result_type(lmb, data, force_floating=True, xp=xp)
- data = xp.asarray(data, dtype=dtype)
- N = data.shape[axis]
- if N == 0:
- return _get_nan(data, xp=xp)
- logdata = xp.log(data)
- # Compute the variance of the transformed data.
- if lmb == 0:
- logvar = xp.log(xp.var(logdata, axis=axis))
- else:
- # Transform without the constant offset 1/lmb. The offset does
- # not affect the variance, and the subtraction of the offset can
- # lead to loss of precision.
- # Division by lmb can be factored out to enhance numerical stability.
- logx = lmb * logdata
- logvar = _log_var(logx, xp, axis) - 2 * math.log(abs(lmb))
- res = (lmb - 1) * xp.sum(logdata, axis=axis) - N/2 * logvar
- res = xp.astype(res, data.dtype, copy=False) # compensate for NumPy <2.0
- res = res[()] if res.ndim == 0 else res
- return res
- def _boxcox_conf_interval(x, lmax, alpha):
- # Need to find the lambda for which
- # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1
- fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
- target = boxcox_llf(lmax, x) - fac
- def rootfunc(lmbda, data, target):
- return boxcox_llf(lmbda, data) - target
- # Find positive endpoint of interval in which answer is to be found
- newlm = lmax + 0.5
- N = 0
- while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
- newlm += 0.1
- N += 1
- if N == 500:
- raise RuntimeError("Could not find endpoint.")
- lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))
- # Now find negative interval in the same way
- newlm = lmax - 0.5
- N = 0
- while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
- newlm -= 0.1
- N += 1
- if N == 500:
- raise RuntimeError("Could not find endpoint.")
- lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))
- return lmminus, lmplus
- @xp_capabilities(np_only=True)
- def boxcox(x, lmbda=None, alpha=None, optimizer=None):
- r"""Return a dataset transformed by a Box-Cox power transformation.
- Parameters
- ----------
- x : ndarray
- Input array to be transformed.
- If `lmbda` is not None, this is an alias of
- `scipy.special.boxcox`.
- Returns nan if ``x < 0``; returns -inf if ``x == 0 and lmbda < 0``.
- If `lmbda` is None, array must be positive, 1-dimensional, and
- non-constant.
- lmbda : scalar, optional
- If `lmbda` is None (default), find the value of `lmbda` that maximizes
- the log-likelihood function and return it as the second output
- argument.
- If `lmbda` is not None, do the transformation for that value.
- alpha : float, optional
- If `lmbda` is None and `alpha` is not None (default), return the
- ``100 * (1-alpha)%`` confidence interval for `lmbda` as the third
- output argument. Must be between 0.0 and 1.0.
- If `lmbda` is not None, `alpha` is ignored.
- optimizer : callable, optional
- If `lmbda` is None, `optimizer` is the scalar optimizer used to find
- the value of `lmbda` that minimizes the negative log-likelihood
- function. `optimizer` is a callable that accepts one argument:
- fun : callable
- The objective function, which evaluates the negative
- log-likelihood function at a provided value of `lmbda`
- and returns an object, such as an instance of
- `scipy.optimize.OptimizeResult`, which holds the optimal value of
- `lmbda` in an attribute `x`.
- See the example in `boxcox_normmax` or the documentation of
- `scipy.optimize.minimize_scalar` for more information.
- If `lmbda` is not None, `optimizer` is ignored.
- Returns
- -------
- boxcox : ndarray
- Box-Cox power transformed array.
- maxlog : float, optional
- If the `lmbda` parameter is None, the second returned argument is
- the `lmbda` that maximizes the log-likelihood function.
- (min_ci, max_ci) : tuple of float, optional
- If `lmbda` parameter is None and `alpha` is not None, this returned
- tuple of floats represents the minimum and maximum confidence limits
- given `alpha`.
- See Also
- --------
- probplot, boxcox_normplot, boxcox_normmax, boxcox_llf
- Notes
- -----
- The Box-Cox transform is given by:
- .. math::
- y =
- \begin{cases}
- \frac{x^\lambda - 1}{\lambda}, &\text{for } \lambda \neq 0 \\
- \log(x), &\text{for } \lambda = 0
- \end{cases}
- `boxcox` requires the input data to be positive. Sometimes a Box-Cox
- transformation provides a shift parameter to achieve this; `boxcox` does
- not. Such a shift parameter is equivalent to adding a positive constant to
- `x` before calling `boxcox`.
- The confidence limits returned when `alpha` is provided give the interval
- where:
- .. math::
- l(\hat{\lambda}) - l(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
- with :math:`l` the log-likelihood function and :math:`\chi^2` the chi-squared
- function.
- References
- ----------
- G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
- Royal Statistical Society B, 26, 211-252 (1964).
- Examples
- --------
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- We generate some random variates from a non-normal distribution and make a
- probability plot for it, to show it is non-normal in the tails:
- >>> fig = plt.figure()
- >>> ax1 = fig.add_subplot(211)
- >>> x = stats.loggamma.rvs(5, size=500) + 5
- >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
- >>> ax1.set_xlabel('')
- >>> ax1.set_title('Probplot against normal distribution')
- We now use `boxcox` to transform the data so it's closest to normal:
- >>> ax2 = fig.add_subplot(212)
- >>> xt, _ = stats.boxcox(x)
- >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
- >>> ax2.set_title('Probplot after Box-Cox transformation')
- >>> plt.show()
- """
- x = np.asarray(x)
- if lmbda is not None: # single transformation
- return special.boxcox(x, lmbda)
- if x.ndim != 1:
- raise ValueError("Data must be 1-dimensional.")
- if x.size == 0:
- return x
- if np.all(x == x[0]):
- raise ValueError("Data must not be constant.")
- if np.any(x <= 0):
- raise ValueError("Data must be positive.")
- # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
- lmax = boxcox_normmax(x, method='mle', optimizer=optimizer)
- y = boxcox(x, lmax)
- if alpha is None:
- return y, lmax
- else:
- # Find confidence interval
- interval = _boxcox_conf_interval(x, lmax, alpha)
- return y, lmax, interval
- def _boxcox_inv_lmbda(x, y):
- # compute lmbda given x and y for Box-Cox transformation
- num = special.lambertw(-(x ** (-1 / y)) * np.log(x) / y, k=-1)
- return np.real(-num / np.log(x) - 1 / y)
- class _BigFloat:
- def __repr__(self):
- return "BIG_FLOAT"
- _BigFloat_singleton = _BigFloat()
- @xp_capabilities(np_only=True)
- def boxcox_normmax(
- x, brack=None, method='pearsonr', optimizer=None, *, ymax=_BigFloat_singleton
- ):
- """Compute optimal Box-Cox transform parameter for input data.
- Parameters
- ----------
- x : array_like
- Input array. All entries must be positive, finite, real numbers.
- brack : 2-tuple, optional, default (-2.0, 2.0)
- The starting interval for a downhill bracket search for the default
- `optimize.brent` solver. Note that this is in most cases not
- critical; the final result is allowed to be outside this bracket.
- If `optimizer` is passed, `brack` must be None.
- method : str, optional
- The method to determine the optimal transform parameter (`boxcox`
- ``lmbda`` parameter). Options are:
- 'pearsonr' (default)
- Maximizes the Pearson correlation coefficient between
- ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
- normally-distributed.
- 'mle'
- Maximizes the log-likelihood `boxcox_llf`. This is the method used
- in `boxcox`.
- 'all'
- Use all optimization methods available, and return all results.
- Useful to compare different methods.
- optimizer : callable, optional
- `optimizer` is a callable that accepts one argument:
- fun : callable
- The objective function to be minimized. `fun` accepts one argument,
- the Box-Cox transform parameter `lmbda`, and returns the value of
- the function (e.g., the negative log-likelihood) at the provided
- argument. The job of `optimizer` is to find the value of `lmbda`
- that *minimizes* `fun`.
- and returns an object, such as an instance of
- `scipy.optimize.OptimizeResult`, which holds the optimal value of
- `lmbda` in an attribute `x`.
- See the example below or the documentation of
- `scipy.optimize.minimize_scalar` for more information.
- ymax : float, optional
- The unconstrained optimal transform parameter may cause Box-Cox
- transformed data to have extreme magnitude or even overflow.
- This parameter constrains MLE optimization such that the magnitude
- of the transformed `x` does not exceed `ymax`. The default is
- the maximum value of the input dtype. If set to infinity,
- `boxcox_normmax` returns the unconstrained optimal lambda.
- Ignored when ``method='pearsonr'``.
- Returns
- -------
- maxlog : float or ndarray
- The optimal transform parameter found. An array instead of a scalar
- for ``method='all'``.
- See Also
- --------
- boxcox, boxcox_llf, boxcox_normplot, scipy.optimize.minimize_scalar
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- We can generate some data and determine the optimal ``lmbda`` in various
- ways:
- >>> rng = np.random.default_rng()
- >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
- >>> y, lmax_mle = stats.boxcox(x)
- >>> lmax_pearsonr = stats.boxcox_normmax(x)
- >>> lmax_mle
- 2.217563431465757
- >>> lmax_pearsonr
- 2.238318660200961
- >>> stats.boxcox_normmax(x, method='all')
- array([2.23831866, 2.21756343])
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
- >>> ax.axvline(lmax_mle, color='r')
- >>> ax.axvline(lmax_pearsonr, color='g', ls='--')
- >>> plt.show()
- Alternatively, we can define our own `optimizer` function. Suppose we
- are only interested in values of `lmbda` on the interval [6, 7], we
- want to use `scipy.optimize.minimize_scalar` with ``method='bounded'``,
- and we want to use tighter tolerances when optimizing the log-likelihood
- function. To do this, we define a function that accepts positional argument
- `fun` and uses `scipy.optimize.minimize_scalar` to minimize `fun` subject
- to the provided bounds and tolerances:
- >>> from scipy import optimize
- >>> options = {'xatol': 1e-12} # absolute tolerance on `x`
- >>> def optimizer(fun):
- ... return optimize.minimize_scalar(fun, bounds=(6, 7),
- ... method="bounded", options=options)
- >>> stats.boxcox_normmax(x, optimizer=optimizer)
- 6.000000000
- """
- x = np.asarray(x)
- if not np.all(np.isfinite(x) & (x >= 0)):
- message = ("The `x` argument of `boxcox_normmax` must contain "
- "only positive, finite, real numbers.")
- raise ValueError(message)
- end_msg = "exceed specified `ymax`."
- if ymax is _BigFloat_singleton:
- dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
- # 10000 is a safety factor because `special.boxcox` overflows prematurely.
- ymax = np.finfo(dtype).max / 10000
- end_msg = f"overflow in {dtype}."
- elif ymax <= 0:
- raise ValueError("`ymax` must be strictly positive")
- # If optimizer is not given, define default 'brent' optimizer.
- if optimizer is None:
- # Set default value for `brack`.
- if brack is None:
- brack = (-2.0, 2.0)
- def _optimizer(func, args):
- return optimize.brent(func, args=args, brack=brack)
- # Otherwise check optimizer.
- else:
- if not callable(optimizer):
- raise ValueError("`optimizer` must be a callable")
- if brack is not None:
- raise ValueError("`brack` must be None if `optimizer` is given")
- # `optimizer` is expected to return a `OptimizeResult` object, we here
- # get the solution to the optimization problem.
- def _optimizer(func, args):
- def func_wrapped(x):
- return func(x, *args)
- return getattr(optimizer(func_wrapped), 'x', None)
- def _pearsonr(x):
- osm_uniform = _calc_uniform_order_statistic_medians(len(x))
- xvals = distributions.norm.ppf(osm_uniform)
- def _eval_pearsonr(lmbda, xvals, samps):
- # This function computes the x-axis values of the probability plot
- # and computes a linear regression (including the correlation) and
- # returns ``1 - r`` so that a minimization function maximizes the
- # correlation.
- y = boxcox(samps, lmbda)
- yvals = np.sort(y)
- r, prob = _stats_py.pearsonr(xvals, yvals)
- return 1 - r
- return _optimizer(_eval_pearsonr, args=(xvals, x))
- def _mle(x):
- def _eval_mle(lmb, data):
- # function to minimize
- return -boxcox_llf(lmb, data)
- return _optimizer(_eval_mle, args=(x,))
- def _all(x):
- maxlog = np.empty(2, dtype=float)
- maxlog[0] = _pearsonr(x)
- maxlog[1] = _mle(x)
- return maxlog
- methods = {'pearsonr': _pearsonr,
- 'mle': _mle,
- 'all': _all}
- if method not in methods.keys():
- raise ValueError(f"Method {method} not recognized.")
- optimfunc = methods[method]
- res = optimfunc(x)
- if res is None:
- message = ("The `optimizer` argument of `boxcox_normmax` must return "
- "an object containing the optimal `lmbda` in attribute `x`.")
- raise ValueError(message)
- elif not np.isinf(ymax): # adjust the final lambda
- # x > 1, boxcox(x) > 0; x < 1, boxcox(x) < 0
- xmax, xmin = np.max(x), np.min(x)
- if xmin >= 1:
- x_treme = xmax
- elif xmax <= 1:
- x_treme = xmin
- else: # xmin < 1 < xmax
- indicator = special.boxcox(xmax, res) > abs(special.boxcox(xmin, res))
- if isinstance(res, np.ndarray):
- indicator = indicator[1] # select corresponds with 'mle'
- x_treme = xmax if indicator else xmin
- mask = abs(special.boxcox(x_treme, res)) > ymax
- if np.any(mask):
- message = (
- f"The optimal lambda is {res}, but the returned lambda is the "
- f"constrained optimum to ensure that the maximum or the minimum "
- f"of the transformed data does not " + end_msg
- )
- warnings.warn(message, stacklevel=2)
- # Return the constrained lambda to ensure the transformation
- # does not cause overflow or exceed specified `ymax`
- constrained_res = _boxcox_inv_lmbda(x_treme, ymax * np.sign(x_treme - 1))
- if isinstance(res, np.ndarray):
- res[mask] = constrained_res
- else:
- res = constrained_res
- return res
- def _normplot(method, x, la, lb, plot=None, N=80):
- """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
- optionally show it.
- See `boxcox_normplot` or `yeojohnson_normplot` for details.
- """
- if method == 'boxcox':
- title = 'Box-Cox Normality Plot'
- transform_func = boxcox
- else:
- title = 'Yeo-Johnson Normality Plot'
- transform_func = yeojohnson
- x = np.asarray(x)
- if x.size == 0:
- return x
- if lb <= la:
- raise ValueError("`lb` has to be larger than `la`.")
- if method == 'boxcox' and np.any(x <= 0):
- raise ValueError("Data must be positive.")
- lmbdas = np.linspace(la, lb, num=N)
- ppcc = lmbdas * 0.0
- for i, val in enumerate(lmbdas):
- # Determine for each lmbda the square root of correlation coefficient
- # of transformed x
- z = transform_func(x, lmbda=val)
- _, (_, _, r) = probplot(z, dist='norm', fit=True)
- ppcc[i] = r
- if plot is not None:
- plot.plot(lmbdas, ppcc, 'x')
- _add_axis_labels_title(plot, xlabel='$\\lambda$',
- ylabel='Prob Plot Corr. Coef.',
- title=title)
- return lmbdas, ppcc
- @xp_capabilities(np_only=True)
- def boxcox_normplot(x, la, lb, plot=None, N=80):
- """Compute parameters for a Box-Cox normality plot, optionally show it.
- A Box-Cox normality plot shows graphically what the best transformation
- parameter is to use in `boxcox` to obtain a distribution that is close
- to normal.
- Parameters
- ----------
- x : array_like
- Input array.
- la, lb : scalar
- The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
- for Box-Cox transformations. These are also the limits of the
- horizontal axis of the plot if that is generated.
- plot : object, optional
- If given, plots the quantiles and least squares fit.
- `plot` is an object that has to have methods "plot" and "text".
- The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
- or a custom object with the same methods.
- Default is None, which means that no plot is created.
- N : int, optional
- Number of points on the horizontal axis (equally distributed from
- `la` to `lb`).
- Returns
- -------
- lmbdas : ndarray
- The ``lmbda`` values for which a Box-Cox transform was done.
- ppcc : ndarray
- Probability Plot Correlation Coefficient, as obtained from `probplot`
- when fitting the Box-Cox transformed input `x` against a normal
- distribution.
- See Also
- --------
- probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max
- Notes
- -----
- Even if `plot` is given, the figure is not shown or saved by
- `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
- should be used after calling `probplot`.
- Examples
- --------
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- Generate some non-normally distributed data, and create a Box-Cox plot:
- >>> x = stats.loggamma.rvs(5, size=500) + 5
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)
- Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
- the same plot:
- >>> _, maxlog = stats.boxcox(x)
- >>> ax.axvline(maxlog, color='r')
- >>> plt.show()
- """
- return _normplot('boxcox', x, la, lb, plot, N)
- @xp_capabilities(np_only=True)
- def yeojohnson(x, lmbda=None):
- r"""Return a dataset transformed by a Yeo-Johnson power transformation.
- Parameters
- ----------
- x : ndarray
- Input array. Should be 1-dimensional.
- lmbda : float, optional
- If ``lmbda`` is ``None``, find the lambda that maximizes the
- log-likelihood function and return it as the second output argument.
- Otherwise the transformation is done for the given value.
- Returns
- -------
- yeojohnson: ndarray
- Yeo-Johnson power transformed array.
- maxlog : float, optional
- If the `lmbda` parameter is None, the second returned argument is
- the lambda that maximizes the log-likelihood function.
- See Also
- --------
- probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox
- Notes
- -----
- The Yeo-Johnson transform is given by:
- .. math::
- y =
- \begin{cases}
- \frac{(x + 1)^\lambda - 1}{\lambda},
- &\text{for } x \geq 0, \lambda \neq 0
- \\
- \log(x + 1),
- &\text{for } x \geq 0, \lambda = 0
- \\
- -\frac{(-x + 1)^{2 - \lambda} - 1}{2 - \lambda},
- &\text{for } x < 0, \lambda \neq 2
- \\
- -\log(-x + 1),
- &\text{for } x < 0, \lambda = 2
- \end{cases}
- Unlike `boxcox`, `yeojohnson` does not require the input data to be
- positive.
- .. versionadded:: 1.2.0
- References
- ----------
- I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
- Improve Normality or Symmetry", Biometrika 87.4 (2000):
- Examples
- --------
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- We generate some random variates from a non-normal distribution and make a
- probability plot for it, to show it is non-normal in the tails:
- >>> fig = plt.figure()
- >>> ax1 = fig.add_subplot(211)
- >>> x = stats.loggamma.rvs(5, size=500) + 5
- >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
- >>> ax1.set_xlabel('')
- >>> ax1.set_title('Probplot against normal distribution')
- We now use `yeojohnson` to transform the data so it's closest to normal:
- >>> ax2 = fig.add_subplot(212)
- >>> xt, lmbda = stats.yeojohnson(x)
- >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
- >>> ax2.set_title('Probplot after Yeo-Johnson transformation')
- >>> plt.show()
- """
- x = np.asarray(x)
- if x.size == 0:
- return x
- if np.issubdtype(x.dtype, np.complexfloating):
- raise ValueError('Yeo-Johnson transformation is not defined for '
- 'complex numbers.')
- if np.issubdtype(x.dtype, np.integer):
- x = x.astype(np.float64, copy=False)
- if lmbda is not None:
- return _yeojohnson_transform(x, lmbda)
- # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
- lmax = yeojohnson_normmax(x)
- y = _yeojohnson_transform(x, lmax)
- return y, lmax
- def _yeojohnson_transform(x, lmbda, xp=None):
- """Returns `x` transformed by the Yeo-Johnson power transform with given
- parameter `lmbda`.
- """
- xp = array_namespace(x) if xp is None else xp
- dtype = xp_result_type(x, lmbda, force_floating=True, xp=xp)
- eps = xp.finfo(dtype).eps
- out = xp.zeros_like(x, dtype=dtype)
- pos = x >= 0 # binary mask
- # when x >= 0
- if abs(lmbda) < eps:
- out = xpx.at(out)[pos].set(xp.log1p(x[pos]))
- else: # lmbda != 0
- # more stable version of: ((x + 1) ** lmbda - 1) / lmbda
- out = xpx.at(out)[pos].set(xp.expm1(lmbda * xp.log1p(x[pos])) / lmbda)
- # when x < 0
- if abs(lmbda - 2) > eps:
- out = xpx.at(out)[~pos].set(
- -xp.expm1((2 - lmbda) * xp.log1p(-x[~pos])) / (2 - lmbda))
- else: # lmbda == 2
- out = xpx.at(out)[~pos].set(-xp.log1p(-x[~pos]))
- return out
- @xp_capabilities(skip_backends=[("dask.array", "Dask can't broadcast nan shapes")])
- def yeojohnson_llf(lmb, data, *, axis=0, nan_policy='propagate', keepdims=False):
- r"""The Yeo-Johnson log-likelihood function.
- Parameters
- ----------
- lmb : scalar
- Parameter for Yeo-Johnson transformation. See `yeojohnson` for
- details.
- data : array_like
- Data to calculate Yeo-Johnson log-likelihood for.
- axis : int, default: 0
- If an int, the axis of the input along which to compute the statistic.
- The statistic of each axis-slice (e.g. row) of the input will appear in a
- corresponding element of the output.
- If ``None``, the input will be raveled before computing the statistic.
- nan_policy : {'propagate', 'omit', 'raise'
- Defines how to handle input NaNs.
- - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
- which the statistic is computed, the corresponding entry of the output
- will be NaN.
- - ``omit``: NaNs will be omitted when performing the calculation.
- If insufficient data remains in the axis slice along which the
- statistic is computed, the corresponding entry of the output will be
- NaN.
- - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
- keepdims : bool, default: False
- If this is set to True, the axes which are reduced are left
- in the result as dimensions with size one. With this option,
- the result will broadcast correctly against the input array.
- Returns
- -------
- llf : float
- Yeo-Johnson log-likelihood of `data` given `lmb`.
- See Also
- --------
- yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax
- Notes
- -----
- The Yeo-Johnson log-likelihood function :math:`l` is defined here as
- .. math::
- l = -\frac{N}{2} \log(\hat{\sigma}^2) + (\lambda - 1)
- \sum_i^N \text{sign}(x_i) \log(|x_i| + 1)
- where :math:`N` is the number of data points :math:`x`=``data`` and
- :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson transformed
- input data :math:`x`.
- This corresponds to the *profile log-likelihood* of the original data :math:`x`
- with some constant terms dropped.
- .. versionadded:: 1.2.0
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
- Generate some random variates and calculate Yeo-Johnson log-likelihood
- values for them for a range of ``lmbda`` values:
- >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
- >>> lmbdas = np.linspace(-2, 10)
- >>> llf = np.zeros(lmbdas.shape, dtype=float)
- >>> for ii, lmbda in enumerate(lmbdas):
- ... llf[ii] = stats.yeojohnson_llf(lmbda, x)
- Also find the optimal lmbda value with `yeojohnson`:
- >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)
- Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
- horizontal line to check that that's really the optimum:
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> ax.plot(lmbdas, llf, 'b.-')
- >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
- >>> ax.set_xlabel('lmbda parameter')
- >>> ax.set_ylabel('Yeo-Johnson log-likelihood')
- Now add some probability plots to show that where the log-likelihood is
- maximized the data transformed with `yeojohnson` looks closest to normal:
- >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
- >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
- ... xt = stats.yeojohnson(x, lmbda=lmbda)
- ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
- ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
- ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
- ... ax_inset.set_xticklabels([])
- ... ax_inset.set_yticklabels([])
- ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
- >>> plt.show()
- """
- # _axis_nan_policy decorator does not currently support these for lazy arrays.
- # We want to run tests with lazy backends, so don't pass the arguments explicitly
- # unless necessary.
- kwargs = {}
- if keepdims is not False:
- kwargs['keepdims'] = keepdims
- if nan_policy != 'propagate':
- kwargs['nan_policy'] = nan_policy
- res = _yeojohnson_llf(data, lmb=lmb, axis=axis, **kwargs)
- return res[()] if res.ndim == 0 else res
- @_axis_nan_policy_factory(lambda x: x, n_outputs=1, default_axis=0,
- result_to_tuple=lambda x, _: (x,))
- def _yeojohnson_llf(data, *, lmb, axis=0):
- xp = array_namespace(data)
- y = _yeojohnson_transform(data, lmb, xp=xp)
- sigma = xp.var(y, axis=axis)
- # Suppress RuntimeWarning raised by np.log when the variance is too low
- finite_variance = sigma >= xp.finfo(sigma.dtype).smallest_normal
- log_sigma = xpx.apply_where(finite_variance, (sigma,), xp.log, fill_value=-xp.inf)
- n = data.shape[axis]
- loglike = (-n / 2 * log_sigma
- + (lmb - 1) * xp.sum(xp.sign(data) * xp.log1p(xp.abs(data)), axis=axis))
- return loglike
- @xp_capabilities(np_only=True)
- def yeojohnson_normmax(x, brack=None):
- """Compute optimal Yeo-Johnson transform parameter.
- Compute optimal Yeo-Johnson transform parameter for input data, using
- maximum likelihood estimation.
- Parameters
- ----------
- x : array_like
- Input array.
- brack : 2-tuple, optional
- The starting interval for a downhill bracket search with
- `optimize.brent`. Note that this is in most cases not critical; the
- final result is allowed to be outside this bracket. If None,
- `optimize.fminbound` is used with bounds that avoid overflow.
- Returns
- -------
- maxlog : float
- The optimal transform parameter found.
- See Also
- --------
- yeojohnson, yeojohnson_llf, yeojohnson_normplot
- Notes
- -----
- .. versionadded:: 1.2.0
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- Generate some data and determine optimal ``lmbda``
- >>> rng = np.random.default_rng()
- >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
- >>> lmax = stats.yeojohnson_normmax(x)
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
- >>> ax.axvline(lmax, color='r')
- >>> plt.show()
- """
- def _neg_llf(lmbda, data):
- llf = np.asarray(yeojohnson_llf(lmbda, data))
- # reject likelihoods that are inf which are likely due to small
- # variance in the transformed space
- llf[np.isinf(llf)] = -np.inf
- return -llf
- with np.errstate(invalid='ignore'):
- if not np.all(np.isfinite(x)):
- raise ValueError('Yeo-Johnson input must be finite.')
- if np.all(x == 0):
- return 1.0
- if brack is not None:
- return optimize.brent(_neg_llf, brack=brack, args=(x,))
- x = np.asarray(x)
- dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
- # Allow values up to 20 times the maximum observed value to be safely
- # transformed without over- or underflow.
- log1p_max_x = np.log1p(20 * np.max(np.abs(x)))
- # Use half of floating point's exponent range to allow safe computation
- # of the variance of the transformed data.
- log_eps = np.log(np.finfo(dtype).eps)
- log_tiny_float = (np.log(np.finfo(dtype).tiny) - log_eps) / 2
- log_max_float = (np.log(np.finfo(dtype).max) + log_eps) / 2
- # Compute the bounds by approximating the inverse of the Yeo-Johnson
- # transform on the smallest and largest floating point exponents, given
- # the largest data we expect to observe. See [1] for further details.
- # [1] https://github.com/scipy/scipy/pull/18852#issuecomment-1630286174
- lb = log_tiny_float / log1p_max_x
- ub = log_max_float / log1p_max_x
- # Convert the bounds if all or some of the data is negative.
- if np.all(x < 0):
- lb, ub = 2 - ub, 2 - lb
- elif np.any(x < 0):
- lb, ub = max(2 - ub, lb), min(2 - lb, ub)
- # Match `optimize.brent`'s tolerance.
- tol_brent = 1.48e-08
- return optimize.fminbound(_neg_llf, lb, ub, args=(x,), xtol=tol_brent)
- @xp_capabilities(np_only=True)
- def yeojohnson_normplot(x, la, lb, plot=None, N=80):
- """Compute parameters for a Yeo-Johnson normality plot, optionally show it.
- A Yeo-Johnson normality plot shows graphically what the best
- transformation parameter is to use in `yeojohnson` to obtain a
- distribution that is close to normal.
- Parameters
- ----------
- x : array_like
- Input array.
- la, lb : scalar
- The lower and upper bounds for the ``lmbda`` values to pass to
- `yeojohnson` for Yeo-Johnson transformations. These are also the
- limits of the horizontal axis of the plot if that is generated.
- plot : object, optional
- If given, plots the quantiles and least squares fit.
- `plot` is an object that has to have methods "plot" and "text".
- The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
- or a custom object with the same methods.
- Default is None, which means that no plot is created.
- N : int, optional
- Number of points on the horizontal axis (equally distributed from
- `la` to `lb`).
- Returns
- -------
- lmbdas : ndarray
- The ``lmbda`` values for which a Yeo-Johnson transform was done.
- ppcc : ndarray
- Probability Plot Correlation Coefficient, as obtained from `probplot`
- when fitting the Box-Cox transformed input `x` against a normal
- distribution.
- See Also
- --------
- probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max
- Notes
- -----
- Even if `plot` is given, the figure is not shown or saved by
- `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
- should be used after calling `probplot`.
- .. versionadded:: 1.2.0
- Examples
- --------
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- Generate some non-normally distributed data, and create a Yeo-Johnson plot:
- >>> x = stats.loggamma.rvs(5, size=500) + 5
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)
- Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
- the same plot:
- >>> _, maxlog = stats.yeojohnson(x)
- >>> ax.axvline(maxlog, color='r')
- >>> plt.show()
- """
- return _normplot('yeojohnson', x, la, lb, plot, N)
- ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue'))
- @xp_capabilities(np_only=True)
- @_axis_nan_policy_factory(ShapiroResult, n_samples=1, too_small=2, default_axis=None)
- def shapiro(x):
- r"""Perform the Shapiro-Wilk test for normality.
- The Shapiro-Wilk test tests the null hypothesis that the
- data was drawn from a normal distribution.
- Parameters
- ----------
- x : array_like
- Array of sample data. Must contain at least three observations.
- Returns
- -------
- statistic : float
- The test statistic.
- p-value : float
- The p-value for the hypothesis test.
- See Also
- --------
- anderson : The Anderson-Darling test for normality
- kstest : The Kolmogorov-Smirnov test for goodness of fit.
- :ref:`hypothesis_shapiro` : Extended example
- Notes
- -----
- The algorithm used is described in [4]_ but censoring parameters as
- described are not implemented. For N > 5000 the W test statistic is
- accurate, but the p-value may not be.
- References
- ----------
- .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
- :doi:`10.18434/M32189`
- .. [2] Shapiro, S. S. & Wilk, M.B, "An analysis of variance test for
- normality (complete samples)", Biometrika, 1965, Vol. 52,
- pp. 591-611, :doi:`10.2307/2333709`
- .. [3] Razali, N. M. & Wah, Y. B., "Power comparisons of Shapiro-Wilk,
- Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests", Journal
- of Statistical Modeling and Analytics, 2011, Vol. 2, pp. 21-33.
- .. [4] Royston P., "Remark AS R94: A Remark on Algorithm AS 181: The
- W-test for Normality", 1995, Applied Statistics, Vol. 44,
- :doi:`10.2307/2986146`
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> rng = np.random.default_rng()
- >>> x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng)
- >>> shapiro_test = stats.shapiro(x)
- >>> shapiro_test
- ShapiroResult(statistic=0.9813305735588074, pvalue=0.16855233907699585)
- >>> shapiro_test.statistic
- 0.9813305735588074
- >>> shapiro_test.pvalue
- 0.16855233907699585
- For a more detailed example, see :ref:`hypothesis_shapiro`.
- """
- x = np.ravel(x).astype(np.float64)
- N = len(x)
- if N < 3:
- raise ValueError("Data must be at least length 3.")
- a = zeros(N//2, dtype=np.float64)
- init = 0
- y = sort(x)
- y -= x[N//2] # subtract the median (or a nearby value); see gh-15777
- w, pw, ifault = swilk(y, a, init)
- if ifault not in [0, 2]:
- warnings.warn("scipy.stats.shapiro: Input data has range zero. The"
- " results may not be accurate.", stacklevel=2)
- if N > 5000:
- warnings.warn("scipy.stats.shapiro: For N > 5000, computed p-value "
- f"may not be accurate. Current N is {N}.",
- stacklevel=2)
- # `w` and `pw` are always Python floats, which are double precision.
- # We want to ensure that they are NumPy floats, so until dtypes are
- # respected, we can explicitly convert each to float64 (faster than
- # `np.array([w, pw])`).
- return ShapiroResult(np.float64(w), np.float64(pw))
- # Values from [8]
- _Avals_norm = array([0.561, 0.631, 0.752, 0.873, 1.035])
- _Avals_expon = array([0.916, 1.062, 1.321, 1.591, 1.959])
- # From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
- # Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
- _Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
- # From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
- # on the Empirical Distribution Function.", Biometrika,
- # Vol. 66, Issue 3, Dec. 1979, pp 591-595.
- _Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])
- # From Richard A. Lockhart and Michael A. Stephens "Estimation and Tests of
- # Fit for the Three-Parameter Weibull Distribution"
- # Journal of the Royal Statistical Society.Series B(Methodological)
- # Vol. 56, No. 3 (1994), pp. 491-500, table 1. Keys are c*100
- _Avals_weibull = [[0.292, 0.395, 0.467, 0.522, 0.617, 0.711, 0.836, 0.931],
- [0.295, 0.399, 0.471, 0.527, 0.623, 0.719, 0.845, 0.941],
- [0.298, 0.403, 0.476, 0.534, 0.631, 0.728, 0.856, 0.954],
- [0.301, 0.408, 0.483, 0.541, 0.640, 0.738, 0.869, 0.969],
- [0.305, 0.414, 0.490, 0.549, 0.650, 0.751, 0.885, 0.986],
- [0.309, 0.421, 0.498, 0.559, 0.662, 0.765, 0.902, 1.007],
- [0.314, 0.429, 0.508, 0.570, 0.676, 0.782, 0.923, 1.030],
- [0.320, 0.438, 0.519, 0.583, 0.692, 0.802, 0.947, 1.057],
- [0.327, 0.448, 0.532, 0.598, 0.711, 0.824, 0.974, 1.089],
- [0.334, 0.469, 0.547, 0.615, 0.732, 0.850, 1.006, 1.125],
- [0.342, 0.472, 0.563, 0.636, 0.757, 0.879, 1.043, 1.167]]
- _Avals_weibull = np.array(_Avals_weibull)
- _cvals_weibull = np.linspace(0, 0.5, 11)
- _get_As_weibull = interpolate.interp1d(_cvals_weibull, _Avals_weibull.T,
- kind='linear', bounds_error=False,
- fill_value=_Avals_weibull[-1])
- def _weibull_fit_check(params, x):
- # Refine the fit returned by `weibull_min.fit` to ensure that the first
- # order necessary conditions are satisfied. If not, raise an error.
- # Here, use `m` for the shape parameter to be consistent with [7]
- # and avoid confusion with `c` as defined in [7].
- n = len(x)
- m, u, s = params
- def dnllf_dm(m, u):
- # Partial w.r.t. shape w/ optimal scale. See [7] Equation 5.
- xu = x-u
- return (1/m - (xu**m*np.log(xu)).sum()/(xu**m).sum()
- + np.log(xu).sum()/n)
- def dnllf_du(m, u):
- # Partial w.r.t. loc w/ optimal scale. See [7] Equation 6.
- xu = x-u
- return (m-1)/m*(xu**-1).sum() - n*(xu**(m-1)).sum()/(xu**m).sum()
- def get_scale(m, u):
- # Partial w.r.t. scale solved in terms of shape and location.
- # See [7] Equation 7.
- return ((x-u)**m/n).sum()**(1/m)
- def dnllf(params):
- # Partial derivatives of the NLLF w.r.t. parameters, i.e.
- # first order necessary conditions for MLE fit.
- return [dnllf_dm(*params), dnllf_du(*params)]
- suggestion = ("Maximum likelihood estimation is known to be challenging "
- "for the three-parameter Weibull distribution. Consider "
- "performing a custom goodness-of-fit test using "
- "`scipy.stats.monte_carlo_test`.")
- if np.allclose(u, np.min(x)) or m < 1:
- # The critical values provided by [7] don't seem to control the
- # Type I error rate in this case. Error out.
- message = ("Maximum likelihood estimation has converged to "
- "a solution in which the location is equal to the minimum "
- "of the data, the shape parameter is less than 2, or both. "
- "The table of critical values in [7] does not "
- "include this case. " + suggestion)
- raise ValueError(message)
- try:
- # Refine the MLE / verify that first-order necessary conditions are
- # satisfied. If so, the critical values provided in [7] seem reliable.
- with np.errstate(over='raise', invalid='raise'):
- res = optimize.root(dnllf, params[:-1])
- message = ("Solution of MLE first-order conditions failed: "
- f"{res.message}. `anderson` cannot continue. " + suggestion)
- if not res.success:
- raise ValueError(message)
- except (FloatingPointError, ValueError) as e:
- message = ("An error occurred while fitting the Weibull distribution "
- "to the data, so `anderson` cannot continue. " + suggestion)
- raise ValueError(message) from e
- m, u = res.x
- s = get_scale(m, u)
- return m, u, s
- AndersonResult = _make_tuple_bunch('AndersonResult',
- ['statistic', 'critical_values',
- 'significance_level'], ['fit_result'])
- _anderson_warning_message = (
- """As of SciPy 1.17, users must choose a p-value calculation method by providing the
- `method` parameter. `method='interpolate'` interpolates the p-value from pre-calculated
- tables; `method` may also be an instance of `MonteCarloMethod` to approximate the
- p-value via Monte Carlo simulation. When `method` is specified, the result object will
- include a `pvalue` attribute and not attributes `critical_value`, `significance_level`,
- or `fit_result`. Beginning in 1.19.0, these other attributes will no longer be
- available, and a p-value will always be computed according to one of the available
- `method` options.""".replace('\n', ' '))
- @xp_capabilities(np_only=True)
- def anderson(x, dist='norm', *, method=None):
- """Anderson-Darling test for data coming from a particular distribution.
- The Anderson-Darling test tests the null hypothesis that a sample is
- drawn from a population that follows a particular distribution.
- For the Anderson-Darling test, the critical values depend on
- which distribution is being tested against. This function works
- for normal, exponential, logistic, weibull_min, or Gumbel (Extreme Value
- Type I) distributions.
- Parameters
- ----------
- x : array_like
- Array of sample data.
- dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1', 'weibull_min'}, optional
- The type of distribution to test against. The default is 'norm'.
- The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the
- same distribution.
- method : str or instance of `MonteCarloMethod`
- Defines the method used to compute the p-value.
- If `method` is ``"interpolated"``, the p-value is interpolated from
- pre-calculated tables.
- If `method` is an instance of `MonteCarloMethod`, the p-value is computed using
- `scipy.stats.monte_carlo_test` with the provided configuration options and other
- appropriate settings.
- .. versionadded:: 1.17.0
- If `method` is not specified, `anderson` will emit a ``FutureWarning``
- specifying that the user must opt into a p-value calculation method.
- When `method` is specified, the object returned will include a ``pvalue``
- attribute, but no ``critical_value``, ``significance_level``, or
- ``fit_result`` attributes. Beginning in 1.19.0, these other attributes will
- no longer be available, and a p-value will always be computed according to
- one of the available `method` options.
- Returns
- -------
- result : AndersonResult
- If `method` is provided, this is an object with the following attributes:
- statistic : float
- The Anderson-Darling test statistic.
- pvalue: float
- The p-value corresponding with the test statistic, calculated according to
- the specified `method`.
- If `method` is unspecified, this is an object with the following attributes:
- statistic : float
- The Anderson-Darling test statistic.
- critical_values : list
- The critical values for this distribution.
- significance_level : list
- The significance levels for the corresponding critical values
- in percents. The function returns critical values for a
- differing set of significance levels depending on the
- distribution that is being tested against.
- fit_result : `~scipy.stats._result_classes.FitResult`
- An object containing the results of fitting the distribution to
- the data.
- .. deprecated :: 1.17.0
- The tuple-unpacking behavior of the return object and attributes
- ``critical_values``, ``significance_level``, and ``fit_result`` are
- deprecated. Beginning in SciPy 1.19.0, these features will no longer be
- available, and the object returned will have attributes ``statistic`` and
- ``pvalue``.
- See Also
- --------
- kstest : The Kolmogorov-Smirnov test for goodness-of-fit.
- Notes
- -----
- Critical values provided when `method` is unspecified are for the following
- significance levels:
- normal/exponential
- 15%, 10%, 5%, 2.5%, 1%
- logistic
- 25%, 10%, 5%, 2.5%, 1%, 0.5%
- gumbel_l / gumbel_r
- 25%, 10%, 5%, 2.5%, 1%
- weibull_min
- 50%, 25%, 15%, 10%, 5%, 2.5%, 1%, 0.5%
- If the returned statistic is larger than these critical values then
- for the corresponding significance level, the null hypothesis that
- the data come from the chosen distribution can be rejected.
- The returned statistic is referred to as 'A2' in the references.
- For `weibull_min`, maximum likelihood estimation is known to be
- challenging. If the test returns successfully, then the first order
- conditions for a maximum likelihood estimate have been verified and
- the critical values correspond relatively well to the significance levels,
- provided that the sample is sufficiently large (>10 observations [7]).
- However, for some data - especially data with no left tail - `anderson`
- is likely to result in an error message. In this case, consider
- performing a custom goodness of fit test using
- `scipy.stats.monte_carlo_test`.
- References
- ----------
- .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
- .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
- Some Comparisons, Journal of the American Statistical Association,
- Vol. 69, pp. 730-737.
- .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
- Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
- pp. 357-369.
- .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
- Distribution, Biometrika, Vol. 64, pp. 583-588.
- .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
- to Tests for Exponentiality , Technical Report No. 262,
- Department of Statistics, Stanford University, Stanford, CA.
- .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
- Based on the Empirical Distribution Function, Biometrika, Vol. 66,
- pp. 591-595.
- .. [7] Richard A. Lockhart and Michael A. Stephens "Estimation and Tests of
- Fit for the Three-Parameter Weibull Distribution"
- Journal of the Royal Statistical Society.Series B(Methodological)
- Vol. 56, No. 3 (1994), pp. 491-500, Table 0.
- .. [8] D'Agostino, Ralph B. (1986). "Tests for the Normal Distribution".
- In: Goodness-of-Fit Techniques. Ed. by Ralph B. D'Agostino and
- Michael A. Stephens. New York: Marcel Dekker, pp. 122-141. ISBN:
- 0-8247-7487-6.
- Examples
- --------
- Test the null hypothesis that a random sample was drawn from a normal
- distribution (with unspecified mean and standard deviation).
- >>> import numpy as np
- >>> from scipy.stats import anderson
- >>> rng = np.random.default_rng(9781234521)
- >>> data = rng.random(size=35)
- >>> res = anderson(data, dist='norm', method='interpolate')
- >>> res.statistic
- np.float64(0.9887620209957291)
- >>> res.pvalue
- np.float64(0.012111200538380142)
- The p-value is approximately 0.012,, so the null hypothesis may be rejected
- at a significance level of 2.5%, but not at a significance level of 1%.
- """ # numpy/numpydoc#87 # noqa: E501
- dist = dist.lower()
- if dist in {'extreme1', 'gumbel'}:
- dist = 'gumbel_l'
- dists = {'norm', 'expon', 'gumbel_l',
- 'gumbel_r', 'logistic', 'weibull_min'}
- if dist not in dists:
- raise ValueError(f"Invalid distribution; dist must be in {dists}.")
- y = sort(x)
- xbar = np.mean(x, axis=0)
- N = len(y)
- if dist == 'norm':
- s = np.std(x, ddof=1, axis=0)
- w = (y - xbar) / s
- fit_params = xbar, s
- logcdf = distributions.norm.logcdf(w)
- logsf = distributions.norm.logsf(w)
- sig = array([15, 10, 5, 2.5, 1])
- critical = around(_Avals_norm / (1.0 + 0.75/N + 2.25/N/N), 3)
- elif dist == 'expon':
- w = y / xbar
- fit_params = 0, xbar
- logcdf = distributions.expon.logcdf(w)
- logsf = distributions.expon.logsf(w)
- sig = array([15, 10, 5, 2.5, 1])
- critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
- elif dist == 'logistic':
- def rootfunc(ab, xj, N):
- a, b = ab
- tmp = (xj - a) / b
- tmp2 = exp(tmp)
- val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N,
- np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N]
- return array(val)
- sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
- sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
- w = (y - sol[0]) / sol[1]
- fit_params = sol
- logcdf = distributions.logistic.logcdf(w)
- logsf = distributions.logistic.logsf(w)
- sig = array([25, 10, 5, 2.5, 1, 0.5])
- critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
- elif dist == 'gumbel_r':
- xbar, s = distributions.gumbel_r.fit(x)
- w = (y - xbar) / s
- fit_params = xbar, s
- logcdf = distributions.gumbel_r.logcdf(w)
- logsf = distributions.gumbel_r.logsf(w)
- sig = array([25, 10, 5, 2.5, 1])
- critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
- elif dist == 'gumbel_l':
- xbar, s = distributions.gumbel_l.fit(x)
- w = (y - xbar) / s
- fit_params = xbar, s
- logcdf = distributions.gumbel_l.logcdf(w)
- logsf = distributions.gumbel_l.logsf(w)
- sig = array([25, 10, 5, 2.5, 1])
- critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
- elif dist == 'weibull_min':
- message = ("Critical values of the test statistic are given for the "
- "asymptotic distribution. These may not be accurate for "
- "samples with fewer than 10 observations. Consider using "
- "`scipy.stats.monte_carlo_test`.")
- if N < 10:
- warnings.warn(message, stacklevel=2)
- # [7] writes our 'c' as 'm', and they write `c = 1/m`. Use their names.
- m, loc, scale = distributions.weibull_min.fit(y)
- m, loc, scale = _weibull_fit_check((m, loc, scale), y)
- fit_params = m, loc, scale
- logcdf = stats.weibull_min(*fit_params).logcdf(y)
- logsf = stats.weibull_min(*fit_params).logsf(y)
- c = 1 / m # m and c are as used in [7]
- sig = array([0.5, 0.75, 0.85, 0.9, 0.95, 0.975, 0.99, 0.995])
- critical = _get_As_weibull(c)
- # Goodness-of-fit tests should only be used to provide evidence
- # _against_ the null hypothesis. Be conservative and round up.
- critical = np.round(critical + 0.0005, decimals=3)
- i = arange(1, N + 1)
- A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0)
- # FitResult initializer expects an optimize result, so let's work with it
- message = '`anderson` successfully fit the distribution to the data.'
- res = optimize.OptimizeResult(success=True, message=message)
- res.x = np.array(fit_params)
- fit_result = FitResult(getattr(distributions, dist), y,
- discrete=False, res=res)
- if method is None:
- warnings.warn(_anderson_warning_message, FutureWarning, stacklevel=2)
- return AndersonResult(A2, critical, sig, fit_result=fit_result)
- if method == 'interpolate':
- sig = 1 - sig if dist == 'weibull_min' else sig / 100
- pvalue = np.interp(A2, critical, sig)
- elif isinstance(method, stats.MonteCarloMethod):
- pvalue = _anderson_simulate_pvalue(x, dist, method)
- else:
- message = ("`method` must be either 'interpolate' or "
- "an instance of `MonteCarloMethod`.")
- raise ValueError(message)
- return SignificanceResult(statistic=A2, pvalue=pvalue)
- def _anderson_simulate_pvalue(x, dist, method):
- message = ("The `___` attribute of a `MonteCarloMethod` object passed as the "
- "`method` parameter of `scipy.stats.anderson` is ignored.")
- method = method._asdict()
- if method.pop('rvs', False):
- warnings.warn(message.replace('___', 'rvs'), UserWarning, stacklevel=3)
- if method.pop('batch', False):
- warnings.warn(message.replace('___', 'batch'), UserWarning, stacklevel=3)
- method['n_mc_samples'] = method.pop('n_resamples')
- kwargs= {'known_params': {'loc': 0}} if dist == 'expon' else {}
- dist = getattr(stats, dist)
- res = stats.goodness_of_fit(dist, x, statistic='ad', **kwargs, **method)
- return res.pvalue
- def _anderson_ksamp_continuous(samples, Z, Zstar, k, n, N):
- """Compute A2akN equation 3 of Scholz & Stephens.
- Parameters
- ----------
- samples : sequence of 1-D array_like
- Array of sample arrays.
- Z : array_like
- Sorted array of all observations.
- Zstar : array_like
- Sorted array of unique observations. Unused.
- k : int
- Number of samples.
- n : array_like
- Number of observations in each sample.
- N : int
- Total number of observations.
- Returns
- -------
- A2KN : float
- The A2KN statistics of Scholz and Stephens 1987.
- """
- A2kN = 0.
- j = np.arange(1, N)
- for i in arange(0, k):
- s = np.sort(samples[i])
- Mij = s.searchsorted(Z[:-1], side='right')
- inner = (N*Mij - j*n[i])**2 / (j * (N - j))
- A2kN += inner.sum() / n[i]
- return A2kN / N
- def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
- """Compute A2akN equation 7 of Scholz and Stephens.
- Parameters
- ----------
- samples : sequence of 1-D array_like
- Array of sample arrays.
- Z : array_like
- Sorted array of all observations.
- Zstar : array_like
- Sorted array of unique observations.
- k : int
- Number of samples.
- n : array_like
- Number of observations in each sample.
- N : int
- Total number of observations.
- Returns
- -------
- A2aKN : float
- The A2aKN statistics of Scholz and Stephens 1987.
- """
- A2akN = 0.
- Z_ssorted_left = Z.searchsorted(Zstar, 'left')
- if N == Zstar.size:
- lj = 1.
- else:
- lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
- Bj = Z_ssorted_left + lj / 2.
- for i in arange(0, k):
- s = np.sort(samples[i])
- s_ssorted_right = s.searchsorted(Zstar, side='right')
- Mij = s_ssorted_right.astype(float)
- fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
- Mij -= fij / 2.
- inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
- A2akN += inner.sum() / n[i]
- A2akN *= (N - 1.) / N
- return A2akN
- def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
- """Compute A2akN equation 6 of Scholz & Stephens.
- Parameters
- ----------
- samples : sequence of 1-D array_like
- Array of sample arrays.
- Z : array_like
- Sorted array of all observations.
- Zstar : array_like
- Sorted array of unique observations.
- k : int
- Number of samples.
- n : array_like
- Number of observations in each sample.
- N : int
- Total number of observations.
- Returns
- -------
- A2KN : float
- The A2KN statistics of Scholz and Stephens 1987.
- """
- A2kN = 0.
- lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1],
- 'left')
- Bj = lj.cumsum()
- for i in arange(0, k):
- s = np.sort(samples[i])
- Mij = s.searchsorted(Zstar[:-1], side='right')
- inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
- A2kN += inner.sum() / n[i]
- return A2kN
- Anderson_ksampResult = _make_tuple_bunch(
- 'Anderson_ksampResult',
- ['statistic', 'critical_values', 'pvalue'], []
- )
- @xp_capabilities(np_only=True)
- def anderson_ksamp(samples, midrank=_NoValue, *, variant=_NoValue, method=None):
- """The Anderson-Darling test for k-samples.
- The k-sample Anderson-Darling test is a modification of the
- one-sample Anderson-Darling test. It tests the null hypothesis
- that k-samples are drawn from the same population without having
- to specify the distribution function of that population. The
- critical values depend on the number of samples.
- Parameters
- ----------
- samples : sequence of 1-D array_like
- Array of sample data in arrays.
- midrank : bool, optional
- Variant of Anderson-Darling test which is computed. Default
- (True) is the midrank test applicable to continuous and
- discrete populations. If False, the right side empirical
- distribution is used.
- .. deprecated::1.17.0
- Use parameter `variant` instead.
- variant : {'midrank', 'right', 'continuous'}
- Variant of Anderson-Darling test to be computed. ``'midrank'`` is applicable
- to both continuous and discrete populations. ``'discrete'`` and ``'continuous'``
- perform alternative versions of the test for discrete and continuous
- populations, respectively.
- When `variant` is specified, the return object will not be unpackable as a
- tuple, and only attributes ``statistic`` and ``pvalue`` will be present.
- method : PermutationMethod, optional
- Defines the method used to compute the p-value. If `method` is an
- instance of `PermutationMethod`, the p-value is computed using
- `scipy.stats.permutation_test` with the provided configuration options
- and other appropriate settings. Otherwise, the p-value is interpolated
- from tabulated values.
- Returns
- -------
- res : Anderson_ksampResult
- An object containing attributes:
- statistic : float
- Normalized k-sample Anderson-Darling test statistic.
- critical_values : array
- The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%,
- 0.5%, 0.1%.
- .. deprecated::1.17.0
- Present only when `variant` is unspecified.
- pvalue : float
- The approximate p-value of the test. If `method` is not
- provided, the value is floored / capped at 0.1% / 25%.
- Raises
- ------
- ValueError
- If fewer than 2 samples are provided, a sample is empty, or no
- distinct observations are in the samples.
- See Also
- --------
- ks_2samp : 2 sample Kolmogorov-Smirnov test
- anderson : 1 sample Anderson-Darling test
- Notes
- -----
- [1]_ defines three versions of the k-sample Anderson-Darling test:
- one for continuous distributions and two for discrete
- distributions, in which ties between samples may occur. The
- default of this routine is to compute the version based on the
- midrank empirical distribution function. This test is applicable
- to continuous and discrete data. If `variant` is set to ``'discrete'``, the
- right side empirical distribution is used for a test for discrete
- data; if `variant` is ``'continuous'``, the same test statistic and p-value are
- computed for data with no ties, but with less computation. According to [1]_,
- the two discrete test statistics differ only slightly if a few collisions due
- to round-off errors occur in the test not adjusted for ties between samples.
- The critical values corresponding to the significance levels from 0.01
- to 0.25 are taken from [1]_. p-values are floored / capped
- at 0.1% / 25%. Since the range of critical values might be extended in
- future releases, it is recommended not to test ``p == 0.25``, but rather
- ``p >= 0.25`` (analogously for the lower bound).
- .. versionadded:: 0.14.0
- References
- ----------
- .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample
- Anderson-Darling Tests, Journal of the American Statistical
- Association, Vol. 82, pp. 918-924.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> rng = np.random.default_rng(44925884305279435)
- >>> res = stats.anderson_ksamp([rng.normal(size=50), rng.normal(loc=0.5, size=30)],
- ... variant='midrank')
- >>> res.statistic, res.pvalue
- (3.4444310693448936, 0.013106682406720973)
- The null hypothesis that the two random samples come from the same
- distribution can be rejected at the 5% level because the returned
- p-value is less than 0.05, but not at the 1% level.
- >>> samples = [rng.normal(size=50), rng.normal(size=30),
- ... rng.normal(size=20)]
- >>> res = stats.anderson_ksamp(samples, variant='continuous')
- >>> res.statistic, res.pvalue
- (-0.6309662273193832, 0.25)
- As we might expect, the null hypothesis cannot be rejected here for three samples
- from an identical distribution. The reported p-value (25%) has been capped at the
- maximum value for which pre-computed p-values are available.
- In such cases where the p-value is capped or when sample sizes are
- small, a permutation test may be more accurate.
- >>> method = stats.PermutationMethod(n_resamples=9999, random_state=rng)
- >>> res = stats.anderson_ksamp(samples, variant='continuous', method=method)
- >>> res.pvalue
- 0.699
- """
- k = len(samples)
- if (k < 2):
- raise ValueError("anderson_ksamp needs at least two samples")
- samples = list(map(np.asarray, samples))
- Z = np.sort(np.hstack(samples))
- N = Z.size
- Zstar = np.unique(Z)
- if Zstar.size < 2:
- raise ValueError("anderson_ksamp needs more than one distinct "
- "observation")
- n = np.array([sample.size for sample in samples])
- if np.any(n == 0):
- raise ValueError("anderson_ksamp encountered sample without "
- "observations")
- if variant == _NoValue or midrank != _NoValue:
- message = ("Parameter `variant` has been introduced to replace `midrank`; "
- "`midrank` will be removed in SciPy 1.19.0. Specify `variant` to "
- "silence this warning. Note that the returned object will no longer "
- "be unpackable as a tuple, and `critical_values` will be omitted.")
- warnings.warn(message, category=UserWarning, stacklevel=2)
- return_critical_values = False
- if variant == _NoValue:
- return_critical_values = True
- variant = 'midrank' if midrank else 'right'
- if variant == 'midrank':
- A2kN_fun = _anderson_ksamp_midrank
- elif variant == 'right':
- A2kN_fun = _anderson_ksamp_right
- elif variant == 'continuous':
- A2kN_fun = _anderson_ksamp_continuous
- else:
- message = "`variant` must be one of 'midrank', 'right', or 'continuous'."
- raise ValueError(message)
- A2kN = A2kN_fun(samples, Z, Zstar, k, n, N)
- def statistic(*samples):
- return A2kN_fun(samples, Z, Zstar, k, n, N)
- if method is not None:
- res = stats.permutation_test(samples, statistic, **method._asdict(),
- alternative='greater')
- H = (1. / n).sum()
- hs_cs = (1. / arange(N - 1, 1, -1)).cumsum()
- h = hs_cs[-1] + 1
- g = (hs_cs / arange(2, N)).sum()
- a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
- b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
- c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
- d = (2*h + 6)*k**2 - 4*h*k
- sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
- m = k - 1
- A2 = (A2kN - m) / math.sqrt(sigmasq)
- # The b_i values are the interpolation coefficients from Table 2
- # of Scholz and Stephens 1987
- b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085])
- b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615])
- b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154])
- critical = b0 + b1 / math.sqrt(m) + b2 / m
- sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001])
- if A2 < critical.min() and method is None:
- p = sig.max()
- msg = (f"p-value capped: true value larger than {p}. Consider "
- "specifying `method` "
- "(e.g. `method=stats.PermutationMethod()`.)")
- warnings.warn(msg, stacklevel=2)
- elif A2 > critical.max() and method is None:
- p = sig.min()
- msg = (f"p-value floored: true value smaller than {p}. Consider "
- "specifying `method` "
- "(e.g. `method=stats.PermutationMethod()`.)")
- warnings.warn(msg, stacklevel=2)
- elif method is None:
- # interpolation of probit of significance level
- pf = np.polyfit(critical, log(sig), 2)
- p = math.exp(np.polyval(pf, A2))
- else:
- p = res.pvalue if method is not None else p
- if return_critical_values:
- # create result object with alias for backward compatibility
- res = Anderson_ksampResult(A2, critical, p)
- res.significance_level = p
- else:
- res = SignificanceResult(statistic=A2, pvalue=p)
- return res
- AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue'))
- class _ABW:
- """Distribution of Ansari-Bradley W-statistic under the null hypothesis."""
- # TODO: calculate exact distribution considering ties
- # We could avoid summing over more than half the frequencies,
- # but initially it doesn't seem worth the extra complexity
- def __init__(self):
- """Minimal initializer."""
- self.m = None
- self.n = None
- self.astart = None
- self.total = None
- self.freqs = None
- def _recalc(self, n, m):
- """When necessary, recalculate exact distribution."""
- if n != self.n or m != self.m:
- self.n, self.m = n, m
- # distribution is NOT symmetric when m + n is odd
- # n is len(x), m is len(y), and ratio of scales is defined x/y
- astart, a1, _ = gscale(n, m)
- self.astart = astart # minimum value of statistic
- # Exact distribution of test statistic under null hypothesis
- # expressed as frequencies/counts/integers to maintain precision.
- # Stored as floats to avoid overflow of sums.
- self.freqs = a1.astype(np.float64)
- self.total = self.freqs.sum() # could calculate from m and n
- # probability mass is self.freqs / self.total;
- def pmf(self, k, n, m):
- """Probability mass function."""
- self._recalc(n, m)
- # The convention here is that PMF at k = 12.5 is the same as at k = 12,
- # -> use `floor` in case of ties.
- ind = np.floor(k - self.astart).astype(int)
- return self.freqs[ind] / self.total
- def cdf(self, k, n, m):
- """Cumulative distribution function."""
- self._recalc(n, m)
- # Null distribution derived without considering ties is
- # approximate. Round down to avoid Type I error.
- ind = np.ceil(k - self.astart).astype(int)
- return self.freqs[:ind+1].sum() / self.total
- def sf(self, k, n, m):
- """Survival function."""
- self._recalc(n, m)
- # Null distribution derived without considering ties is
- # approximate. Round down to avoid Type I error.
- ind = np.floor(k - self.astart).astype(int)
- return self.freqs[ind:].sum() / self.total
- # Maintain state for faster repeat calls to ansari w/ method='exact'
- # _ABW() is calculated once per thread and stored as an attribute on
- # this thread-local variable inside ansari().
- _abw_state = threading.local()
- @xp_capabilities(cpu_only=True, jax_jit=False, # p-value is Cython
- skip_backends=[('dask.array', 'no rankdata')])
- @_axis_nan_policy_factory(AnsariResult, n_samples=2)
- def ansari(x, y, alternative='two-sided', *, axis=0):
- """Perform the Ansari-Bradley test for equal scale parameters.
- The Ansari-Bradley test ([1]_, [2]_) is a non-parametric test
- for the equality of the scale parameter of the distributions
- from which two samples were drawn. The null hypothesis states that
- the ratio of the scale of the distribution underlying `x` to the scale
- of the distribution underlying `y` is 1.
- Parameters
- ----------
- x, y : array_like
- Arrays of sample data.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis. Default is 'two-sided'.
- The following options are available:
- * 'two-sided': the ratio of scales is not equal to 1.
- * 'less': the ratio of scales is less than 1.
- * 'greater': the ratio of scales is greater than 1.
- .. versionadded:: 1.7.0
- axis : int or tuple of ints, default: 0
- If an int or tuple of ints, the axis or axes of the input along which
- to compute the statistic. The statistic of each axis-slice (e.g. row)
- of the input will appear in a corresponding element of the output.
- If ``None``, the input will be raveled before computing the statistic.
- Returns
- -------
- statistic : float
- The Ansari-Bradley test statistic.
- pvalue : float
- The p-value of the hypothesis test.
- See Also
- --------
- fligner : A non-parametric test for the equality of k variances
- mood : A non-parametric test for the equality of two scale parameters
- Notes
- -----
- The p-value given is exact when the sample sizes are both less than
- 55 and there are no ties, otherwise a normal approximation for the
- p-value is used.
- References
- ----------
- .. [1] Ansari, A. R. and Bradley, R. A. (1960) Rank-sum tests for
- dispersions, Annals of Mathematical Statistics, 31, 1174-1189.
- .. [2] Sprent, Peter and N.C. Smeeton. Applied nonparametric
- statistical methods. 3rd ed. Chapman and Hall/CRC. 2001.
- Section 5.8.2.
- .. [3] Nathaniel E. Helwig "Nonparametric Dispersion and Equality
- Tests" at http://users.stat.umn.edu/~helwig/notes/npde-Notes.pdf
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import ansari
- >>> rng = np.random.default_rng()
- For these examples, we'll create three random data sets. The first
- two, with sizes 35 and 25, are drawn from a normal distribution with
- mean 0 and standard deviation 2. The third data set has size 25 and
- is drawn from a normal distribution with standard deviation 1.25.
- >>> x1 = rng.normal(loc=0, scale=2, size=35)
- >>> x2 = rng.normal(loc=0, scale=2, size=25)
- >>> x3 = rng.normal(loc=0, scale=1.25, size=25)
- First we apply `ansari` to `x1` and `x2`. These samples are drawn
- from the same distribution, so we expect the Ansari-Bradley test
- should not lead us to conclude that the scales of the distributions
- are different.
- >>> ansari(x1, x2)
- AnsariResult(statistic=541.0, pvalue=0.9762532927399098)
- With a p-value close to 1, we cannot conclude that there is a
- significant difference in the scales (as expected).
- Now apply the test to `x1` and `x3`:
- >>> ansari(x1, x3)
- AnsariResult(statistic=425.0, pvalue=0.0003087020407974518)
- The probability of observing such an extreme value of the statistic
- under the null hypothesis of equal scales is only 0.03087%. We take this
- as evidence against the null hypothesis in favor of the alternative:
- the scales of the distributions from which the samples were drawn
- are not equal.
- We can use the `alternative` parameter to perform a one-tailed test.
- In the above example, the scale of `x1` is greater than `x3` and so
- the ratio of scales of `x1` and `x3` is greater than 1. This means
- that the p-value when ``alternative='greater'`` should be near 0 and
- hence we should be able to reject the null hypothesis:
- >>> ansari(x1, x3, alternative='greater')
- AnsariResult(statistic=425.0, pvalue=0.0001543510203987259)
- As we can see, the p-value is indeed quite low. Use of
- ``alternative='less'`` should thus yield a large p-value:
- >>> ansari(x1, x3, alternative='less')
- AnsariResult(statistic=425.0, pvalue=0.9998643258449039)
- """
- xp = array_namespace(x, y)
- dtype = xp_result_type(x, y, force_floating=True, xp=xp)
- if alternative not in {'two-sided', 'greater', 'less'}:
- raise ValueError("'alternative' must be 'two-sided',"
- " 'greater', or 'less'.")
- if not hasattr(_abw_state, 'a'):
- _abw_state.a = _ABW()
- # _axis_nan_policy decorator guarantees that axis=-1
- n = x.shape[-1]
- m = y.shape[-1]
- if m < 1: # needed by test_axis_nan_policy; not user-facing
- raise ValueError("Not enough other observations.")
- if n < 1:
- raise ValueError("Not enough test observations.")
- N = m + n
- xy = xp.concat([x, y], axis=-1) # combine
- rank, t = _stats_py._rankdata(xy, method='average', return_ties=True)
- rank, t = xp.astype(rank, dtype), xp.astype(t, dtype)
- symrank = xp.minimum(rank, N - rank + 1)
- AB = xp.sum(symrank[..., :n], axis=-1)
- repeats = xp.any(t > 1) # in theory we could branch for each slice separately
- exact = ((m < 55) and (n < 55) and not repeats)
- if exact:
- # np.vectorize converts to NumPy here, and we convert back to the result
- # type before returning
- cdf = np.vectorize(_abw_state.a.cdf, otypes=[np.float64])
- sf = np.vectorize(_abw_state.a.sf, otypes=[np.float64])
- if alternative == 'two-sided':
- pval = 2.0 * np.minimum(cdf(AB, n, m),
- sf(AB, n, m))
- elif alternative == 'greater':
- # AB statistic is _smaller_ when ratio of scales is larger,
- # so this is the opposite of the usual calculation
- pval = cdf(AB, n, m)
- else:
- pval = sf(AB, n, m)
- pval = xp.clip(xp.asarray(pval, dtype=dtype), max=1.0)
- AB = AB[()] if AB.ndim == 0 else AB
- pval = pval[()] if pval.ndim == 0 else pval
- return AnsariResult(AB, pval)
- mnAB = (n * (N + 1.0) ** 2 / 4.0 / N) if N % 2 else (n * (N + 2.0) / 4.0)
- if repeats: # adjust variance estimates
- # compute np.sum(tj * rj**2,axis=0)
- fac = xp.sum(symrank**2, axis=-1)
- if N % 2: # N odd
- varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1))
- else: # N even
- varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1))
- else:
- # otherwise compute normal approximation
- if N % 2: # N odd
- varAB = n * m * (N + 1.0) * (3 + N ** 2) / (48.0 * N ** 2)
- else:
- varAB = m * n * (N + 2) * (N - 2.0) / 48 / (N - 1.0)
- varAB = xp.asarray(varAB, dtype=dtype)
- # Small values of AB indicate larger dispersion for the x sample.
- # Large values of AB indicate larger dispersion for the y sample.
- # This is opposite to the way we define the ratio of scales. see [1]_.
- z = (mnAB - AB) / xp.sqrt(varAB)
- pvalue = _get_pvalue(z, _SimpleNormal(), alternative, xp=xp)
- AB = AB[()] if AB.ndim == 0 else AB
- pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
- return AnsariResult(AB, pvalue)
- BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue'))
- @xp_capabilities()
- @_axis_nan_policy_factory(BartlettResult, n_samples=None)
- def bartlett(*samples, axis=0):
- r"""Perform Bartlett's test for equal variances.
- Bartlett's test tests the null hypothesis that all input samples
- are from populations with equal variances. For samples
- from significantly non-normal populations, Levene's test
- `levene` is more robust.
- Parameters
- ----------
- sample1, sample2, ... : array_like
- arrays of sample data. Only 1d arrays are accepted, they may have
- different lengths.
- Returns
- -------
- statistic : float
- The test statistic.
- pvalue : float
- The p-value of the test.
- See Also
- --------
- fligner : A non-parametric test for the equality of k variances
- levene : A robust parametric test for equality of k variances
- :ref:`hypothesis_bartlett` : Extended example
- Notes
- -----
- Conover et al. (1981) examine many of the existing parametric and
- nonparametric tests by extensive simulations and they conclude that the
- tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
- superior in terms of robustness of departures from normality and power
- ([3]_).
- References
- ----------
- .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
- .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical
- Methods, Eighth Edition, Iowa State University Press.
- .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
- Hypothesis Testing based on Quadratic Inference Function. Technical
- Report #99-03, Center for Likelihood Studies, Pennsylvania State
- University.
- .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical
- Tests. Proceedings of the Royal Society of London. Series A,
- Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282.
- Examples
- --------
- Test whether the lists `a`, `b` and `c` come from populations
- with equal variances.
- >>> import numpy as np
- >>> from scipy import stats
- >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
- >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
- >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
- >>> stat, p = stats.bartlett(a, b, c)
- >>> p
- 1.1254782518834628e-05
- The very small p-value suggests that the populations do not have equal
- variances.
- This is not surprising, given that the sample variance of `b` is much
- larger than that of `a` and `c`:
- >>> [np.var(x, ddof=1) for x in [a, b, c]]
- [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
- For a more detailed example, see :ref:`hypothesis_bartlett`.
- """
- xp = array_namespace(*samples)
- k = len(samples)
- if k < 2:
- raise ValueError("Must enter at least two input sample vectors.")
- if axis is None:
- samples = [xp_ravel(sample) for sample in samples]
- else:
- samples = _broadcast_arrays(samples, axis=axis, xp=xp)
- samples = [xp.moveaxis(sample, axis, -1) for sample in samples]
- Ni = [xp.asarray(_length_nonmasked(sample, axis=-1, xp=xp),
- dtype=sample.dtype, device=xp_device(sample))
- for sample in samples]
- Ni = [xp.broadcast_to(N, samples[0].shape[:-1]) for N in Ni]
- ssq = [xp.var(sample, correction=1, axis=-1) for sample in samples]
- Ni = [arr[xp.newaxis, ...] for arr in Ni]
- ssq = [arr[xp.newaxis, ...] for arr in ssq]
- Ni = xp.concat(Ni, axis=0)
- Ni = xpx.at(Ni)[Ni == 0].set(xp.nan)
- ssq = xp.concat(ssq, axis=0)
- dtype = Ni.dtype
- Ntot = xp.sum(Ni, axis=0)
- spsq = xp.sum((Ni - 1)*ssq, axis=0, dtype=dtype) / (Ntot - k)
- numer = ((Ntot - k) * xp.log(spsq)
- - xp.sum((Ni - 1)*xp.log(ssq), axis=0, dtype=dtype))
- denom = (1 + 1/(3*(k - 1))
- * ((xp.sum(1/(Ni - 1), axis=0)) - 1/(Ntot - k)))
- T = numer / denom
- chi2 = _SimpleChi2(xp.asarray(k-1, dtype=dtype, device=xp_device(T)))
- pvalue = _get_pvalue(T, chi2, alternative='greater', symmetric=False, xp=xp)
- T = xp.clip(T, min=0., max=xp.inf)
- T = T[()] if T.ndim == 0 else T
- pvalue = pvalue[()] if pvalue.ndim == 0 else pvalue
- return BartlettResult(T, pvalue)
- LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue'))
- @xp_capabilities(cpu_only=True, exceptions=['cupy'])
- @_axis_nan_policy_factory(LeveneResult, n_samples=None)
- def levene(*samples, center='median', proportiontocut=0.05, axis=0):
- r"""Perform Levene test for equal variances.
- The Levene test tests the null hypothesis that all input samples
- are from populations with equal variances. Levene's test is an
- alternative to Bartlett's test `bartlett` in the case where
- there are significant deviations from normality.
- Parameters
- ----------
- sample1, sample2, ... : array_like
- The sample data, possibly with different lengths.
- center : {'mean', 'median', 'trimmed'}, optional
- Which statistics to use to center data points within each sample. Default
- is 'median'.
- proportiontocut : float, optional
- When `center` is 'trimmed', this gives the proportion of data points
- to cut from each end. (See `scipy.stats.trim_mean`.)
- Default is 0.05.
- axis : int or tuple of ints, default: 0
- If an int or tuple of ints, the axis or axes of the input along which
- to compute the statistic. The statistic of each axis-slice (e.g. row)
- of the input will appear in a corresponding element of the output.
- If ``None``, the input will be raveled before computing the statistic.
- Returns
- -------
- statistic : float
- The test statistic.
- pvalue : float
- The p-value for the test.
- See Also
- --------
- fligner : A non-parametric test for the equality of k variances
- bartlett : A parametric test for equality of k variances in normal samples
- :ref:`hypothesis_levene` : Extended example
- Notes
- -----
- Three variations of Levene's test are possible. The possibilities
- and their recommended usages are:
- * 'median' : Recommended for skewed (non-normal) distributions>
- * 'mean' : Recommended for symmetric, moderate-tailed distributions.
- * 'trimmed' : Recommended for heavy-tailed distributions.
- The test version using the mean was proposed in the original article
- of Levene ([2]_) while the median and trimmed mean have been studied by
- Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe
- test.
- References
- ----------
- .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
- .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
- Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
- Stanford University Press, pp. 278-292.
- .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
- Statistical Association, 69, 364-367
- Examples
- --------
- Test whether the lists `a`, `b` and `c` come from populations
- with equal variances.
- >>> import numpy as np
- >>> from scipy import stats
- >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
- >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
- >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
- >>> stat, p = stats.levene(a, b, c)
- >>> p
- 0.002431505967249681
- The small p-value suggests that the populations do not have equal
- variances.
- This is not surprising, given that the sample variance of `b` is much
- larger than that of `a` and `c`:
- >>> [np.var(x, ddof=1) for x in [a, b, c]]
- [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
- For a more detailed example, see :ref:`hypothesis_levene`.
- """
- xp = array_namespace(*samples)
- if center not in ['mean', 'median', 'trimmed']:
- raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
- k = len(samples)
- if k < 2:
- raise ValueError("Must provide at least two samples.")
- if center == 'median':
- def func(x):
- return (xp.median(x, axis=-1, keepdims=True)
- if (is_numpy(xp) or is_dask(xp))
- else stats.quantile(x, 0.5, axis=-1, keepdims=True))
- elif center == 'mean':
- def func(x):
- return xp.mean(x, axis=-1, keepdims=True)
- else: # center == 'trimmed'
- def func(x):
- # keepdims=True doesn't currently work for lazy arrays
- return _stats_py.trim_mean(x, proportiontocut, axis=-1)[..., xp.newaxis]
- Nis = [sample.shape[-1] for sample in samples]
- Ycis = [func(sample) for sample in samples]
- Ntot = sum(Nis)
- # compute Zij's
- Zijs = [xp.abs(sample - Yc) for sample, Yc in zip(samples, Ycis)]
- # compute Zbari
- Zbaris = [xp.mean(Zij, axis=-1, keepdims=True) for Zij in Zijs]
- Zbar = sum(Ni*Zbari for Ni, Zbari in zip(Nis, Zbaris)) / Ntot
- # compute numerator and denominator
- dfd = (Ntot - k)
- numer = dfd * sum(Ni * (Zbari - Zbar)**2
- for Ni, Zbari in zip(Nis, Zbaris))
- dfn = (k - 1.0)
- denom = dfn * sum(xp.sum((Zij - Zbari)**2, axis=-1, keepdims=True)
- for Zij, Zbari in zip(Zijs, Zbaris))
- W = numer / denom
- W = xp.squeeze(W, axis=-1)
- dfn, dfd = xp.asarray(dfn, dtype=W.dtype), xp.asarray(dfd, dtype=W.dtype)
- pval = _get_pvalue(W, _SimpleF(dfn, dfd), 'greater', xp=xp)
- return LeveneResult(W[()], pval[()])
- FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue'))
- @xp_capabilities(skip_backends=[('dask.array', 'no rankdata'),
- ('cupy', 'no rankdata')], jax_jit=False)
- @_axis_nan_policy_factory(FlignerResult, n_samples=None)
- def fligner(*samples, center='median', proportiontocut=0.05, axis=0):
- r"""Perform Fligner-Killeen test for equality of variance.
- Fligner's test tests the null hypothesis that all input samples
- are from populations with equal variances. Fligner-Killeen's test is
- distribution free when populations are identical [2]_.
- Parameters
- ----------
- sample1, sample2, ... : array_like
- Arrays of sample data. Need not be the same length.
- center : {'mean', 'median', 'trimmed'}, optional
- Which statistics to use to center data points within each sample. Default
- is 'median'.
- proportiontocut : float, optional
- When `center` is 'trimmed', this gives the proportion of data points
- to cut from each end. (See `scipy.stats.trim_mean`.)
- Default is 0.05.
- axis : int or tuple of ints, default: 0
- If an int or tuple of ints, the axis or axes of the input along which
- to compute the statistic. The statistic of each axis-slice (e.g. row)
- of the input will appear in a corresponding element of the output.
- If ``None``, the input will be raveled before computing the statistic.
- Returns
- -------
- statistic : float
- The test statistic.
- pvalue : float
- The p-value for the hypothesis test.
- See Also
- --------
- bartlett : A parametric test for equality of k variances in normal samples
- levene : A robust parametric test for equality of k variances
- :ref:`hypothesis_fligner` : Extended example
- Notes
- -----
- As with Levene's test there are three variants of Fligner's test that
- differ by the measure of central tendency used in the test. See `levene`
- for more information.
- Conover et al. (1981) examine many of the existing parametric and
- nonparametric tests by extensive simulations and they conclude that the
- tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
- superior in terms of robustness of departures from normality and power
- [3]_.
- References
- ----------
- .. [1] Qu, A., Lindsay, B. G., and Li, B. (2000). Improving generalized
- estimating equations using quadratic inference functions.
- Biometrika, 87(4), 823-836.
- :doi:`10.1093/biomet/87.4.823`
- .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
- tests for scale. Journal of the American Statistical Association.
- 71(353), 210-213.
- .. [3] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A
- comparative study of tests for homogeneity of variances, with
- applications to the outer continental shelf bidding data.
- Technometrics, 23(4), 351-361.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- Test whether the lists `a`, `b` and `c` come from populations
- with equal variances.
- >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
- >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
- >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
- >>> stat, p = stats.fligner(a, b, c)
- >>> p
- 0.00450826080004775
- The small p-value suggests that the populations do not have equal
- variances.
- This is not surprising, given that the sample variance of `b` is much
- larger than that of `a` and `c`:
- >>> [np.var(x, ddof=1) for x in [a, b, c]]
- [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
- For a more detailed example, see :ref:`hypothesis_fligner`.
- """
- xp = array_namespace(*samples)
- if center not in ['mean', 'median', 'trimmed']:
- raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
- k = len(samples)
- if k < 2:
- raise ValueError("Must provide at least two samples.")
- samples = xp_promote(*samples, force_floating=True, xp=xp)
- dtype = samples[0].dtype
- # Handle empty input
- for sample in samples:
- if sample.size == 0:
- NaN = _get_nan(*samples, xp=xp)
- return FlignerResult(NaN, NaN)
- if center == 'median':
- def func(x):
- return (xp.median(x, axis=-1, keepdims=True)
- if (is_numpy(xp) or is_dask(xp))
- else stats.quantile(x, 0.5, axis=-1, keepdims=True))
- elif center == 'mean':
- def func(x):
- return xp.mean(x, axis=-1, keepdims=True)
- else: # center == 'trimmed'
- def func(x):
- # keepdims=True doesn't currently work for lazy arrays
- return _stats_py.trim_mean(x, proportiontocut, axis=-1)[..., xp.newaxis]
- ni = [sample.shape[-1] for sample in samples]
- N = sum(ni)
- # Implementation follows [3] pg 355 F-K.
- Xibar = [func(sample) for sample in samples]
- Xij_Xibar = [xp.abs(sample - Xibar_) for sample, Xibar_ in zip(samples, Xibar)]
- Xij_Xibar = xp.concat(Xij_Xibar, axis=-1)
- ranks = _stats_py._rankdata(Xij_Xibar, method='average', xp=xp)
- ranks = xp.astype(ranks, dtype)
- a_Ni = special.ndtri(ranks / (2*(N + 1.0)) + 0.5)
- # [3] Equation 2.1
- splits = list(itertools.accumulate(ni, initial=0))
- Ai = [a_Ni[..., i:j] for i, j in zip(splits[:-1], splits[1:])]
- Aibar = [xp.mean(Ai_, axis=-1) for Ai_ in Ai]
- abar = xp.mean(a_Ni, axis=-1)
- V2 = xp.var(a_Ni, axis=-1, correction=1)
- statistic = sum(ni_ * (Aibar_ - abar)**2 for ni_, Aibar_ in zip(ni, Aibar)) / V2
- chi2 = _SimpleChi2(xp.asarray(k-1, dtype=dtype))
- pval = _get_pvalue(statistic, chi2, alternative='greater', symmetric=False, xp=xp)
- return FlignerResult(statistic, pval)
- def _mood_statistic_with_ties(x, y, t, m, n, N, xp):
- # First equation of "Mood's Squared Rank Test", Mielke pg 313
- E_0_T = m * (N * N - 1) / 12
- # m, n, N, t, and S are defined in the second paragraph of Mielke pg 312
- # The only difference is that our `t` has zeros interspersed with the relevant
- # numbers to keep the array rectangular, but these terms add nothing to the sum.
- S = xp.cumulative_sum(t, include_initial=True, axis=-1)
- S_i, S_i_m1 = S[..., 1:], S[..., :-1]
- # Second equation of "Mood's Squared Rank Test", Mielke pg 313
- varM = (m * n * (N + 1.0) * (N**2 - 4) / 180
- - m * n / (180 * N * (N - 1))
- * xp.sum(t * (t ** 2 - 1) * (t ** 2 - 4 + (15 * (N - S_i - S_i_m1) ** 2)),
- axis=-1))
- # There is a formula for Phi (`phi` in code) in terms of t, S, and Psi(I) at the
- # bottom of Mielke pg 312. Psi(I) = [I - (N+1)/2]^2 is defined (with a mistake in
- # the location of the ^2) at the beginning of "Mood's Squared Rank Test" (pg 313).
- # To vectorize this calculation, let c = (N + 1) / 2, so Psi(I) = I^2 - 2*c*I + c^2.
- # We sum each of these three parts of Psi separately using formulas for sums from a
- # to b (inclusive) of terms I^2, I, and 1 where I takes on successive integers.
- def sum_I2(a, b=None):
- return (a * (a + 1) * (2 * a + 1) / 6 if b is None
- else sum_I2(b) - sum_I2(a) + a**2)
- def sum_I(a, b=None):
- return (a * (a + 1) / 2 if b is None
- else sum_I(b) - sum_I(a) + a)
- def sum_1(a, b):
- return (b - a) + 1
- with np.errstate(invalid='ignore', divide='ignore'):
- sum_I2 = sum_I2(S_i_m1 + 1, S_i)
- sum_I = sum_I(S_i_m1 + 1, S_i)
- sum_1 = sum_1(S_i_m1 + 1, S_i)
- c = (N + 1) / 2
- phi = (sum_I2 - 2*c*sum_I + sum_1*c**2) / t
- phi = xpx.at(phi)[t == 0].set(0.) # where t = 0 we get NaNs; eliminate them
- # Mielke pg 312 defines `a` as the count of elements in sample `x` for each of the
- # unique values in the combined sample. The tricky thing is getting these to line
- # up with the locations of nonzero elements in `t`/`phi`.
- x = xp.sort(x, axis=-1)
- xy = xp.concat((x, y), axis=-1)
- i = xp.argsort(xy, stable=True, axis=-1)
- _, a = _stats_py._rankdata(x, method='average', return_ties=True)
- a = xp.astype(a, phi.dtype)
- zeros = xp.zeros(a.shape[:-1] + (n,), dtype=a.dtype)
- a = xp.concat((a, zeros), axis=-1)
- a = xp.take_along_axis(a, i, axis=-1)
- # Mielke pg 312 defines test statistic `T` as the inner product `a` and `phi`
- T = xp.vecdot(a, phi, axis=-1)
- return (T - E_0_T) / xp.sqrt(varM)
- def _mood_statistic_no_ties(r, m, n, N, xp):
- rx = r[..., :m]
- M = xp.sum((rx - (N + 1.0) / 2) ** 2, axis=-1)
- E_0_T = m * (N * N - 1.0) / 12
- varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180
- return (M - E_0_T) / math.sqrt(varM)
- def _mood_too_small(samples, kwargs, axis=-1):
- x, y = samples
- m = x.shape[axis]
- n = y.shape[axis]
- N = m + n
- return N < 3
- @xp_capabilities(skip_backends=[('cupy', 'no rankdata'), ('dask.array', 'no rankdata')])
- @_axis_nan_policy_factory(SignificanceResult, n_samples=2, too_small=_mood_too_small)
- def mood(x, y, axis=0, alternative="two-sided"):
- """Perform Mood's test for equal scale parameters.
- Mood's two-sample test for scale parameters is a non-parametric
- test for the null hypothesis that two samples are drawn from the
- same distribution with the same scale parameter.
- Parameters
- ----------
- x, y : array_like
- Arrays of sample data. There must be at least three observations
- total.
- axis : int, optional
- The axis along which the samples are tested. `x` and `y` can be of
- different length along `axis`.
- If `axis` is None, `x` and `y` are flattened and the test is done on
- all values in the flattened arrays.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis. Default is 'two-sided'.
- The following options are available:
- * 'two-sided': the scales of the distributions underlying `x` and `y`
- are different.
- * 'less': the scale of the distribution underlying `x` is less than
- the scale of the distribution underlying `y`.
- * 'greater': the scale of the distribution underlying `x` is greater
- than the scale of the distribution underlying `y`.
- .. versionadded:: 1.7.0
- Returns
- -------
- res : SignificanceResult
- An object containing attributes:
- statistic : scalar or ndarray
- The z-score for the hypothesis test. For 1-D inputs a scalar is
- returned.
- pvalue : scalar ndarray
- The p-value for the hypothesis test.
- See Also
- --------
- fligner : A non-parametric test for the equality of k variances
- ansari : A non-parametric test for the equality of 2 variances
- bartlett : A parametric test for equality of k variances in normal samples
- levene : A parametric test for equality of k variances
- Notes
- -----
- The data are assumed to be drawn from probability distributions ``f(x)``
- and ``f(x/s) / s`` respectively, for some probability density function f.
- The null hypothesis is that ``s == 1``.
- For multi-dimensional arrays, if the inputs are of shapes
- ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
- resulting z and p values will have shape ``(n0, n2, n3)``. Note that
- ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
- References
- ----------
- [1] Mielke, Paul W. "Note on Some Squared Rank Tests with Existing Ties."
- Technometrics, vol. 9, no. 2, 1967, pp. 312-14. JSTOR,
- https://doi.org/10.2307/1266427. Accessed 18 May 2022.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> rng = np.random.default_rng()
- >>> x2 = rng.standard_normal((2, 45, 6, 7))
- >>> x1 = rng.standard_normal((2, 30, 6, 7))
- >>> res = stats.mood(x1, x2, axis=1)
- >>> res.pvalue.shape
- (2, 6, 7)
- Find the number of points where the difference in scale is not significant:
- >>> (res.pvalue > 0.1).sum()
- 78
- Perform the test with different scales:
- >>> x1 = rng.standard_normal((2, 30))
- >>> x2 = rng.standard_normal((2, 35)) * 10.0
- >>> stats.mood(x1, x2, axis=1)
- SignificanceResult(statistic=array([-5.76174136, -6.12650783]),
- pvalue=array([8.32505043e-09, 8.98287869e-10]))
- """
- xp = array_namespace(x, y)
- x, y = xp_promote(x, y, force_floating=True, xp=xp)
- dtype = x.dtype
- # _axis_nan_policy decorator ensures axis=-1
- xy = xp.concat((x, y), axis=-1)
- m = x.shape[-1]
- n = y.shape[-1]
- N = m + n
- if m == 0 or n == 0 or N < 3: # only needed for test_axis_nan_policy
- NaN = _get_nan(x, y, xp=xp)
- return SignificanceResult(NaN, NaN)
- # determine if any of the samples contain ties
- # `a` represents ties within `x`; `t` represents ties within `xy`
- r, t = _stats_py._rankdata(xy, method='average', return_ties=True)
- r, t = xp.asarray(r, dtype=dtype), xp.asarray(t, dtype=dtype)
- if xp.any(t > 1):
- z = _mood_statistic_with_ties(x, y, t, m, n, N, xp=xp)
- else:
- z = _mood_statistic_no_ties(r, m, n, N, xp=xp)
- pval = _get_pvalue(z, _SimpleNormal(), alternative, xp=xp)
- z = z[()] if z.ndim == 0 else z
- pval = pval[()] if pval.ndim == 0 else pval
- return SignificanceResult(z, pval)
- WilcoxonResult = _make_tuple_bunch('WilcoxonResult', ['statistic', 'pvalue'])
- def wilcoxon_result_unpacker(res, _):
- if hasattr(res, 'zstatistic'):
- return res.statistic, res.pvalue, res.zstatistic
- else:
- return res.statistic, res.pvalue
- def wilcoxon_result_object(statistic, pvalue, zstatistic=None):
- res = WilcoxonResult(statistic, pvalue)
- if zstatistic is not None:
- res.zstatistic = zstatistic
- return res
- def wilcoxon_outputs(kwds):
- method = kwds.get('method', 'auto')
- if method == 'asymptotic':
- return 3
- return 2
- @xp_capabilities(skip_backends=[("dask.array", "no rankdata"),
- ("cupy", "no rankdata")],
- jax_jit=False, cpu_only=True) # null distribution is CPU only
- @_rename_parameter("mode", "method")
- @_axis_nan_policy_factory(
- wilcoxon_result_object, paired=True,
- n_samples=lambda kwds: 2 if kwds.get('y', None) is not None else 1,
- result_to_tuple=wilcoxon_result_unpacker, n_outputs=wilcoxon_outputs,
- )
- def wilcoxon(x, y=None, zero_method="wilcox", correction=False,
- alternative="two-sided", method='auto', *, axis=0):
- """Calculate the Wilcoxon signed-rank test.
- The Wilcoxon signed-rank test tests the null hypothesis that two
- related paired samples come from the same distribution. In particular,
- it tests whether the distribution of the differences ``x - y`` is symmetric
- about zero. It is a non-parametric version of the paired T-test.
- Parameters
- ----------
- x : array_like
- Either the first set of measurements (in which case ``y`` is the second
- set of measurements), or the differences between two sets of
- measurements (in which case ``y`` is not to be specified.) Must be
- one-dimensional.
- y : array_like, optional
- Either the second set of measurements (if ``x`` is the first set of
- measurements), or not specified (if ``x`` is the differences between
- two sets of measurements.) Must be one-dimensional.
- .. warning::
- When `y` is provided, `wilcoxon` calculates the test statistic
- based on the ranks of the absolute values of ``d = x - y``.
- Roundoff error in the subtraction can result in elements of ``d``
- being assigned different ranks even when they would be tied with
- exact arithmetic. Rather than passing `x` and `y` separately,
- consider computing the difference ``x - y``, rounding as needed to
- ensure that only truly unique elements are numerically distinct,
- and passing the result as `x`, leaving `y` at the default (None).
- zero_method : {"wilcox", "pratt", "zsplit"}, optional
- There are different conventions for handling pairs of observations
- with equal values ("zero-differences", or "zeros").
- * "wilcox": Discards all zero-differences (default); see [4]_.
- * "pratt": Includes zero-differences in the ranking process,
- but drops the ranks of the zeros (more conservative); see [3]_.
- In this case, the normal approximation is adjusted as in [5]_.
- * "zsplit": Includes zero-differences in the ranking process and
- splits the zero rank between positive and negative ones.
- correction : bool, optional
- If True, apply continuity correction by adjusting the Wilcoxon rank
- statistic by 0.5 towards the mean value when computing the
- z-statistic if a normal approximation is used. Default is False.
- alternative : {"two-sided", "greater", "less"}, optional
- Defines the alternative hypothesis. Default is 'two-sided'.
- In the following, let ``d`` represent the difference between the paired
- samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or
- ``d = x`` otherwise.
- * 'two-sided': the distribution underlying ``d`` is not symmetric
- about zero.
- * 'less': the distribution underlying ``d`` is stochastically less
- than a distribution symmetric about zero.
- * 'greater': the distribution underlying ``d`` is stochastically
- greater than a distribution symmetric about zero.
- method : {"auto", "exact", "asymptotic"} or `PermutationMethod` instance, optional
- Method to calculate the p-value, see Notes. Default is "auto".
- axis : int or None, default: 0
- If an int, the axis of the input along which to compute the statistic.
- The statistic of each axis-slice (e.g. row) of the input will appear
- in a corresponding element of the output. If ``None``, the input will
- be raveled before computing the statistic.
- Returns
- -------
- An object with the following attributes.
- statistic : array_like
- If `alternative` is "two-sided", the sum of the ranks of the
- differences above or below zero, whichever is smaller.
- Otherwise the sum of the ranks of the differences above zero.
- pvalue : array_like
- The p-value for the test depending on `alternative` and `method`.
- zstatistic : array_like
- When ``method = 'asymptotic'``, this is the normalized z-statistic::
- z = (T - mn - d) / se
- where ``T`` is `statistic` as defined above, ``mn`` is the mean of the
- distribution under the null hypothesis, ``d`` is a continuity
- correction, and ``se`` is the standard error.
- When ``method != 'asymptotic'``, this attribute is not available.
- See Also
- --------
- kruskal, mannwhitneyu
- Notes
- -----
- In the following, let ``d`` represent the difference between the paired
- samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or ``d = x``
- otherwise. Assume that all elements of ``d`` are independent and
- identically distributed observations, and all are distinct and nonzero.
- - When ``len(d)`` is sufficiently large, the null distribution of the
- normalized test statistic (`zstatistic` above) is approximately normal,
- and ``method = 'asymptotic'`` can be used to compute the p-value.
- - When ``len(d)`` is small, the normal approximation may not be accurate,
- and ``method='exact'`` is preferred (at the cost of additional
- execution time).
- - The default, ``method='auto'``, selects between the two:
- ``method='exact'`` is used when ``len(d) <= 50``, and
- ``method='asymptotic'`` is used otherwise.
- The presence of "ties" (i.e. not all elements of ``d`` are unique) or
- "zeros" (i.e. elements of ``d`` are zero) changes the null distribution
- of the test statistic, and ``method='exact'`` no longer calculates
- the exact p-value. If ``method='asymptotic'``, the z-statistic is adjusted
- for more accurate comparison against the standard normal, but still,
- for finite sample sizes, the standard normal is only an approximation of
- the true null distribution of the z-statistic. For such situations, the
- `method` parameter also accepts instances of `PermutationMethod`. In this
- case, the p-value is computed using `permutation_test` with the provided
- configuration options and other appropriate settings.
- The presence of ties and zeros affects the resolution of ``method='auto'``
- accordingly: exhasutive permutations are performed when ``len(d) <= 13``,
- and the asymptotic method is used otherwise. Note that they asymptotic
- method may not be very accurate even for ``len(d) > 14``; the threshold
- was chosen as a compromise between execution time and accuracy under the
- constraint that the results must be deterministic. Consider providing an
- instance of `PermutationMethod` method manually, choosing the
- ``n_resamples`` parameter to balance time constraints and accuracy
- requirements.
- Please also note that in the edge case that all elements of ``d`` are zero,
- the p-value relying on the normal approximaton cannot be computed (NaN)
- if ``zero_method='wilcox'`` or ``zero_method='pratt'``.
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
- .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
- .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
- Rank Procedures, Journal of the American Statistical Association,
- Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
- .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
- Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`
- .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank
- Sampling Distribution When Zero Differences are Present,
- Journal of the American Statistical Association, Vol. 62, 1967,
- pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917`
- Examples
- --------
- In [4]_, the differences in height between cross- and self-fertilized
- corn plants is given as follows:
- >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]
- Cross-fertilized plants appear to be higher. To test the null
- hypothesis that there is no height difference, we can apply the
- two-sided test:
- >>> from scipy.stats import wilcoxon
- >>> res = wilcoxon(d)
- >>> res.statistic, res.pvalue
- (24.0, 0.041259765625)
- Hence, we would reject the null hypothesis at a confidence level of 5%,
- concluding that there is a difference in height between the groups.
- To confirm that the median of the differences can be assumed to be
- positive, we use:
- >>> res = wilcoxon(d, alternative='greater')
- >>> res.statistic, res.pvalue
- (96.0, 0.0206298828125)
- This shows that the null hypothesis that the median is negative can be
- rejected at a confidence level of 5% in favor of the alternative that
- the median is greater than zero. The p-values above are exact. Using the
- normal approximation gives very similar values:
- >>> res = wilcoxon(d, method='asymptotic')
- >>> res.statistic, res.pvalue
- (24.0, 0.04088813291185591)
- Note that the statistic changed to 96 in the one-sided case (the sum
- of ranks of positive differences) whereas it is 24 in the two-sided
- case (the minimum of sum of ranks above and below zero).
- In the example above, the differences in height between paired plants are
- provided to `wilcoxon` directly. Alternatively, `wilcoxon` accepts two
- samples of equal length, calculates the differences between paired
- elements, then performs the test. Consider the samples ``x`` and ``y``:
- >>> import numpy as np
- >>> x = np.array([0.5, 0.825, 0.375, 0.5])
- >>> y = np.array([0.525, 0.775, 0.325, 0.55])
- >>> res = wilcoxon(x, y, alternative='greater')
- >>> res
- WilcoxonResult(statistic=5.0, pvalue=0.5625)
- Note that had we calculated the differences by hand, the test would have
- produced different results:
- >>> d = [-0.025, 0.05, 0.05, -0.05]
- >>> ref = wilcoxon(d, alternative='greater')
- >>> ref
- WilcoxonResult(statistic=6.0, pvalue=0.5)
- The substantial difference is due to roundoff error in the results of
- ``x-y``:
- >>> d - (x-y)
- array([2.08166817e-17, 6.93889390e-17, 1.38777878e-17, 4.16333634e-17])
- Even though we expected all the elements of ``(x-y)[1:]`` to have the same
- magnitude ``0.05``, they have slightly different magnitudes in practice,
- and therefore are assigned different ranks in the test. Before performing
- the test, consider calculating ``d`` and adjusting it as necessary to
- ensure that theoretically identically values are not numerically distinct.
- For example:
- >>> d2 = np.around(x - y, decimals=3)
- >>> wilcoxon(d2, alternative='greater')
- WilcoxonResult(statistic=6.0, pvalue=0.5)
- """
- # replace approx by asymptotic to ensure backwards compatability
- if method == "approx":
- method = "asymptotic"
- return _wilcoxon._wilcoxon_nd(x, y, zero_method, correction, alternative,
- method, axis)
- MedianTestResult = _make_tuple_bunch(
- 'MedianTestResult',
- ['statistic', 'pvalue', 'median', 'table'], []
- )
- @xp_capabilities(np_only=True)
- def median_test(*samples, ties='below', correction=True, lambda_=1,
- nan_policy='propagate'):
- """Perform a Mood's median test.
- Test that two or more samples come from populations with the same median.
- Let ``n = len(samples)`` be the number of samples. The "grand median" of
- all the data is computed, and a contingency table is formed by
- classifying the values in each sample as being above or below the grand
- median. The contingency table, along with `correction` and `lambda_`,
- are passed to `scipy.stats.chi2_contingency` to compute the test statistic
- and p-value.
- Parameters
- ----------
- sample1, sample2, ... : array_like
- The set of samples. There must be at least two samples.
- Each sample must be a one-dimensional sequence containing at least
- one value. The samples are not required to have the same length.
- ties : str, optional
- Determines how values equal to the grand median are classified in
- the contingency table. The string must be one of::
- "below":
- Values equal to the grand median are counted as "below".
- "above":
- Values equal to the grand median are counted as "above".
- "ignore":
- Values equal to the grand median are not counted.
- The default is "below".
- correction : bool, optional
- If True, *and* there are just two samples, apply Yates' correction
- for continuity when computing the test statistic associated with
- the contingency table. Default is True.
- lambda_ : float or str, optional
- By default, the statistic computed in this test is Pearson's
- chi-squared statistic. `lambda_` allows a statistic from the
- Cressie-Read power divergence family to be used instead. See
- `power_divergence` for details.
- Default is 1 (Pearson's chi-squared statistic).
- nan_policy : {'propagate', 'raise', 'omit'}, optional
- Defines how to handle when input contains nan. 'propagate' returns nan,
- 'raise' throws an error, 'omit' performs the calculations ignoring nan
- values. Default is 'propagate'.
- Returns
- -------
- res : MedianTestResult
- An object containing attributes:
- statistic : float
- The test statistic. The statistic that is returned is determined
- by `lambda_`. The default is Pearson's chi-squared statistic.
- pvalue : float
- The p-value of the test.
- median : float
- The grand median.
- table : ndarray
- The contingency table. The shape of the table is (2, n), where
- n is the number of samples. The first row holds the counts of the
- values above the grand median, and the second row holds the counts
- of the values below the grand median. The table allows further
- analysis with, for example, `scipy.stats.chi2_contingency`, or with
- `scipy.stats.fisher_exact` if there are two samples, without having
- to recompute the table. If ``nan_policy`` is "propagate" and there
- are nans in the input, the return value for ``table`` is ``None``.
- See Also
- --------
- kruskal : Compute the Kruskal-Wallis H-test for independent samples.
- mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y.
- Notes
- -----
- .. versionadded:: 0.15.0
- References
- ----------
- .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill
- (1950), pp. 394-399.
- .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010).
- See Sections 8.12 and 10.15.
- Examples
- --------
- A biologist runs an experiment in which there are three groups of plants.
- Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants.
- Each plant produces a number of seeds. The seed counts for each group
- are::
- Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49
- Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99
- Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84
- The following code applies Mood's median test to these samples.
- >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49]
- >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99]
- >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84]
- >>> from scipy.stats import median_test
- >>> res = median_test(g1, g2, g3)
- The median is
- >>> res.median
- 34.0
- and the contingency table is
- >>> res.table
- array([[ 5, 10, 7],
- [11, 5, 10]])
- `p` is too large to conclude that the medians are not the same:
- >>> res.pvalue
- 0.12609082774093244
- The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
- `median_test`.
- >>> res = median_test(g1, g2, g3, lambda_="log-likelihood")
- >>> res.pvalue
- 0.12224779737117837
- The median occurs several times in the data, so we'll get a different
- result if, for example, ``ties="above"`` is used:
- >>> res = median_test(g1, g2, g3, ties="above")
- >>> res.pvalue
- 0.063873276069553273
- >>> res.table
- array([[ 5, 11, 9],
- [11, 4, 8]])
- This example demonstrates that if the data set is not large and there
- are values equal to the median, the p-value can be sensitive to the
- choice of `ties`.
- """
- if len(samples) < 2:
- raise ValueError('median_test requires two or more samples.')
- ties_options = ['below', 'above', 'ignore']
- if ties not in ties_options:
- raise ValueError(f"invalid 'ties' option '{ties}'; 'ties' must be one "
- f"of: {str(ties_options)[1:-1]}")
- data = [np.asarray(sample) for sample in samples]
- # Validate the sizes and shapes of the arguments.
- for k, d in enumerate(data):
- if d.size == 0:
- raise ValueError(f"Sample {k + 1} is empty. All samples must "
- f"contain at least one value.")
- if d.ndim != 1:
- raise ValueError(f"Sample {k + 1} has {d.ndim} dimensions. "
- f"All samples must be one-dimensional sequences.")
- cdata = np.concatenate(data)
- contains_nan = _contains_nan(cdata, nan_policy)
- if nan_policy == 'propagate' and contains_nan:
- return MedianTestResult(np.nan, np.nan, np.nan, None)
- if contains_nan:
- grand_median = np.median(cdata[~np.isnan(cdata)])
- else:
- grand_median = np.median(cdata)
- # When the minimum version of numpy supported by scipy is 1.9.0,
- # the above if/else statement can be replaced by the single line:
- # grand_median = np.nanmedian(cdata)
- # Create the contingency table.
- table = np.zeros((2, len(data)), dtype=np.int64)
- for k, sample in enumerate(data):
- sample = sample[~np.isnan(sample)]
- nabove = count_nonzero(sample > grand_median)
- nbelow = count_nonzero(sample < grand_median)
- nequal = sample.size - (nabove + nbelow)
- table[0, k] += nabove
- table[1, k] += nbelow
- if ties == "below":
- table[1, k] += nequal
- elif ties == "above":
- table[0, k] += nequal
- # Check that no row or column of the table is all zero.
- # Such a table can not be given to chi2_contingency, because it would have
- # a zero in the table of expected frequencies.
- rowsums = table.sum(axis=1)
- if rowsums[0] == 0:
- raise ValueError(f"All values are below the grand median ({grand_median}).")
- if rowsums[1] == 0:
- raise ValueError(f"All values are above the grand median ({grand_median}).")
- if ties == "ignore":
- # We already checked that each sample has at least one value, but it
- # is possible that all those values equal the grand median. If `ties`
- # is "ignore", that would result in a column of zeros in `table`. We
- # check for that case here.
- zero_cols = np.nonzero((table == 0).all(axis=0))[0]
- if len(zero_cols) > 0:
- raise ValueError(
- f"All values in sample {zero_cols[0] + 1} are equal to the grand "
- f"median ({grand_median!r}), so they are ignored, resulting in an "
- f"empty sample."
- )
- stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
- correction=correction)
- return MedianTestResult(stat, p, grand_median, table)
- def _circfuncs_common(samples, period, xp=None):
- xp = array_namespace(samples) if xp is None else xp
- samples = xp_promote(samples, force_floating=True, xp=xp)
- # Recast samples as radians that range between 0 and 2 pi and calculate
- # the sine and cosine
- scaled_samples = samples * ((2.0 * pi) / period)
- sin_samp = xp.sin(scaled_samples)
- cos_samp = xp.cos(scaled_samples)
- return samples, sin_samp, cos_samp
- @xp_capabilities()
- @_axis_nan_policy_factory(
- lambda x: x, n_outputs=1, default_axis=None,
- result_to_tuple=lambda x, _: (x,)
- )
- def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
- r"""Compute the circular mean of a sample of angle observations.
- Given :math:`n` angle observations :math:`x_1, \cdots, x_n` measured in
- radians, their *circular mean* is defined by ([1]_, Eq. 2.2.4)
- .. math::
- \mathrm{Arg} \left( \frac{1}{n} \sum_{k=1}^n e^{i x_k} \right)
- where :math:`i` is the imaginary unit and :math:`\mathop{\mathrm{Arg}} z`
- gives the principal value of the argument of complex number :math:`z`,
- restricted to the range :math:`[0,2\pi]` by default. :math:`z` in the
- above expression is known as the `mean resultant vector`.
- Parameters
- ----------
- samples : array_like
- Input array of angle observations. The value of a full angle is
- equal to ``(high - low)``.
- high : float, optional
- Upper boundary of the principal value of an angle. Default is ``2*pi``.
- low : float, optional
- Lower boundary of the principal value of an angle. Default is ``0``.
- Returns
- -------
- circmean : float
- Circular mean, restricted to the range ``[low, high]``.
- If the mean resultant vector is zero, an input-dependent,
- implementation-defined number between ``[low, high]`` is returned.
- If the input array is empty, ``np.nan`` is returned.
- See Also
- --------
- circstd : Circular standard deviation.
- circvar : Circular variance.
- References
- ----------
- .. [1] Mardia, K. V. and Jupp, P. E. *Directional Statistics*.
- John Wiley & Sons, 1999.
- Examples
- --------
- For readability, all angles are printed out in degrees.
- >>> import numpy as np
- >>> from scipy.stats import circmean
- >>> import matplotlib.pyplot as plt
- >>> angles = np.deg2rad(np.array([20, 30, 330]))
- >>> circmean = circmean(angles)
- >>> np.rad2deg(circmean)
- 7.294976657784009
- >>> mean = angles.mean()
- >>> np.rad2deg(mean)
- 126.66666666666666
- Plot and compare the circular mean against the arithmetic mean.
- >>> plt.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
- ... np.sin(np.linspace(0, 2*np.pi, 500)),
- ... c='k')
- >>> plt.scatter(np.cos(angles), np.sin(angles), c='k')
- >>> plt.scatter(np.cos(circmean), np.sin(circmean), c='b',
- ... label='circmean')
- >>> plt.scatter(np.cos(mean), np.sin(mean), c='r', label='mean')
- >>> plt.legend()
- >>> plt.axis('equal')
- >>> plt.show()
- """
- xp = array_namespace(samples)
- # Needed for non-NumPy arrays to get appropriate NaN result
- # Apparently atan2(0, 0) is 0, even though it is mathematically undefined
- if xp_size(samples) == 0:
- return xp.mean(samples, axis=axis)
- period = high - low
- samples, sin_samp, cos_samp = _circfuncs_common(samples, period, xp=xp)
- sin_sum = xp.sum(sin_samp, axis=axis)
- cos_sum = xp.sum(cos_samp, axis=axis)
- res = xp.atan2(sin_sum, cos_sum)
- res = res[()] if res.ndim == 0 else res
- return (res * (period / (2.0 * pi)) - low) % period + low
- @xp_capabilities()
- @_axis_nan_policy_factory(
- lambda x: x, n_outputs=1, default_axis=None,
- result_to_tuple=lambda x, _: (x,)
- )
- def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
- r"""Compute the circular variance of a sample of angle observations.
- Given :math:`n` angle observations :math:`x_1, \cdots, x_n` measured in
- radians, their *circular variance* is defined by ([2]_, Eq. 2.3.3)
- .. math::
- 1 - \left| \frac{1}{n} \sum_{k=1}^n e^{i x_k} \right|
- where :math:`i` is the imaginary unit and :math:`|z|` gives the length
- of the complex number :math:`z`. :math:`|z|` in the above expression
- is known as the `mean resultant length`.
- Parameters
- ----------
- samples : array_like
- Input array of angle observations. The value of a full angle is
- equal to ``(high - low)``.
- high : float, optional
- Upper boundary of the principal value of an angle. Default is ``2*pi``.
- low : float, optional
- Lower boundary of the principal value of an angle. Default is ``0``.
- Returns
- -------
- circvar : float
- Circular variance. The returned value is in the range ``[0, 1]``,
- where ``0`` indicates no variance and ``1`` indicates large variance.
- If the input array is empty, ``np.nan`` is returned.
- See Also
- --------
- circmean : Circular mean.
- circstd : Circular standard deviation.
- Notes
- -----
- In the limit of small angles, the circular variance is close to
- half the 'linear' variance if measured in radians.
- References
- ----------
- .. [1] Fisher, N.I. *Statistical analysis of circular data*. Cambridge
- University Press, 1993.
- .. [2] Mardia, K. V. and Jupp, P. E. *Directional Statistics*.
- John Wiley & Sons, 1999.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import circvar
- >>> import matplotlib.pyplot as plt
- >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
- ... 0.133, -0.473, -0.001, -0.348, 0.131])
- >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
- ... 0.104, -0.136, -0.867, 0.012, 0.105])
- >>> circvar_1 = circvar(samples_1)
- >>> circvar_2 = circvar(samples_2)
- Plot the samples.
- >>> fig, (left, right) = plt.subplots(ncols=2)
- >>> for image in (left, right):
- ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
- ... np.sin(np.linspace(0, 2*np.pi, 500)),
- ... c='k')
- ... image.axis('equal')
- ... image.axis('off')
- >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
- >>> left.set_title(f"circular variance: {np.round(circvar_1, 2)!r}")
- >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
- >>> right.set_title(f"circular variance: {np.round(circvar_2, 2)!r}")
- >>> plt.show()
- """
- xp = array_namespace(samples)
- period = high - low
- samples, sin_samp, cos_samp = _circfuncs_common(samples, period, xp=xp)
- sin_mean = xp.mean(sin_samp, axis=axis)
- cos_mean = xp.mean(cos_samp, axis=axis)
- hypotenuse = (sin_mean**2. + cos_mean**2.)**0.5
- # hypotenuse can go slightly above 1 due to rounding errors
- R = xp.clip(hypotenuse, max=1.)
- res = 1. - R
- return res
- @xp_capabilities()
- @_axis_nan_policy_factory(
- lambda x: x, n_outputs=1, default_axis=None,
- result_to_tuple=lambda x, _: (x,)
- )
- def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate', *,
- normalize=False):
- r"""
- Compute the circular standard deviation of a sample of angle observations.
- Given :math:`n` angle observations :math:`x_1, \cdots, x_n` measured in
- radians, their `circular standard deviation` is defined by
- ([2]_, Eq. 2.3.11)
- .. math::
- \sqrt{ -2 \log \left| \frac{1}{n} \sum_{k=1}^n e^{i x_k} \right| }
- where :math:`i` is the imaginary unit and :math:`|z|` gives the length
- of the complex number :math:`z`. :math:`|z|` in the above expression
- is known as the `mean resultant length`.
- Parameters
- ----------
- samples : array_like
- Input array of angle observations. The value of a full angle is
- equal to ``(high - low)``.
- high : float, optional
- Upper boundary of the principal value of an angle. Default is ``2*pi``.
- low : float, optional
- Lower boundary of the principal value of an angle. Default is ``0``.
- normalize : boolean, optional
- If ``False`` (the default), the return value is computed from the
- above formula with the input scaled by ``(2*pi)/(high-low)`` and
- the output scaled (back) by ``(high-low)/(2*pi)``. If ``True``,
- the output is not scaled and is returned directly.
- Returns
- -------
- circstd : float
- Circular standard deviation, optionally normalized.
- If the input array is empty, ``np.nan`` is returned.
- See Also
- --------
- circmean : Circular mean.
- circvar : Circular variance.
- Notes
- -----
- In the limit of small angles, the circular standard deviation is close
- to the 'linear' standard deviation if ``normalize`` is ``False``.
- References
- ----------
- .. [1] Mardia, K. V. (1972). 2. In *Statistics of Directional Data*
- (pp. 18-24). Academic Press. :doi:`10.1016/C2013-0-07425-7`.
- .. [2] Mardia, K. V. and Jupp, P. E. *Directional Statistics*.
- John Wiley & Sons, 1999.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import circstd
- >>> import matplotlib.pyplot as plt
- >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
- ... 0.133, -0.473, -0.001, -0.348, 0.131])
- >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
- ... 0.104, -0.136, -0.867, 0.012, 0.105])
- >>> circstd_1 = circstd(samples_1)
- >>> circstd_2 = circstd(samples_2)
- Plot the samples.
- >>> fig, (left, right) = plt.subplots(ncols=2)
- >>> for image in (left, right):
- ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
- ... np.sin(np.linspace(0, 2*np.pi, 500)),
- ... c='k')
- ... image.axis('equal')
- ... image.axis('off')
- >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
- >>> left.set_title(f"circular std: {np.round(circstd_1, 2)!r}")
- >>> right.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
- ... np.sin(np.linspace(0, 2*np.pi, 500)),
- ... c='k')
- >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
- >>> right.set_title(f"circular std: {np.round(circstd_2, 2)!r}")
- >>> plt.show()
- """
- xp = array_namespace(samples)
- period = high - low
- samples, sin_samp, cos_samp = _circfuncs_common(samples, period, xp=xp)
- sin_mean = xp.mean(sin_samp, axis=axis) # [1] (2.2.3)
- cos_mean = xp.mean(cos_samp, axis=axis) # [1] (2.2.3)
- hypotenuse = (sin_mean**2. + cos_mean**2.)**0.5
- # hypotenuse can go slightly above 1 due to rounding errors
- R = xp.clip(hypotenuse, max=1.) # [1] (2.2.4)
- res = (-2*xp.log(R))**0.5+0.0 # torch.pow returns -0.0 if R==1
- if not normalize:
- res *= (high-low)/(2.*pi) # [1] (2.3.14) w/ (2.3.7)
- return res
- class DirectionalStats:
- def __init__(self, mean_direction, mean_resultant_length):
- self.mean_direction = mean_direction
- self.mean_resultant_length = mean_resultant_length
- def __repr__(self):
- return (f"DirectionalStats(mean_direction={self.mean_direction},"
- f" mean_resultant_length={self.mean_resultant_length})")
- @xp_capabilities()
- def directional_stats(samples, *, axis=0, normalize=True):
- """
- Computes sample statistics for directional data.
- Computes the directional mean (also called the mean direction vector) and
- mean resultant length of a sample of vectors.
- The directional mean is a measure of "preferred direction" of vector data.
- It is analogous to the sample mean, but it is for use when the length of
- the data is irrelevant (e.g. unit vectors).
- The mean resultant length is a value between 0 and 1 used to quantify the
- dispersion of directional data: the smaller the mean resultant length, the
- greater the dispersion. Several definitions of directional variance
- involving the mean resultant length are given in [1]_ and [2]_.
- Parameters
- ----------
- samples : array_like
- Input array. Must be at least two-dimensional, and the last axis of the
- input must correspond with the dimensionality of the vector space.
- When the input is exactly two dimensional, this means that each row
- of the data is a vector observation.
- axis : int, default: 0
- Axis along which the directional mean is computed.
- normalize: boolean, default: True
- If True, normalize the input to ensure that each observation is a
- unit vector. It the observations are already unit vectors, consider
- setting this to False to avoid unnecessary computation.
- Returns
- -------
- res : DirectionalStats
- An object containing attributes:
- mean_direction : ndarray
- Directional mean.
- mean_resultant_length : ndarray
- The mean resultant length [1]_.
- See Also
- --------
- circmean: circular mean; i.e. directional mean for 2D *angles*
- circvar: circular variance; i.e. directional variance for 2D *angles*
- Notes
- -----
- This uses a definition of directional mean from [1]_.
- Assuming the observations are unit vectors, the calculation is as follows.
- .. code-block:: python
- mean = samples.mean(axis=0)
- mean_resultant_length = np.linalg.norm(mean)
- mean_direction = mean / mean_resultant_length
- This definition is appropriate for *directional* data (i.e. vector data
- for which the magnitude of each observation is irrelevant) but not
- for *axial* data (i.e. vector data for which the magnitude and *sign* of
- each observation is irrelevant).
- Several definitions of directional variance involving the mean resultant
- length ``R`` have been proposed, including ``1 - R`` [1]_, ``1 - R**2``
- [2]_, and ``2 * (1 - R)`` [2]_. Rather than choosing one, this function
- returns ``R`` as attribute `mean_resultant_length` so the user can compute
- their preferred measure of dispersion.
- References
- ----------
- .. [1] Mardia, Jupp. (2000). *Directional Statistics*
- (p. 163). Wiley.
- .. [2] https://en.wikipedia.org/wiki/Directional_statistics
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import directional_stats
- >>> data = np.array([[3, 4], # first observation, 2D vector space
- ... [6, -8]]) # second observation
- >>> dirstats = directional_stats(data)
- >>> dirstats.mean_direction
- array([1., 0.])
- In contrast, the regular sample mean of the vectors would be influenced
- by the magnitude of each observation. Furthermore, the result would not be
- a unit vector.
- >>> data.mean(axis=0)
- array([4.5, -2.])
- An exemplary use case for `directional_stats` is to find a *meaningful*
- center for a set of observations on a sphere, e.g. geographical locations.
- >>> data = np.array([[0.8660254, 0.5, 0.],
- ... [0.8660254, -0.5, 0.]])
- >>> dirstats = directional_stats(data)
- >>> dirstats.mean_direction
- array([1., 0., 0.])
- The regular sample mean on the other hand yields a result which does not
- lie on the surface of the sphere.
- >>> data.mean(axis=0)
- array([0.8660254, 0., 0.])
- The function also returns the mean resultant length, which
- can be used to calculate a directional variance. For example, using the
- definition ``Var(z) = 1 - R`` from [2]_ where ``R`` is the
- mean resultant length, we can calculate the directional variance of the
- vectors in the above example as:
- >>> 1 - dirstats.mean_resultant_length
- 0.13397459716167093
- """
- xp = array_namespace(samples)
- samples = xp.asarray(samples)
- if samples.ndim < 2:
- raise ValueError("samples must at least be two-dimensional. "
- f"Instead samples has shape: {tuple(samples.shape)}")
- samples = xp.moveaxis(samples, axis, 0)
- if is_marray(xp):
- _xp = array_namespace(samples.mask)
- mask = _xp.any(samples.mask, axis=-1, keepdims=True)
- samples = xp.asarray(samples.data, mask=mask)
- if normalize:
- vectornorms = xp_vector_norm(samples, axis=-1, keepdims=True, xp=xp)
- samples = samples/vectornorms
- mean = xp.mean(samples, axis=0)
- mean_resultant_length = xp_vector_norm(mean, axis=-1, keepdims=True, xp=xp)
- mean_direction = mean / mean_resultant_length
- mrl = xp.squeeze(mean_resultant_length, axis=-1)
- mean_resultant_length = mrl[()] if mrl.ndim == 0 else mrl
- return DirectionalStats(mean_direction, mean_resultant_length)
- @xp_capabilities(skip_backends=[('dask.array', "no take_along_axis")], jax_jit=False)
- def false_discovery_control(ps, *, axis=0, method='bh'):
- """Adjust p-values to control the false discovery rate.
- The false discovery rate (FDR) is the expected proportion of rejected null
- hypotheses that are actually true.
- If the null hypothesis is rejected when the *adjusted* p-value falls below
- a specified level, the false discovery rate is controlled at that level.
- Parameters
- ----------
- ps : 1D array_like
- The p-values to adjust. Elements must be real numbers between 0 and 1.
- axis : int
- The axis along which to perform the adjustment. The adjustment is
- performed independently along each axis-slice. If `axis` is None, `ps`
- is raveled before performing the adjustment.
- method : {'bh', 'by'}
- The false discovery rate control procedure to apply: ``'bh'`` is for
- Benjamini-Hochberg [1]_ (Eq. 1), ``'by'`` is for Benjaminini-Yekutieli
- [2]_ (Theorem 1.3). The latter is more conservative, but it is
- guaranteed to control the FDR even when the p-values are not from
- independent tests.
- Returns
- -------
- ps_adusted : array_like
- The adjusted p-values. If the null hypothesis is rejected where these
- fall below a specified level, the false discovery rate is controlled
- at that level.
- See Also
- --------
- combine_pvalues
- statsmodels.stats.multitest.multipletests
- Notes
- -----
- In multiple hypothesis testing, false discovery control procedures tend to
- offer higher power than familywise error rate control procedures (e.g.
- Bonferroni correction [1]_).
- If the p-values correspond with independent tests (or tests with
- "positive regression dependencies" [2]_), rejecting null hypotheses
- corresponding with Benjamini-Hochberg-adjusted p-values below :math:`q`
- controls the false discovery rate at a level less than or equal to
- :math:`q m_0 / m`, where :math:`m_0` is the number of true null hypotheses
- and :math:`m` is the total number of null hypotheses tested. The same is
- true even for dependent tests when the p-values are adjusted accorded to
- the more conservative Benjaminini-Yekutieli procedure.
- The adjusted p-values produced by this function are comparable to those
- produced by the R function ``p.adjust`` and the statsmodels function
- `statsmodels.stats.multitest.multipletests`. Please consider the latter
- for more advanced methods of multiple comparison correction.
- References
- ----------
- .. [1] Benjamini, Yoav, and Yosef Hochberg. "Controlling the false
- discovery rate: a practical and powerful approach to multiple
- testing." Journal of the Royal statistical society: series B
- (Methodological) 57.1 (1995): 289-300.
- .. [2] Benjamini, Yoav, and Daniel Yekutieli. "The control of the false
- discovery rate in multiple testing under dependency." Annals of
- statistics (2001): 1165-1188.
- .. [3] TileStats. FDR - Benjamini-Hochberg explained - Youtube.
- https://www.youtube.com/watch?v=rZKa4tW2NKs.
- .. [4] Neuhaus, Karl-Ludwig, et al. "Improved thrombolysis in acute
- myocardial infarction with front-loaded administration of alteplase:
- results of the rt-PA-APSAC patency study (TAPS)." Journal of the
- American College of Cardiology 19.5 (1992): 885-891.
- Examples
- --------
- We follow the example from [1]_.
- Thrombolysis with recombinant tissue-type plasminogen activator (rt-PA)
- and anisoylated plasminogen streptokinase activator (APSAC) in
- myocardial infarction has been proved to reduce mortality. [4]_
- investigated the effects of a new front-loaded administration of rt-PA
- versus those obtained with a standard regimen of APSAC, in a randomized
- multicentre trial in 421 patients with acute myocardial infarction.
- There were four families of hypotheses tested in the study, the last of
- which was "cardiac and other events after the start of thrombolitic
- treatment". FDR control may be desired in this family of hypotheses
- because it would not be appropriate to conclude that the front-loaded
- treatment is better if it is merely equivalent to the previous treatment.
- The p-values corresponding with the 15 hypotheses in this family were
- >>> ps = [0.0001, 0.0004, 0.0019, 0.0095, 0.0201, 0.0278, 0.0298, 0.0344,
- ... 0.0459, 0.3240, 0.4262, 0.5719, 0.6528, 0.7590, 1.000]
- If the chosen significance level is 0.05, we may be tempted to reject the
- null hypotheses for the tests corresponding with the first nine p-values,
- as the first nine p-values fall below the chosen significance level.
- However, this would ignore the problem of "multiplicity": if we fail to
- correct for the fact that multiple comparisons are being performed, we
- are more likely to incorrectly reject true null hypotheses.
- One approach to the multiplicity problem is to control the family-wise
- error rate (FWER), that is, the rate at which the null hypothesis is
- rejected when it is actually true. A common procedure of this kind is the
- Bonferroni correction [1]_. We begin by multiplying the p-values by the
- number of hypotheses tested.
- >>> import numpy as np
- >>> np.array(ps) * len(ps)
- array([1.5000e-03, 6.0000e-03, 2.8500e-02, 1.4250e-01, 3.0150e-01,
- 4.1700e-01, 4.4700e-01, 5.1600e-01, 6.8850e-01, 4.8600e+00,
- 6.3930e+00, 8.5785e+00, 9.7920e+00, 1.1385e+01, 1.5000e+01])
- To control the FWER at 5%, we reject only the hypotheses corresponding
- with adjusted p-values less than 0.05. In this case, only the hypotheses
- corresponding with the first three p-values can be rejected. According to
- [1]_, these three hypotheses concerned "allergic reaction" and "two
- different aspects of bleeding."
- An alternative approach is to control the false discovery rate: the
- expected fraction of rejected null hypotheses that are actually true. The
- advantage of this approach is that it typically affords greater power: an
- increased rate of rejecting the null hypothesis when it is indeed false. To
- control the false discovery rate at 5%, we apply the Benjamini-Hochberg
- p-value adjustment.
- >>> from scipy import stats
- >>> stats.false_discovery_control(ps)
- array([0.0015 , 0.003 , 0.0095 , 0.035625 , 0.0603 ,
- 0.06385714, 0.06385714, 0.0645 , 0.0765 , 0.486 ,
- 0.58118182, 0.714875 , 0.75323077, 0.81321429, 1. ])
- Now, the first *four* adjusted p-values fall below 0.05, so we would reject
- the null hypotheses corresponding with these *four* p-values. Rejection
- of the fourth null hypothesis was particularly important to the original
- study as it led to the conclusion that the new treatment had a
- "substantially lower in-hospital mortality rate."
- For simplicity of exposition, the p-values in the example above were given in
- sorted order, but this is not required; `false_discovery_control` returns
- adjusted p-values in order corresponding with the input `ps`.
- >>> stats.false_discovery_control([0.5, 0.6, 0.1, 0.001])
- array([0.6 , 0.6 , 0.2 , 0.004])
- """
- xp = array_namespace(ps)
- # Input Validation and Special Cases
- ps = xp.asarray(ps)
- ps_in_range = (xp.isdtype(ps.dtype, ("integral", "real floating"))
- and xp.all(ps == xp.clip(ps, 0., 1.)))
- if not ps_in_range:
- raise ValueError("`ps` must include only numbers between 0 and 1.")
- methods = {'bh', 'by'}
- if method.lower() not in methods:
- raise ValueError(f"Unrecognized `method` '{method}'."
- f"Method must be one of {methods}.")
- method = method.lower()
- if axis is None:
- axis = 0
- ps = xp_ravel(ps)
- axis = np.asarray(axis)[()] # use of NumPy for input validation is OK
- if not np.issubdtype(axis.dtype, np.integer) or axis.size != 1:
- raise ValueError("`axis` must be an integer or `None`")
- axis = int(axis)
- if xp_size(ps) <= 1 or ps.shape[axis] <= 1:
- return ps[()] if ps.ndim == 0 else ps
- ps = xp.moveaxis(ps, axis, -1)
- m = ps.shape[-1]
- # Main Algorithm
- # Equivalent to the ideas of [1] and [2], except that this adjusts the
- # p-values as described in [3]. The results are similar to those produced
- # by R's p.adjust.
- # "Let [ps] be the ordered observed p-values..."
- order = xp.argsort(ps, axis=-1)
- ps = xp.take_along_axis(ps, order, axis=-1) # this copies ps
- # Equation 1 of [1] rearranged to reject when p is less than specified q
- i = xp.arange(1, m+1, dtype=ps.dtype, device=xp_device(ps))
- # ps *= m / i
- ps = xpx.at(ps)[...].multiply(m / i)
- # Theorem 1.3 of [2]
- if method == 'by':
- # ps *= np.sum(1 / i)
- ps = xpx.at(ps)[...].multiply(xp.sum(1 / i))
- # accounts for rejecting all null hypotheses i for i < k, where k is
- # defined in Eq. 1 of either [1] or [2]. See [3]. Starting with the index j
- # of the second to last element, we replace element j with element j+1 if
- # the latter is smaller.
- if is_numpy(xp):
- np.minimum.accumulate(ps[..., ::-1], out=ps[..., ::-1], axis=-1)
- else:
- n = ps.shape[-1]
- for j in range(n-2, -1, -1):
- # ps[..., j] = xp.minimum(ps[..., j], ps[..., j+1])
- ps = xpx.at(ps)[..., j].set(xp.minimum(ps[..., j], ps[..., j+1]))
- # Restore original order of axes and data
- ps = _reorder_along_axis(ps, order, axis=-1, xp=xp)
- ps = xp.moveaxis(ps, -1, axis)
- return xp.clip(ps, 0., 1.)
- def _reorder_along_axis(x, i, *, axis, xp):
- if is_jax(xp):
- return xp.put_along_axis(x, i, values=x, axis=axis, inplace=False)
- if hasattr(xp, 'put_along_axis'):
- xp.put_along_axis(x, i, values=x.copy(), axis=axis)
- return x
- else:
- return xp.take_along_axis(x, xp.argsort(i, axis=-1), axis=-1)
|