_mstats_basic.py 120 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657
  1. """
  2. An extension of scipy.stats._stats_py to support masked arrays
  3. """
  4. # Original author (2007): Pierre GF Gerard-Marchant
  5. __all__ = ['argstoarray',
  6. 'count_tied_groups',
  7. 'describe',
  8. 'f_oneway', 'find_repeats','friedmanchisquare',
  9. 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
  10. 'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
  11. 'ks_1samp', 'kstest',
  12. 'linregress',
  13. 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
  14. 'normaltest',
  15. 'obrientransform',
  16. 'pearsonr','plotting_positions','pointbiserialr',
  17. 'rankdata',
  18. 'scoreatpercentile','sem',
  19. 'sen_seasonal_slopes','skew','skewtest','spearmanr',
  20. 'siegelslopes', 'theilslopes',
  21. 'tmax','tmean','tmin','trim','trimboth',
  22. 'trimtail','trima','trimr','trimmed_mean','trimmed_std',
  23. 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
  24. 'ttest_ind','ttest_rel','tvar',
  25. 'variation',
  26. 'winsorize',
  27. 'brunnermunzel',
  28. ]
  29. import numpy as np
  30. from numpy import ndarray
  31. import numpy.ma as ma
  32. from numpy.ma import masked, nomask
  33. import math
  34. import itertools
  35. import warnings
  36. from collections import namedtuple
  37. from . import distributions
  38. from scipy._lib._util import _rename_parameter, _contains_nan, _dedent_for_py313
  39. from scipy._lib._bunch import _make_tuple_bunch
  40. import scipy.special as special
  41. import scipy.stats._stats_py
  42. import scipy.stats._stats_py as _stats_py
  43. from ._stats_mstats_common import (
  44. _find_repeats,
  45. theilslopes as stats_theilslopes,
  46. siegelslopes as stats_siegelslopes
  47. )
  48. def _chk_asarray(a, axis):
  49. # Always returns a masked array, raveled for axis=None
  50. a = ma.asanyarray(a)
  51. if axis is None:
  52. a = ma.ravel(a)
  53. outaxis = 0
  54. else:
  55. outaxis = axis
  56. return a, outaxis
  57. def _chk2_asarray(a, b, axis):
  58. a = ma.asanyarray(a)
  59. b = ma.asanyarray(b)
  60. if axis is None:
  61. a = ma.ravel(a)
  62. b = ma.ravel(b)
  63. outaxis = 0
  64. else:
  65. outaxis = axis
  66. return a, b, outaxis
  67. def _chk_size(a, b):
  68. a = ma.asanyarray(a)
  69. b = ma.asanyarray(b)
  70. (na, nb) = (a.size, b.size)
  71. if na != nb:
  72. raise ValueError("The size of the input array should match!"
  73. f" ({na} <> {nb})")
  74. return (a, b, na)
  75. def _ttest_finish(df, t, alternative):
  76. """Common code between all 3 t-test functions."""
  77. # We use ``stdtr`` directly here to preserve masked arrays
  78. if alternative == 'less':
  79. pval = special._ufuncs.stdtr(df, t)
  80. elif alternative == 'greater':
  81. pval = special._ufuncs.stdtr(df, -t)
  82. elif alternative == 'two-sided':
  83. pval = special._ufuncs.stdtr(df, -np.abs(t))*2
  84. else:
  85. raise ValueError("alternative must be "
  86. "'less', 'greater' or 'two-sided'")
  87. if t.ndim == 0:
  88. t = t[()]
  89. if pval.ndim == 0:
  90. pval = pval[()]
  91. return t, pval
  92. def argstoarray(*args):
  93. """
  94. Constructs a 2D array from a group of sequences.
  95. Sequences are filled with missing values to match the length of the longest
  96. sequence.
  97. Parameters
  98. ----------
  99. *args : sequences
  100. Group of sequences.
  101. Returns
  102. -------
  103. argstoarray : MaskedArray
  104. A ( `m` x `n` ) masked array, where `m` is the number of arguments and
  105. `n` the length of the longest argument.
  106. Notes
  107. -----
  108. `numpy.ma.vstack` has identical behavior, but is called with a sequence
  109. of sequences.
  110. Examples
  111. --------
  112. A 2D masked array constructed from a group of sequences is returned.
  113. >>> from scipy.stats.mstats import argstoarray
  114. >>> argstoarray([1, 2, 3], [4, 5, 6])
  115. masked_array(
  116. data=[[1.0, 2.0, 3.0],
  117. [4.0, 5.0, 6.0]],
  118. mask=[[False, False, False],
  119. [False, False, False]],
  120. fill_value=1e+20)
  121. The returned masked array filled with missing values when the lengths of
  122. sequences are different.
  123. >>> argstoarray([1, 3], [4, 5, 6])
  124. masked_array(
  125. data=[[1.0, 3.0, --],
  126. [4.0, 5.0, 6.0]],
  127. mask=[[False, False, True],
  128. [False, False, False]],
  129. fill_value=1e+20)
  130. """
  131. if len(args) == 1 and not isinstance(args[0], ndarray):
  132. output = ma.asarray(args[0])
  133. if output.ndim != 2:
  134. raise ValueError("The input should be 2D")
  135. else:
  136. n = len(args)
  137. m = max([len(k) for k in args])
  138. output = ma.array(np.empty((n,m), dtype=float), mask=True)
  139. for (k,v) in enumerate(args):
  140. output[k,:len(v)] = v
  141. output[np.logical_not(np.isfinite(output._data))] = masked
  142. return output
  143. def find_repeats(arr):
  144. """Find repeats in arr and return a tuple (repeats, repeat_count).
  145. The input is cast to float64. Masked values are discarded.
  146. Parameters
  147. ----------
  148. arr : sequence
  149. Input array. The array is flattened if it is not 1D.
  150. Returns
  151. -------
  152. repeats : ndarray
  153. Array of repeated values.
  154. counts : ndarray
  155. Array of counts.
  156. Examples
  157. --------
  158. >>> from scipy.stats import mstats
  159. >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
  160. (array([2.]), array([4]))
  161. In the above example, 2 repeats 4 times.
  162. >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
  163. (array([4., 5.]), array([2, 2]))
  164. In the above example, both 4 and 5 repeat 2 times.
  165. """
  166. # Make sure we get a copy. ma.compressed promises a "new array", but can
  167. # actually return a reference.
  168. compr = np.asarray(ma.compressed(arr), dtype=np.float64)
  169. try:
  170. need_copy = np.may_share_memory(compr, arr)
  171. except AttributeError:
  172. # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
  173. # while in numpy 1.8.2 and above it just (correctly) returns False.
  174. need_copy = False
  175. if need_copy:
  176. compr = compr.copy()
  177. return _find_repeats(compr)
  178. def count_tied_groups(x, use_missing=False):
  179. """
  180. Counts the number of tied values.
  181. Parameters
  182. ----------
  183. x : sequence
  184. Sequence of data on which to counts the ties
  185. use_missing : bool, optional
  186. Whether to consider missing values as tied.
  187. Returns
  188. -------
  189. count_tied_groups : dict
  190. Returns a dictionary (nb of ties: nb of groups).
  191. Examples
  192. --------
  193. >>> from scipy.stats import mstats
  194. >>> import numpy as np
  195. >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
  196. >>> mstats.count_tied_groups(z)
  197. {2: 1, 3: 2}
  198. In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).
  199. >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
  200. >>> mstats.count_tied_groups(z)
  201. {2: 2, 3: 1}
  202. >>> z[[1,-1]] = np.ma.masked
  203. >>> mstats.count_tied_groups(z, use_missing=True)
  204. {2: 2, 3: 1}
  205. """
  206. nmasked = ma.getmask(x).sum()
  207. # We need the copy as find_repeats will overwrite the initial data
  208. data = ma.compressed(x).copy()
  209. (ties, counts) = find_repeats(data)
  210. nties = {}
  211. if len(ties):
  212. nties = dict(zip(np.unique(counts), itertools.repeat(1)))
  213. nties.update(dict(zip(*find_repeats(counts))))
  214. if nmasked and use_missing:
  215. try:
  216. nties[nmasked] += 1
  217. except KeyError:
  218. nties[nmasked] = 1
  219. return nties
  220. def rankdata(data, axis=None, use_missing=False):
  221. """Returns the rank (also known as order statistics) of each data point
  222. along the given axis.
  223. If some values are tied, their rank is averaged.
  224. If some values are masked, their rank is set to 0 if use_missing is False,
  225. or set to the average rank of the unmasked values if use_missing is True.
  226. Parameters
  227. ----------
  228. data : sequence
  229. Input data. The data is transformed to a masked array
  230. axis : {None,int}, optional
  231. Axis along which to perform the ranking.
  232. If None, the array is first flattened. An exception is raised if
  233. the axis is specified for arrays with a dimension larger than 2
  234. use_missing : bool, optional
  235. Whether the masked values have a rank of 0 (False) or equal to the
  236. average rank of the unmasked values (True).
  237. """
  238. def _rank1d(data, use_missing=False):
  239. n = data.count()
  240. rk = np.empty(data.size, dtype=float)
  241. idx = data.argsort()
  242. rk[idx[:n]] = np.arange(1,n+1)
  243. if use_missing:
  244. rk[idx[n:]] = (n+1)/2.
  245. else:
  246. rk[idx[n:]] = 0
  247. repeats = find_repeats(data.copy())
  248. for r in repeats[0]:
  249. condition = (data == r).filled(False)
  250. rk[condition] = rk[condition].mean()
  251. return rk
  252. data = ma.array(data, copy=False)
  253. if axis is None:
  254. if data.ndim > 1:
  255. return _rank1d(data.ravel(), use_missing).reshape(data.shape)
  256. else:
  257. return _rank1d(data, use_missing)
  258. else:
  259. return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)
  260. ModeResult = namedtuple('ModeResult', ('mode', 'count'))
  261. def mode(a, axis=0):
  262. """
  263. Returns an array of the modal (most common) value in the passed array.
  264. Parameters
  265. ----------
  266. a : array_like
  267. n-dimensional array of which to find mode(s).
  268. axis : int or None, optional
  269. Axis along which to operate. Default is 0. If None, compute over
  270. the whole array `a`.
  271. Returns
  272. -------
  273. mode : ndarray
  274. Array of modal values.
  275. count : ndarray
  276. Array of counts for each mode.
  277. Notes
  278. -----
  279. For more details, see `scipy.stats.mode`.
  280. Examples
  281. --------
  282. >>> import numpy as np
  283. >>> from scipy import stats
  284. >>> from scipy.stats import mstats
  285. >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
  286. >>> mstats.mode(m_arr) # note that most zeros are masked
  287. ModeResult(mode=array([1.]), count=array([2.]))
  288. """
  289. return _mode(a, axis=axis, keepdims=True)
  290. def _mode(a, axis=0, keepdims=True):
  291. # Don't want to expose `keepdims` from the public `mstats.mode`
  292. a, axis = _chk_asarray(a, axis)
  293. def _mode1D(a):
  294. (rep,cnt) = find_repeats(a)
  295. if not cnt.ndim:
  296. return (0, 0)
  297. elif cnt.size:
  298. return (rep[cnt.argmax()], cnt.max())
  299. else:
  300. return (a.min(), 1)
  301. if axis is None:
  302. output = _mode1D(ma.ravel(a))
  303. output = (ma.array(output[0]), ma.array(output[1]))
  304. else:
  305. output = ma.apply_along_axis(_mode1D, axis, a)
  306. if keepdims is None or keepdims:
  307. newshape = list(a.shape)
  308. newshape[axis] = 1
  309. slices = [slice(None)] * output.ndim
  310. slices[axis] = 0
  311. modes = output[tuple(slices)].reshape(newshape)
  312. slices[axis] = 1
  313. counts = output[tuple(slices)].reshape(newshape)
  314. output = (modes, counts)
  315. else:
  316. output = np.moveaxis(output, axis, 0)
  317. return ModeResult(*output)
  318. def _betai(a, b, x):
  319. x = np.asanyarray(x)
  320. x = ma.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0
  321. return special.betainc(a, b, x)
  322. def msign(x):
  323. """Returns the sign of x, or 0 if x is masked."""
  324. return ma.filled(np.sign(x), 0)
  325. def pearsonr(x, y):
  326. r"""
  327. Pearson correlation coefficient and p-value for testing non-correlation.
  328. The Pearson correlation coefficient [1]_ measures the linear relationship
  329. between two datasets. The calculation of the p-value relies on the
  330. assumption that each dataset is normally distributed. (See Kowalski [3]_
  331. for a discussion of the effects of non-normality of the input on the
  332. distribution of the correlation coefficient.) Like other correlation
  333. coefficients, this one varies between -1 and +1 with 0 implying no
  334. correlation. Correlations of -1 or +1 imply an exact linear relationship.
  335. Parameters
  336. ----------
  337. x : (N,) array_like
  338. Input array.
  339. y : (N,) array_like
  340. Input array.
  341. Returns
  342. -------
  343. r : float
  344. Pearson's correlation coefficient.
  345. p-value : float
  346. Two-tailed p-value.
  347. Warns
  348. -----
  349. `~scipy.stats.ConstantInputWarning`
  350. Raised if an input is a constant array. The correlation coefficient
  351. is not defined in this case, so ``np.nan`` is returned.
  352. `~scipy.stats.NearConstantInputWarning`
  353. Raised if an input is "nearly" constant. The array ``x`` is considered
  354. nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
  355. Numerical errors in the calculation ``x - mean(x)`` in this case might
  356. result in an inaccurate calculation of r.
  357. See Also
  358. --------
  359. spearmanr : Spearman rank-order correlation coefficient.
  360. kendalltau : Kendall's tau, a correlation measure for ordinal data.
  361. Notes
  362. -----
  363. The correlation coefficient is calculated as follows:
  364. .. math::
  365. r = \frac{\sum (x - m_x) (y - m_y)}
  366. {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
  367. where :math:`m_x` is the mean of the vector x and :math:`m_y` is
  368. the mean of the vector y.
  369. Under the assumption that x and y are drawn from
  370. independent normal distributions (so the population correlation coefficient
  371. is 0), the probability density function of the sample correlation
  372. coefficient r is ([1]_, [2]_):
  373. .. math::
  374. f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}
  375. where n is the number of samples, and B is the beta function. This
  376. is sometimes referred to as the exact distribution of r. This is
  377. the distribution that is used in `pearsonr` to compute the p-value.
  378. The distribution is a beta distribution on the interval [-1, 1],
  379. with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
  380. implementation of the beta distribution, the distribution of r is::
  381. dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
  382. The p-value returned by `pearsonr` is a two-sided p-value. The p-value
  383. roughly indicates the probability of an uncorrelated system
  384. producing datasets that have a Pearson correlation at least as extreme
  385. as the one computed from these datasets. More precisely, for a
  386. given sample with correlation coefficient r, the p-value is
  387. the probability that abs(r') of a random sample x' and y' drawn from
  388. the population with zero correlation would be greater than or equal
  389. to abs(r). In terms of the object ``dist`` shown above, the p-value
  390. for a given r and length n can be computed as::
  391. p = 2*dist.cdf(-abs(r))
  392. When n is 2, the above continuous distribution is not well-defined.
  393. One can interpret the limit of the beta distribution as the shape
  394. parameters a and b approach a = b = 0 as a discrete distribution with
  395. equal probability masses at r = 1 and r = -1. More directly, one
  396. can observe that, given the data x = [x1, x2] and y = [y1, y2], and
  397. assuming x1 != x2 and y1 != y2, the only possible values for r are 1
  398. and -1. Because abs(r') for any sample x' and y' with length 2 will
  399. be 1, the two-sided p-value for a sample of length 2 is always 1.
  400. References
  401. ----------
  402. .. [1] "Pearson correlation coefficient", Wikipedia,
  403. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
  404. .. [2] Student, "Probable error of a correlation coefficient",
  405. Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
  406. .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
  407. of the Sample Product-Moment Correlation Coefficient"
  408. Journal of the Royal Statistical Society. Series C (Applied
  409. Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
  410. Examples
  411. --------
  412. >>> import numpy as np
  413. >>> from scipy import stats
  414. >>> from scipy.stats import mstats
  415. >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
  416. (-0.7426106572325057, 0.1505558088534455)
  417. There is a linear dependence between x and y if y = a + b*x + e, where
  418. a,b are constants and e is a random error term, assumed to be independent
  419. of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
  420. e follow a normal distribution with mean zero and standard deviation s>0.
  421. >>> s = 0.5
  422. >>> x = stats.norm.rvs(size=500)
  423. >>> e = stats.norm.rvs(scale=s, size=500)
  424. >>> y = x + e
  425. >>> mstats.pearsonr(x, y)
  426. (0.9029601878969703, 8.428978827629898e-185) # may vary
  427. This should be close to the exact value given by
  428. >>> 1/np.sqrt(1 + s**2)
  429. 0.8944271909999159
  430. For s=0.5, we observe a high level of correlation. In general, a large
  431. variance of the noise reduces the correlation, while the correlation
  432. approaches one as the variance of the error goes to zero.
  433. It is important to keep in mind that no correlation does not imply
  434. independence unless (x, y) is jointly normal. Correlation can even be zero
  435. when there is a very simple dependence structure: if X follows a
  436. standard normal distribution, let y = abs(x). Note that the correlation
  437. between x and y is zero. Indeed, since the expectation of x is zero,
  438. cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
  439. by symmetry. The following lines of code illustrate this observation:
  440. >>> y = np.abs(x)
  441. >>> mstats.pearsonr(x, y)
  442. (-0.016172891856853524, 0.7182823678751942) # may vary
  443. A non-zero correlation coefficient can be misleading. For example, if X has
  444. a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
  445. A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
  446. implying a high level of correlation:
  447. >>> y = np.where(x < 0, x, 0)
  448. >>> mstats.pearsonr(x, y)
  449. (0.8537091583771509, 3.183461621422181e-143) # may vary
  450. This is unintuitive since there is no dependence of x and y if x is larger
  451. than zero which happens in about half of the cases if we sample x and y.
  452. """
  453. (x, y, n) = _chk_size(x, y)
  454. (x, y) = (x.ravel(), y.ravel())
  455. # Get the common mask and the total nb of unmasked elements
  456. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  457. n -= m.sum()
  458. df = n-2
  459. if df < 0:
  460. return (masked, masked)
  461. return scipy.stats._stats_py.pearsonr(
  462. ma.masked_array(x, mask=m).compressed(),
  463. ma.masked_array(y, mask=m).compressed())
  464. def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
  465. alternative='two-sided'):
  466. """
  467. Calculates a Spearman rank-order correlation coefficient and the p-value
  468. to test for non-correlation.
  469. The Spearman correlation is a nonparametric measure of the linear
  470. relationship between two datasets. Unlike the Pearson correlation, the
  471. Spearman correlation does not assume that both datasets are normally
  472. distributed. Like other correlation coefficients, this one varies
  473. between -1 and +1 with 0 implying no correlation. Correlations of -1 or
  474. +1 imply a monotonic relationship. Positive correlations imply that
  475. as `x` increases, so does `y`. Negative correlations imply that as `x`
  476. increases, `y` decreases.
  477. Missing values are discarded pair-wise: if a value is missing in `x`, the
  478. corresponding value in `y` is masked.
  479. The p-value roughly indicates the probability of an uncorrelated system
  480. producing datasets that have a Spearman correlation at least as extreme
  481. as the one computed from these datasets. The p-values are not entirely
  482. reliable but are probably reasonable for datasets larger than 500 or so.
  483. Parameters
  484. ----------
  485. x, y : 1D or 2D array_like, y is optional
  486. One or two 1-D or 2-D arrays containing multiple variables and
  487. observations. When these are 1-D, each represents a vector of
  488. observations of a single variable. For the behavior in the 2-D case,
  489. see under ``axis``, below.
  490. use_ties : bool, optional
  491. DO NOT USE. Does not do anything, keyword is only left in place for
  492. backwards compatibility reasons.
  493. axis : int or None, optional
  494. If axis=0 (default), then each column represents a variable, with
  495. observations in the rows. If axis=1, the relationship is transposed:
  496. each row represents a variable, while the columns contain observations.
  497. If axis=None, then both arrays will be raveled.
  498. nan_policy : {'propagate', 'raise', 'omit'}, optional
  499. Defines how to handle when input contains nan. 'propagate' returns nan,
  500. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  501. values. Default is 'propagate'.
  502. alternative : {'two-sided', 'less', 'greater'}, optional
  503. Defines the alternative hypothesis. Default is 'two-sided'.
  504. The following options are available:
  505. * 'two-sided': the correlation is nonzero
  506. * 'less': the correlation is negative (less than zero)
  507. * 'greater': the correlation is positive (greater than zero)
  508. .. versionadded:: 1.7.0
  509. Returns
  510. -------
  511. res : SignificanceResult
  512. An object containing attributes:
  513. statistic : float or ndarray (2-D square)
  514. Spearman correlation matrix or correlation coefficient (if only 2
  515. variables are given as parameters). Correlation matrix is square
  516. with length equal to total number of variables (columns or rows) in
  517. ``a`` and ``b`` combined.
  518. pvalue : float
  519. The p-value for a hypothesis test whose null hypothesis
  520. is that two sets of data are linearly uncorrelated. See
  521. `alternative` above for alternative hypotheses. `pvalue` has the
  522. same shape as `statistic`.
  523. References
  524. ----------
  525. [CRCProbStat2000] section 14.7
  526. """
  527. if not use_ties:
  528. raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
  529. # Always returns a masked array, raveled if axis=None
  530. x, axisout = _chk_asarray(x, axis)
  531. if y is not None:
  532. # Deal only with 2-D `x` case.
  533. y, _ = _chk_asarray(y, axis)
  534. if axisout == 0:
  535. x = ma.column_stack((x, y))
  536. else:
  537. x = ma.vstack((x, y))
  538. if axisout == 1:
  539. # To simplify the code that follow (always use `n_obs, n_vars` shape)
  540. x = x.T
  541. if nan_policy == 'omit':
  542. x = ma.masked_invalid(x)
  543. def _spearmanr_2cols(x):
  544. # Mask the same observations for all variables, and then drop those
  545. # observations (can't leave them masked, rankdata is weird).
  546. x = ma.mask_rowcols(x, axis=0)
  547. x = x[~x.mask.any(axis=1), :]
  548. # If either column is entirely NaN or Inf
  549. if not np.any(x.data):
  550. res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
  551. res.correlation = np.nan
  552. return res
  553. m = ma.getmask(x)
  554. n_obs = x.shape[0]
  555. dof = n_obs - 2 - int(m.sum(axis=0)[0])
  556. if dof < 0:
  557. raise ValueError("The input must have at least 3 entries!")
  558. # Gets the ranks and rank differences
  559. x_ranked = rankdata(x, axis=0)
  560. rs = ma.corrcoef(x_ranked, rowvar=False).data
  561. # rs can have elements equal to 1, so avoid zero division warnings
  562. with np.errstate(divide='ignore'):
  563. # clip the small negative values possibly caused by rounding
  564. # errors before taking the square root
  565. t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  566. t, prob = _ttest_finish(dof, t, alternative)
  567. # For backwards compatibility, return scalars when comparing 2 columns
  568. if rs.shape == (2, 2):
  569. res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
  570. prob[1, 0])
  571. res.correlation = rs[1, 0]
  572. return res
  573. else:
  574. res = scipy.stats._stats_py.SignificanceResult(rs, prob)
  575. res.correlation = rs
  576. return res
  577. # Need to do this per pair of variables, otherwise the dropped observations
  578. # in a third column mess up the result for a pair.
  579. n_vars = x.shape[1]
  580. if n_vars == 2:
  581. return _spearmanr_2cols(x)
  582. else:
  583. rs = np.ones((n_vars, n_vars), dtype=float)
  584. prob = np.zeros((n_vars, n_vars), dtype=float)
  585. for var1 in range(n_vars - 1):
  586. for var2 in range(var1+1, n_vars):
  587. result = _spearmanr_2cols(x[:, [var1, var2]])
  588. rs[var1, var2] = result.correlation
  589. rs[var2, var1] = result.correlation
  590. prob[var1, var2] = result.pvalue
  591. prob[var2, var1] = result.pvalue
  592. res = scipy.stats._stats_py.SignificanceResult(rs, prob)
  593. res.correlation = rs
  594. return res
  595. def _kendall_p_exact(n, c, alternative='two-sided'):
  596. # Use the fact that distribution is symmetric: always calculate a CDF in
  597. # the left tail.
  598. # This will be the one-sided p-value if `c` is on the side of
  599. # the null distribution predicted by the alternative hypothesis.
  600. # The two-sided p-value will be twice this value.
  601. # If `c` is on the other side of the null distribution, we'll need to
  602. # take the complement and add back the probability mass at `c`.
  603. in_right_tail = (c >= (n*(n-1))//2 - c)
  604. alternative_greater = (alternative == 'greater')
  605. c = int(min(c, (n*(n-1))//2 - c))
  606. # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
  607. # (4th Edition), Charles Griffin & Co., 1970.
  608. if n <= 0:
  609. raise ValueError(f'n ({n}) must be positive')
  610. elif c < 0 or 4*c > n*(n-1):
  611. raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
  612. elif n == 1:
  613. prob = 1.0
  614. p_mass_at_c = 1
  615. elif n == 2:
  616. prob = 1.0
  617. p_mass_at_c = 0.5
  618. elif c == 0:
  619. prob = 2.0/math.factorial(n) if n < 171 else 0.0
  620. p_mass_at_c = prob/2
  621. elif c == 1:
  622. prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
  623. p_mass_at_c = (n-1)/math.factorial(n)
  624. elif 4*c == n*(n-1) and alternative == 'two-sided':
  625. # I'm sure there's a simple formula for p_mass_at_c in this
  626. # case, but I don't know it. Use generic formula for one-sided p-value.
  627. prob = 1.0
  628. elif n < 171:
  629. new = np.zeros(c+1)
  630. new[0:2] = 1.0
  631. for j in range(3,n+1):
  632. new = np.cumsum(new)
  633. if j <= c:
  634. new[j:] -= new[:c+1-j]
  635. prob = 2.0*np.sum(new)/math.factorial(n)
  636. p_mass_at_c = new[-1]/math.factorial(n)
  637. else:
  638. new = np.zeros(c+1)
  639. new[0:2] = 1.0
  640. for j in range(3, n+1):
  641. new = np.cumsum(new)/j
  642. if j <= c:
  643. new[j:] -= new[:c+1-j]
  644. prob = np.sum(new)
  645. p_mass_at_c = new[-1]/2
  646. if alternative != 'two-sided':
  647. # if the alternative hypothesis and alternative agree,
  648. # one-sided p-value is half the two-sided p-value
  649. if in_right_tail == alternative_greater:
  650. prob /= 2
  651. else:
  652. prob = 1 - prob/2 + p_mass_at_c
  653. prob = np.clip(prob, 0, 1)
  654. return prob
  655. def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
  656. alternative='two-sided'):
  657. """
  658. Computes Kendall's rank correlation tau on two variables *x* and *y*.
  659. Parameters
  660. ----------
  661. x : sequence
  662. First data list (for example, time).
  663. y : sequence
  664. Second data list.
  665. use_ties : {True, False}, optional
  666. Whether ties correction should be performed.
  667. use_missing : {False, True}, optional
  668. Whether missing data should be allocated a rank of 0 (False) or the
  669. average rank (True)
  670. method : {'auto', 'asymptotic', 'exact'}, optional
  671. Defines which method is used to calculate the p-value [1]_.
  672. 'asymptotic' uses a normal approximation valid for large samples.
  673. 'exact' computes the exact p-value, but can only be used if no ties
  674. are present. As the sample size increases, the 'exact' computation
  675. time may grow and the result may lose some precision.
  676. 'auto' is the default and selects the appropriate
  677. method based on a trade-off between speed and accuracy.
  678. alternative : {'two-sided', 'less', 'greater'}, optional
  679. Defines the alternative hypothesis. Default is 'two-sided'.
  680. The following options are available:
  681. * 'two-sided': the rank correlation is nonzero
  682. * 'less': the rank correlation is negative (less than zero)
  683. * 'greater': the rank correlation is positive (greater than zero)
  684. Returns
  685. -------
  686. res : SignificanceResult
  687. An object containing attributes:
  688. statistic : float
  689. The tau statistic.
  690. pvalue : float
  691. The p-value for a hypothesis test whose null hypothesis is
  692. an absence of association, tau = 0.
  693. References
  694. ----------
  695. .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
  696. Charles Griffin & Co., 1970.
  697. """
  698. (x, y, n) = _chk_size(x, y)
  699. (x, y) = (x.flatten(), y.flatten())
  700. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  701. if m is not nomask:
  702. x = ma.array(x, mask=m, copy=True)
  703. y = ma.array(y, mask=m, copy=True)
  704. # need int() here, otherwise numpy defaults to 32 bit
  705. # integer on all Windows architectures, causing overflow.
  706. # int() will keep it infinite precision.
  707. n -= int(m.sum())
  708. if n < 2:
  709. res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
  710. res.correlation = np.nan
  711. return res
  712. rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
  713. ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
  714. idx = rx.argsort()
  715. (rx, ry) = (rx[idx], ry[idx])
  716. C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
  717. for i in range(len(ry)-1)], dtype=float)
  718. D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum()
  719. for i in range(len(ry)-1)], dtype=float)
  720. xties = count_tied_groups(x)
  721. yties = count_tied_groups(y)
  722. if use_ties:
  723. corr_x = np.sum([v*k*(k-1) for (k,v) in xties.items()], dtype=float)
  724. corr_y = np.sum([v*k*(k-1) for (k,v) in yties.items()], dtype=float)
  725. denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
  726. else:
  727. denom = n*(n-1)/2.
  728. tau = (C-D) / denom
  729. if method == 'exact' and (xties or yties):
  730. raise ValueError("Ties found, exact method cannot be used.")
  731. if method == 'auto':
  732. if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
  733. method = 'exact'
  734. else:
  735. method = 'asymptotic'
  736. if not xties and not yties and method == 'exact':
  737. prob = _kendall_p_exact(n, C, alternative)
  738. elif method == 'asymptotic':
  739. var_s = n*(n-1)*(2*n+5)
  740. if use_ties:
  741. var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in xties.items()])
  742. var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in yties.items()])
  743. v1 = (np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *
  744. np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float))
  745. v1 /= 2.*n*(n-1)
  746. if n > 2:
  747. v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in xties.items()],
  748. dtype=float) * \
  749. np.sum([v*k*(k-1)*(k-2) for (k,v) in yties.items()],
  750. dtype=float)
  751. v2 /= 9.*n*(n-1)*(n-2)
  752. else:
  753. v2 = 0
  754. else:
  755. v1 = v2 = 0
  756. var_s /= 18.
  757. var_s += (v1 + v2)
  758. z = (C-D)/np.sqrt(var_s)
  759. prob = scipy.stats._stats_py._get_pvalue(z, distributions.norm, alternative)
  760. else:
  761. raise ValueError("Unknown method "+str(method)+" specified, please "
  762. "use auto, exact or asymptotic.")
  763. res = scipy.stats._stats_py.SignificanceResult(tau[()], prob[()])
  764. res.correlation = tau
  765. return res
  766. def kendalltau_seasonal(x):
  767. """
  768. Computes a multivariate Kendall's rank correlation tau, for seasonal data.
  769. Parameters
  770. ----------
  771. x : 2-D ndarray
  772. Array of seasonal data, with seasons in columns.
  773. """
  774. x = ma.array(x, subok=True, copy=False, ndmin=2)
  775. (n,m) = x.shape
  776. n_p = x.count(0)
  777. S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
  778. S_tot = S_szn.sum()
  779. n_tot = x.count()
  780. ties = count_tied_groups(x.compressed())
  781. corr_ties = sum(v*k*(k-1) for (k,v) in ties.items())
  782. denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.
  783. R = rankdata(x, axis=0, use_missing=True)
  784. K = ma.empty((m,m), dtype=int)
  785. covmat = ma.empty((m,m), dtype=float)
  786. denom_szn = ma.empty(m, dtype=float)
  787. for j in range(m):
  788. ties_j = count_tied_groups(x[:,j].compressed())
  789. corr_j = sum(v*k*(k-1) for (k,v) in ties_j.items())
  790. cmb = n_p[j]*(n_p[j]-1)
  791. for k in range(j,m,1):
  792. K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
  793. for i in range(n))
  794. covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() -
  795. n*(n_p[j]+1)*(n_p[k]+1))/3.
  796. K[k,j] = K[j,k]
  797. covmat[k,j] = covmat[j,k]
  798. denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.
  799. var_szn = covmat.diagonal()
  800. z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
  801. z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
  802. z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())
  803. prob_szn = special.erfc(abs(z_szn.data)/np.sqrt(2))
  804. prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
  805. prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))
  806. chi2_tot = (z_szn*z_szn).sum()
  807. chi2_trd = m * z_szn.mean()**2
  808. output = {'seasonal tau': S_szn/denom_szn,
  809. 'global tau': S_tot/denom_tot,
  810. 'global tau (alt)': S_tot/denom_szn.sum(),
  811. 'seasonal p-value': prob_szn,
  812. 'global p-value (indep)': prob_tot_ind,
  813. 'global p-value (dep)': prob_tot_dep,
  814. 'chi2 total': chi2_tot,
  815. 'chi2 trend': chi2_trd,
  816. }
  817. return output
  818. PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
  819. 'pvalue'))
  820. def pointbiserialr(x, y):
  821. """Calculates a point biserial correlation coefficient and its p-value.
  822. Parameters
  823. ----------
  824. x : array_like of bools
  825. Input array.
  826. y : array_like
  827. Input array.
  828. Returns
  829. -------
  830. correlation : float
  831. R value
  832. pvalue : float
  833. 2-tailed p-value
  834. Notes
  835. -----
  836. Missing values are considered pair-wise: if a value is missing in x,
  837. the corresponding value in y is masked.
  838. For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.
  839. """
  840. x = ma.fix_invalid(x, copy=True).astype(bool)
  841. y = ma.fix_invalid(y, copy=True).astype(float)
  842. # Get rid of the missing data
  843. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  844. if m is not nomask:
  845. unmask = np.logical_not(m)
  846. x = x[unmask]
  847. y = y[unmask]
  848. n = len(x)
  849. # phat is the fraction of x values that are True
  850. phat = x.sum() / float(n)
  851. y0 = y[~x] # y-values where x is False
  852. y1 = y[x] # y-values where x is True
  853. y0m = y0.mean()
  854. y1m = y1.mean()
  855. rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()
  856. df = n-2
  857. t = rpb*ma.sqrt(df/(1.0-rpb**2))
  858. prob = _betai(0.5*df, 0.5, df/(df+t*t))
  859. return PointbiserialrResult(rpb, prob)
  860. def linregress(x, y=None):
  861. r"""
  862. Calculate a linear least-squares regression for two sets of measurements.
  863. Parameters
  864. ----------
  865. x, y : array_like
  866. Two sets of measurements. Both arrays should have the same length N. If
  867. only `x` is given (and ``y=None``), then it must be a two-dimensional
  868. array where one dimension has length 2. The two sets of measurements
  869. are then found by splitting the array along the length-2 dimension. In
  870. the case where ``y=None`` and `x` is a 2xN array, ``linregress(x)`` is
  871. equivalent to ``linregress(x[0], x[1])``.
  872. Returns
  873. -------
  874. result : ``LinregressResult`` instance
  875. The return value is an object with the following attributes:
  876. slope : float
  877. Slope of the regression line.
  878. intercept : float
  879. Intercept of the regression line.
  880. rvalue : float
  881. The Pearson correlation coefficient. The square of ``rvalue``
  882. is equal to the coefficient of determination.
  883. pvalue : float
  884. The p-value for a hypothesis test whose null hypothesis is
  885. that the slope is zero, using Wald Test with t-distribution of
  886. the test statistic. See `alternative` above for alternative
  887. hypotheses.
  888. stderr : float
  889. Standard error of the estimated slope (gradient), under the
  890. assumption of residual normality.
  891. intercept_stderr : float
  892. Standard error of the estimated intercept, under the assumption
  893. of residual normality.
  894. See Also
  895. --------
  896. scipy.optimize.curve_fit :
  897. Use non-linear least squares to fit a function to data.
  898. scipy.optimize.leastsq :
  899. Minimize the sum of squares of a set of equations.
  900. Notes
  901. -----
  902. Missing values are considered pair-wise: if a value is missing in `x`,
  903. the corresponding value in `y` is masked.
  904. For compatibility with older versions of SciPy, the return value acts
  905. like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
  906. ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::
  907. slope, intercept, r, p, se = linregress(x, y)
  908. With that style, however, the standard error of the intercept is not
  909. available. To have access to all the computed values, including the
  910. standard error of the intercept, use the return value as an object
  911. with attributes, e.g.::
  912. result = linregress(x, y)
  913. print(result.intercept, result.intercept_stderr)
  914. Examples
  915. --------
  916. >>> import numpy as np
  917. >>> import matplotlib.pyplot as plt
  918. >>> from scipy import stats
  919. >>> rng = np.random.default_rng()
  920. Generate some data:
  921. >>> x = rng.random(10)
  922. >>> y = 1.6*x + rng.random(10)
  923. Perform the linear regression:
  924. >>> res = stats.mstats.linregress(x, y)
  925. Coefficient of determination (R-squared):
  926. >>> print(f"R-squared: {res.rvalue**2:.6f}")
  927. R-squared: 0.717533
  928. Plot the data along with the fitted line:
  929. >>> plt.plot(x, y, 'o', label='original data')
  930. >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
  931. >>> plt.legend()
  932. >>> plt.show()
  933. Calculate 95% confidence interval on slope and intercept:
  934. >>> # Two-sided inverse Students t-distribution
  935. >>> # p - probability, df - degrees of freedom
  936. >>> from scipy.stats import t
  937. >>> tinv = lambda p, df: abs(t.ppf(p/2, df))
  938. >>> ts = tinv(0.05, len(x)-2)
  939. >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
  940. slope (95%): 1.453392 +/- 0.743465
  941. >>> print(f"intercept (95%): {res.intercept:.6f}"
  942. ... f" +/- {ts*res.intercept_stderr:.6f}")
  943. intercept (95%): 0.616950 +/- 0.544475
  944. """
  945. if y is None:
  946. x = ma.array(x)
  947. if x.shape[0] == 2:
  948. x, y = x
  949. elif x.shape[1] == 2:
  950. x, y = x.T
  951. else:
  952. raise ValueError("If only `x` is given as input, "
  953. "it has to be of shape (2, N) or (N, 2), "
  954. f"provided shape was {x.shape}")
  955. else:
  956. x = ma.array(x)
  957. y = ma.array(y)
  958. x = x.flatten()
  959. y = y.flatten()
  960. if np.amax(x) == np.amin(x) and len(x) > 1:
  961. raise ValueError("Cannot calculate a linear regression "
  962. "if all x values are identical")
  963. m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
  964. if m is not nomask:
  965. x = ma.array(x, mask=m)
  966. y = ma.array(y, mask=m)
  967. if np.any(~m):
  968. result = _stats_py.linregress(x.data[~m], y.data[~m])
  969. else:
  970. # All data is masked
  971. result = _stats_py.LinregressResult(slope=None, intercept=None,
  972. rvalue=None, pvalue=None,
  973. stderr=None,
  974. intercept_stderr=None)
  975. else:
  976. result = _stats_py.linregress(x.data, y.data)
  977. return result
  978. def theilslopes(y, x=None, alpha=0.95, method='separate'):
  979. r"""
  980. Computes the Theil-Sen estimator for a set of points (x, y).
  981. `theilslopes` implements a method for robust linear regression. It
  982. computes the slope as the median of all slopes between paired values.
  983. Parameters
  984. ----------
  985. y : array_like
  986. Dependent variable.
  987. x : array_like or None, optional
  988. Independent variable. If None, use ``arange(len(y))`` instead.
  989. alpha : float, optional
  990. Confidence degree between 0 and 1. Default is 95% confidence.
  991. Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
  992. interpreted as "find the 90% confidence interval".
  993. method : {'joint', 'separate'}, optional
  994. Method to be used for computing estimate for intercept.
  995. Following methods are supported,
  996. * 'joint': Uses np.median(y - slope * x) as intercept.
  997. * 'separate': Uses np.median(y) - slope * np.median(x)
  998. as intercept.
  999. The default is 'separate'.
  1000. .. versionadded:: 1.8.0
  1001. Returns
  1002. -------
  1003. result : ``TheilslopesResult`` instance
  1004. The return value is an object with the following attributes:
  1005. slope : float
  1006. Theil slope.
  1007. intercept : float
  1008. Intercept of the Theil line.
  1009. low_slope : float
  1010. Lower bound of the confidence interval on `slope`.
  1011. high_slope : float
  1012. Upper bound of the confidence interval on `slope`.
  1013. See Also
  1014. --------
  1015. siegelslopes : a similar technique using repeated medians
  1016. Notes
  1017. -----
  1018. For more details on `theilslopes`, see `scipy.stats.theilslopes`.
  1019. """
  1020. y = ma.asarray(y).flatten()
  1021. if x is None:
  1022. x = ma.arange(len(y), dtype=float)
  1023. else:
  1024. x = ma.asarray(x).flatten()
  1025. if len(x) != len(y):
  1026. raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
  1027. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  1028. y._mask = x._mask = m
  1029. # Disregard any masked elements of x or y
  1030. y = y.compressed()
  1031. x = x.compressed().astype(float)
  1032. # We now have unmasked arrays so can use `scipy.stats.theilslopes`
  1033. return stats_theilslopes(y, x, alpha=alpha, method=method)
  1034. def siegelslopes(y, x=None, method="hierarchical"):
  1035. r"""
  1036. Computes the Siegel estimator for a set of points (x, y).
  1037. `siegelslopes` implements a method for robust linear regression
  1038. using repeated medians to fit a line to the points (x, y).
  1039. The method is robust to outliers with an asymptotic breakdown point
  1040. of 50%.
  1041. Parameters
  1042. ----------
  1043. y : array_like
  1044. Dependent variable.
  1045. x : array_like or None, optional
  1046. Independent variable. If None, use ``arange(len(y))`` instead.
  1047. method : {'hierarchical', 'separate'}
  1048. If 'hierarchical', estimate the intercept using the estimated
  1049. slope ``slope`` (default option).
  1050. If 'separate', estimate the intercept independent of the estimated
  1051. slope. See Notes for details.
  1052. Returns
  1053. -------
  1054. result : ``SiegelslopesResult`` instance
  1055. The return value is an object with the following attributes:
  1056. slope : float
  1057. Estimate of the slope of the regression line.
  1058. intercept : float
  1059. Estimate of the intercept of the regression line.
  1060. See Also
  1061. --------
  1062. theilslopes : a similar technique without repeated medians
  1063. Notes
  1064. -----
  1065. For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.
  1066. """
  1067. y = ma.asarray(y).ravel()
  1068. if x is None:
  1069. x = ma.arange(len(y), dtype=float)
  1070. else:
  1071. x = ma.asarray(x).ravel()
  1072. if len(x) != len(y):
  1073. raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
  1074. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  1075. y._mask = x._mask = m
  1076. # Disregard any masked elements of x or y
  1077. y = y.compressed()
  1078. x = x.compressed().astype(float)
  1079. # We now have unmasked arrays so can use `scipy.stats.siegelslopes`
  1080. return stats_siegelslopes(y, x, method=method)
  1081. SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult',
  1082. ['intra_slope', 'inter_slope'])
  1083. def sen_seasonal_slopes(x):
  1084. r"""
  1085. Computes seasonal Theil-Sen and Kendall slope estimators.
  1086. The seasonal generalization of Sen's slope computes the slopes between all
  1087. pairs of values within a "season" (column) of a 2D array. It returns an
  1088. array containing the median of these "within-season" slopes for each
  1089. season (the Theil-Sen slope estimator of each season), and it returns the
  1090. median of the within-season slopes across all seasons (the seasonal Kendall
  1091. slope estimator).
  1092. Parameters
  1093. ----------
  1094. x : 2D array_like
  1095. Each column of `x` contains measurements of the dependent variable
  1096. within a season. The independent variable (usually time) of each season
  1097. is assumed to be ``np.arange(x.shape[0])``.
  1098. Returns
  1099. -------
  1100. result : ``SenSeasonalSlopesResult`` instance
  1101. The return value is an object with the following attributes:
  1102. intra_slope : ndarray
  1103. For each season, the Theil-Sen slope estimator: the median of
  1104. within-season slopes.
  1105. inter_slope : float
  1106. The seasonal Kendall slope estimator: the median of within-season
  1107. slopes *across all* seasons.
  1108. See Also
  1109. --------
  1110. theilslopes : the analogous function for non-seasonal data
  1111. scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays
  1112. Notes
  1113. -----
  1114. The slopes :math:`d_{ijk}` within season :math:`i` are:
  1115. .. math::
  1116. d_{ijk} = \frac{x_{ij} - x_{ik}}
  1117. {j - k}
  1118. for pairs of distinct integer indices :math:`j, k` of :math:`x`.
  1119. Element :math:`i` of the returned `intra_slope` array is the median of the
  1120. :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
  1121. estimator of season :math:`i`. The returned `inter_slope` value, better
  1122. known as the seasonal Kendall slope estimator, is the median of the
  1123. :math:`d_{ijk}` over all :math:`i, j, k`.
  1124. References
  1125. ----------
  1126. .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
  1127. "Techniques of trend analysis for monthly water quality data."
  1128. *Water Resources Research* 18.1 (1982): 107-121.
  1129. Examples
  1130. --------
  1131. Suppose we have 100 observations of a dependent variable for each of four
  1132. seasons:
  1133. >>> import numpy as np
  1134. >>> rng = np.random.default_rng()
  1135. >>> x = rng.random(size=(100, 4))
  1136. We compute the seasonal slopes as:
  1137. >>> from scipy import stats
  1138. >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)
  1139. If we define a function to compute all slopes between observations within
  1140. a season:
  1141. >>> def dijk(yi):
  1142. ... n = len(yi)
  1143. ... x = np.arange(n)
  1144. ... dy = yi - yi[:, np.newaxis]
  1145. ... dx = x - x[:, np.newaxis]
  1146. ... # we only want unique pairs of distinct indices
  1147. ... mask = np.triu(np.ones((n, n), dtype=bool), k=1)
  1148. ... return dy[mask]/dx[mask]
  1149. then element ``i`` of ``intra_slope`` is the median of ``dijk[x[:, i]]``:
  1150. >>> i = 2
  1151. >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
  1152. True
  1153. and ``inter_slope`` is the median of the values returned by ``dijk`` for
  1154. all seasons:
  1155. >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
  1156. >>> np.allclose(np.median(all_slopes), inter_slope)
  1157. True
  1158. Because the data are randomly generated, we would expect the median slopes
  1159. to be nearly zero both within and across all seasons, and indeed they are:
  1160. >>> intra_slope.data
  1161. array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])
  1162. >>> inter_slope
  1163. -0.0010511779872922058
  1164. """
  1165. x = ma.array(x, subok=True, copy=False, ndmin=2)
  1166. (n,_) = x.shape
  1167. # Get list of slopes per season
  1168. szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
  1169. for i in range(n)])
  1170. szn_medslopes = ma.median(szn_slopes, axis=0)
  1171. medslope = ma.median(szn_slopes, axis=None)
  1172. return SenSeasonalSlopesResult(szn_medslopes, medslope)
  1173. Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
  1174. def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
  1175. """
  1176. Calculates the T-test for the mean of ONE group of scores.
  1177. Parameters
  1178. ----------
  1179. a : array_like
  1180. sample observation
  1181. popmean : float or array_like
  1182. expected value in null hypothesis, if array_like than it must have the
  1183. same shape as `a` excluding the axis dimension
  1184. axis : int or None, optional
  1185. Axis along which to compute test. If None, compute over the whole
  1186. array `a`.
  1187. alternative : {'two-sided', 'less', 'greater'}, optional
  1188. Defines the alternative hypothesis.
  1189. The following options are available (default is 'two-sided'):
  1190. * 'two-sided': the mean of the underlying distribution of the sample
  1191. is different than the given population mean (`popmean`)
  1192. * 'less': the mean of the underlying distribution of the sample is
  1193. less than the given population mean (`popmean`)
  1194. * 'greater': the mean of the underlying distribution of the sample is
  1195. greater than the given population mean (`popmean`)
  1196. .. versionadded:: 1.7.0
  1197. Returns
  1198. -------
  1199. statistic : float or array
  1200. t-statistic
  1201. pvalue : float or array
  1202. The p-value
  1203. Notes
  1204. -----
  1205. For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.
  1206. """
  1207. a, axis = _chk_asarray(a, axis)
  1208. if a.size == 0:
  1209. return (np.nan, np.nan)
  1210. x = a.mean(axis=axis)
  1211. v = a.var(axis=axis, ddof=1)
  1212. n = a.count(axis=axis)
  1213. # force df to be an array for masked division not to throw a warning
  1214. df = ma.asanyarray(n - 1.0)
  1215. svar = ((n - 1.0) * v) / df
  1216. with np.errstate(divide='ignore', invalid='ignore'):
  1217. t = (x - popmean) / ma.sqrt(svar / n)
  1218. t, prob = _ttest_finish(df, t, alternative)
  1219. return Ttest_1sampResult(t, prob)
  1220. ttest_onesamp = ttest_1samp
  1221. Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
  1222. def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
  1223. """
  1224. Calculates the T-test for the means of TWO INDEPENDENT samples of scores.
  1225. Parameters
  1226. ----------
  1227. a, b : array_like
  1228. The arrays must have the same shape, except in the dimension
  1229. corresponding to `axis` (the first, by default).
  1230. axis : int or None, optional
  1231. Axis along which to compute test. If None, compute over the whole
  1232. arrays, `a`, and `b`.
  1233. equal_var : bool, optional
  1234. If True, perform a standard independent 2 sample test that assumes equal
  1235. population variances.
  1236. If False, perform Welch's t-test, which does not assume equal population
  1237. variance.
  1238. .. versionadded:: 0.17.0
  1239. alternative : {'two-sided', 'less', 'greater'}, optional
  1240. Defines the alternative hypothesis.
  1241. The following options are available (default is 'two-sided'):
  1242. * 'two-sided': the means of the distributions underlying the samples
  1243. are unequal.
  1244. * 'less': the mean of the distribution underlying the first sample
  1245. is less than the mean of the distribution underlying the second
  1246. sample.
  1247. * 'greater': the mean of the distribution underlying the first
  1248. sample is greater than the mean of the distribution underlying
  1249. the second sample.
  1250. .. versionadded:: 1.7.0
  1251. Returns
  1252. -------
  1253. statistic : float or array
  1254. The calculated t-statistic.
  1255. pvalue : float or array
  1256. The p-value.
  1257. Notes
  1258. -----
  1259. For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.
  1260. """
  1261. a, b, axis = _chk2_asarray(a, b, axis)
  1262. if a.size == 0 or b.size == 0:
  1263. return Ttest_indResult(np.nan, np.nan)
  1264. (x1, x2) = (a.mean(axis), b.mean(axis))
  1265. (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
  1266. (n1, n2) = (a.count(axis), b.count(axis))
  1267. if equal_var:
  1268. # force df to be an array for masked division not to throw a warning
  1269. df = ma.asanyarray(n1 + n2 - 2.0)
  1270. svar = ((n1-1)*v1+(n2-1)*v2) / df
  1271. denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # n-D computation here!
  1272. else:
  1273. vn1 = v1/n1
  1274. vn2 = v2/n2
  1275. with np.errstate(divide='ignore', invalid='ignore'):
  1276. df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
  1277. # If df is undefined, variances are zero.
  1278. # It doesn't matter what df is as long as it is not NaN.
  1279. df = np.where(np.isnan(df), 1, df)
  1280. denom = ma.sqrt(vn1 + vn2)
  1281. with np.errstate(divide='ignore', invalid='ignore'):
  1282. t = (x1-x2) / denom
  1283. t, prob = _ttest_finish(df, t, alternative)
  1284. return Ttest_indResult(t, prob)
  1285. Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))
  1286. def ttest_rel(a, b, axis=0, alternative='two-sided'):
  1287. """
  1288. Calculates the T-test on TWO RELATED samples of scores, a and b.
  1289. Parameters
  1290. ----------
  1291. a, b : array_like
  1292. The arrays must have the same shape.
  1293. axis : int or None, optional
  1294. Axis along which to compute test. If None, compute over the whole
  1295. arrays, `a`, and `b`.
  1296. alternative : {'two-sided', 'less', 'greater'}, optional
  1297. Defines the alternative hypothesis.
  1298. The following options are available (default is 'two-sided'):
  1299. * 'two-sided': the means of the distributions underlying the samples
  1300. are unequal.
  1301. * 'less': the mean of the distribution underlying the first sample
  1302. is less than the mean of the distribution underlying the second
  1303. sample.
  1304. * 'greater': the mean of the distribution underlying the first
  1305. sample is greater than the mean of the distribution underlying
  1306. the second sample.
  1307. .. versionadded:: 1.7.0
  1308. Returns
  1309. -------
  1310. statistic : float or array
  1311. t-statistic
  1312. pvalue : float or array
  1313. two-tailed p-value
  1314. Notes
  1315. -----
  1316. For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.
  1317. """
  1318. a, b, axis = _chk2_asarray(a, b, axis)
  1319. if len(a) != len(b):
  1320. raise ValueError('unequal length arrays')
  1321. if a.size == 0 or b.size == 0:
  1322. return Ttest_relResult(np.nan, np.nan)
  1323. n = a.count(axis)
  1324. df = ma.asanyarray(n-1.0)
  1325. d = (a-b).astype('d')
  1326. dm = d.mean(axis)
  1327. v = d.var(axis=axis, ddof=1)
  1328. denom = ma.sqrt(v / n)
  1329. with np.errstate(divide='ignore', invalid='ignore'):
  1330. t = dm / denom
  1331. t, prob = _ttest_finish(df, t, alternative)
  1332. return Ttest_relResult(t, prob)
  1333. MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
  1334. 'pvalue'))
  1335. def mannwhitneyu(x,y, use_continuity=True):
  1336. """
  1337. Computes the Mann-Whitney statistic
  1338. Missing values in `x` and/or `y` are discarded.
  1339. Parameters
  1340. ----------
  1341. x : sequence
  1342. Input
  1343. y : sequence
  1344. Input
  1345. use_continuity : {True, False}, optional
  1346. Whether a continuity correction (1/2.) should be taken into account.
  1347. Returns
  1348. -------
  1349. statistic : float
  1350. The minimum of the Mann-Whitney statistics
  1351. pvalue : float
  1352. Approximate two-sided p-value assuming a normal distribution.
  1353. """
  1354. x = ma.asarray(x).compressed().view(ndarray)
  1355. y = ma.asarray(y).compressed().view(ndarray)
  1356. ranks = rankdata(np.concatenate([x,y]))
  1357. (nx, ny) = (len(x), len(y))
  1358. nt = nx + ny
  1359. U = ranks[:nx].sum() - nx*(nx+1)/2.
  1360. U = max(U, nx*ny - U)
  1361. u = nx*ny - U
  1362. mu = (nx*ny)/2.
  1363. sigsq = (nt**3 - nt)/12.
  1364. ties = count_tied_groups(ranks)
  1365. sigsq -= sum(v*(k**3-k) for (k,v) in ties.items())/12.
  1366. sigsq *= nx*ny/float(nt*(nt-1))
  1367. if use_continuity:
  1368. z = (U - 1/2. - mu) / ma.sqrt(sigsq)
  1369. else:
  1370. z = (U - mu) / ma.sqrt(sigsq)
  1371. prob = special.erfc(abs(z)/np.sqrt(2))
  1372. return MannwhitneyuResult(u, prob)
  1373. KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
  1374. def kruskal(*args):
  1375. """
  1376. Compute the Kruskal-Wallis H-test for independent samples
  1377. Parameters
  1378. ----------
  1379. sample1, sample2, ... : array_like
  1380. Two or more arrays with the sample measurements can be given as
  1381. arguments.
  1382. Returns
  1383. -------
  1384. statistic : float
  1385. The Kruskal-Wallis H statistic, corrected for ties
  1386. pvalue : float
  1387. The p-value for the test using the assumption that H has a chi
  1388. square distribution
  1389. Notes
  1390. -----
  1391. For more details on `kruskal`, see `scipy.stats.kruskal`.
  1392. Examples
  1393. --------
  1394. >>> from scipy.stats.mstats import kruskal
  1395. Random samples from three different brands of batteries were tested
  1396. to see how long the charge lasted. Results were as follows:
  1397. >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
  1398. >>> b = [6.9, 7.0, 6.1, 7.9]
  1399. >>> c = [7.2, 6.9, 6.1, 6.5]
  1400. Test the hypothesis that the distribution functions for all of the brands'
  1401. durations are identical. Use 5% level of significance.
  1402. >>> kruskal(a, b, c)
  1403. KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)
  1404. The null hypothesis is rejected at the 5% level of significance
  1405. because the returned p-value is less than the critical value of 5%.
  1406. """
  1407. output = argstoarray(*args)
  1408. ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
  1409. sumrk = ranks.sum(-1)
  1410. ngrp = ranks.count(-1)
  1411. ntot = ranks.count()
  1412. H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
  1413. # Tie correction
  1414. ties = count_tied_groups(ranks)
  1415. T = 1. - sum(v*(k**3-k) for (k,v) in ties.items())/float(ntot**3-ntot)
  1416. if T == 0:
  1417. raise ValueError('All numbers are identical in kruskal')
  1418. H /= T
  1419. df = len(output) - 1
  1420. prob = distributions.chi2.sf(H, df)
  1421. return KruskalResult(H, prob)
  1422. kruskalwallis = kruskal
  1423. @_rename_parameter("mode", "method")
  1424. def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'):
  1425. """
  1426. Computes the Kolmogorov-Smirnov test on one sample of masked values.
  1427. Missing values in `x` are discarded.
  1428. Parameters
  1429. ----------
  1430. x : array_like
  1431. a 1-D array of observations of random variables.
  1432. cdf : str or callable
  1433. If a string, it should be the name of a distribution in `scipy.stats`.
  1434. If a callable, that callable is used to calculate the cdf.
  1435. args : tuple, sequence, optional
  1436. Distribution parameters, used if `cdf` is a string.
  1437. alternative : {'two-sided', 'less', 'greater'}, optional
  1438. Indicates the alternative hypothesis. Default is 'two-sided'.
  1439. method : {'auto', 'exact', 'asymp'}, optional
  1440. Defines the method used for calculating the p-value.
  1441. The following options are available (default is 'auto'):
  1442. * 'auto' : use 'exact' for small size arrays, 'asymp' for large
  1443. * 'exact' : use approximation to exact distribution of test statistic
  1444. * 'asymp' : use asymptotic distribution of test statistic
  1445. Returns
  1446. -------
  1447. d : float
  1448. Value of the Kolmogorov Smirnov test
  1449. p : float
  1450. Corresponding p-value.
  1451. """
  1452. alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
  1453. alternative.lower()[0], alternative)
  1454. return scipy.stats._stats_py.ks_1samp(
  1455. x, cdf, args=args, alternative=alternative, method=method)
  1456. @_rename_parameter("mode", "method")
  1457. def ks_2samp(data1, data2, alternative="two-sided", method='auto'):
  1458. """
  1459. Computes the Kolmogorov-Smirnov test on two samples.
  1460. Missing values in `x` and/or `y` are discarded.
  1461. Parameters
  1462. ----------
  1463. data1 : array_like
  1464. First data set
  1465. data2 : array_like
  1466. Second data set
  1467. alternative : {'two-sided', 'less', 'greater'}, optional
  1468. Indicates the alternative hypothesis. Default is 'two-sided'.
  1469. method : {'auto', 'exact', 'asymp'}, optional
  1470. Defines the method used for calculating the p-value.
  1471. The following options are available (default is 'auto'):
  1472. * 'auto' : use 'exact' for small size arrays, 'asymp' for large
  1473. * 'exact' : use approximation to exact distribution of test statistic
  1474. * 'asymp' : use asymptotic distribution of test statistic
  1475. Returns
  1476. -------
  1477. d : float
  1478. Value of the Kolmogorov Smirnov test
  1479. p : float
  1480. Corresponding p-value.
  1481. """
  1482. # Ideally this would be accomplished by
  1483. # ks_2samp = scipy.stats._stats_py.ks_2samp
  1484. # but the circular dependencies between _mstats_basic and stats prevent that.
  1485. alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
  1486. alternative.lower()[0], alternative)
  1487. return scipy.stats._stats_py.ks_2samp(data1, data2,
  1488. alternative=alternative,
  1489. method=method)
  1490. ks_twosamp = ks_2samp
  1491. @_rename_parameter("mode", "method")
  1492. def kstest(data1, data2, args=(), alternative='two-sided', method='auto'):
  1493. """
  1494. Parameters
  1495. ----------
  1496. data1 : array_like
  1497. data2 : str, callable or array_like
  1498. args : tuple, sequence, optional
  1499. Distribution parameters, used if `data1` or `data2` are strings.
  1500. alternative : str, as documented in stats.kstest
  1501. method : str, as documented in stats.kstest
  1502. Returns
  1503. -------
  1504. tuple of (K-S statistic, probability)
  1505. """
  1506. return scipy.stats._stats_py.kstest(data1, data2, args,
  1507. alternative=alternative, method=method)
  1508. def trima(a, limits=None, inclusive=(True,True)):
  1509. """
  1510. Trims an array by masking the data outside some given limits.
  1511. Returns a masked version of the input array.
  1512. Parameters
  1513. ----------
  1514. a : array_like
  1515. Input array.
  1516. limits : {None, tuple}, optional
  1517. Tuple of (lower limit, upper limit) in absolute values.
  1518. Values of the input array lower (greater) than the lower (upper) limit
  1519. will be masked. A limit is None indicates an open interval.
  1520. inclusive : (bool, bool) tuple, optional
  1521. Tuple of (lower flag, upper flag), indicating whether values exactly
  1522. equal to the lower (upper) limit are allowed.
  1523. Examples
  1524. --------
  1525. >>> from scipy.stats.mstats import trima
  1526. >>> import numpy as np
  1527. >>> a = np.arange(10)
  1528. The interval is left-closed and right-open, i.e., `[2, 8)`.
  1529. Trim the array by keeping only values in the interval.
  1530. >>> trima(a, limits=(2, 8), inclusive=(True, False))
  1531. masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
  1532. mask=[ True, True, False, False, False, False, False, False,
  1533. True, True],
  1534. fill_value=999999)
  1535. """
  1536. a = ma.asarray(a)
  1537. a.unshare_mask()
  1538. if (limits is None) or (limits == (None, None)):
  1539. return a
  1540. (lower_lim, upper_lim) = limits
  1541. (lower_in, upper_in) = inclusive
  1542. condition = False
  1543. if lower_lim is not None:
  1544. if lower_in:
  1545. condition |= (a < lower_lim)
  1546. else:
  1547. condition |= (a <= lower_lim)
  1548. if upper_lim is not None:
  1549. if upper_in:
  1550. condition |= (a > upper_lim)
  1551. else:
  1552. condition |= (a >= upper_lim)
  1553. a[condition.filled(True)] = masked
  1554. return a
  1555. def trimr(a, limits=None, inclusive=(True, True), axis=None):
  1556. """
  1557. Trims an array by masking some proportion of the data on each end.
  1558. Returns a masked version of the input array.
  1559. Parameters
  1560. ----------
  1561. a : sequence
  1562. Input array.
  1563. limits : {None, tuple}, optional
  1564. Tuple of the percentages to cut on each side of the array, with respect
  1565. to the number of unmasked data, as floats between 0. and 1.
  1566. Noting n the number of unmasked data before trimming, the
  1567. (n*limits[0])th smallest data and the (n*limits[1])th largest data are
  1568. masked, and the total number of unmasked data after trimming is
  1569. n*(1.-sum(limits)). The value of one limit can be set to None to
  1570. indicate an open interval.
  1571. inclusive : {(True,True) tuple}, optional
  1572. Tuple of flags indicating whether the number of data being masked on
  1573. the left (right) end should be truncated (True) or rounded (False) to
  1574. integers.
  1575. axis : {None,int}, optional
  1576. Axis along which to trim. If None, the whole array is trimmed, but its
  1577. shape is maintained.
  1578. """
  1579. def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
  1580. n = a.count()
  1581. idx = a.argsort()
  1582. if low_limit:
  1583. if low_inclusive:
  1584. lowidx = int(low_limit*n)
  1585. else:
  1586. lowidx = int(np.round(low_limit*n))
  1587. a[idx[:lowidx]] = masked
  1588. if up_limit is not None:
  1589. if up_inclusive:
  1590. upidx = n - int(n*up_limit)
  1591. else:
  1592. upidx = n - int(np.round(n*up_limit))
  1593. a[idx[upidx:]] = masked
  1594. return a
  1595. a = ma.asarray(a)
  1596. a.unshare_mask()
  1597. if limits is None:
  1598. return a
  1599. # Check the limits
  1600. (lolim, uplim) = limits
  1601. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  1602. if lolim is not None:
  1603. if lolim > 1. or lolim < 0:
  1604. raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
  1605. if uplim is not None:
  1606. if uplim > 1. or uplim < 0:
  1607. raise ValueError(errmsg % 'end' + f"(got {uplim})")
  1608. (loinc, upinc) = inclusive
  1609. if axis is None:
  1610. shp = a.shape
  1611. return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
  1612. else:
  1613. return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
  1614. trimdoc = _dedent_for_py313("""
  1615. Parameters
  1616. ----------
  1617. a : sequence
  1618. Input array
  1619. limits : {None, tuple}, optional
  1620. If `relative` is False, tuple (lower limit, upper limit) in absolute values.
  1621. Values of the input array lower (greater) than the lower (upper) limit are
  1622. masked.
  1623. If `relative` is True, tuple (lower percentage, upper percentage) to cut
  1624. on each side of the array, with respect to the number of unmasked data.
  1625. Noting n the number of unmasked data before trimming, the (n*limits[0])th
  1626. smallest data and the (n*limits[1])th largest data are masked, and the
  1627. total number of unmasked data after trimming is n*(1.-sum(limits))
  1628. In each case, the value of one limit can be set to None to indicate an
  1629. open interval.
  1630. If limits is None, no trimming is performed
  1631. inclusive : {(bool, bool) tuple}, optional
  1632. If `relative` is False, tuple indicating whether values exactly equal
  1633. to the absolute limits are allowed.
  1634. If `relative` is True, tuple indicating whether the number of data
  1635. being masked on each side should be rounded (True) or truncated
  1636. (False).
  1637. relative : bool, optional
  1638. Whether to consider the limits as absolute values (False) or proportions
  1639. to cut (True).
  1640. axis : int, optional
  1641. Axis along which to trim.""")
  1642. def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
  1643. """
  1644. Trims an array by masking the data outside some given limits.
  1645. Returns a masked version of the input array.
  1646. %s
  1647. Examples
  1648. --------
  1649. >>> from scipy.stats.mstats import trim
  1650. >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
  1651. >>> print(trim(z,(3,8)))
  1652. [-- -- 3 4 5 6 7 8 -- --]
  1653. >>> print(trim(z,(0.1,0.2),relative=True))
  1654. [-- 2 3 4 5 6 7 8 -- --]
  1655. """
  1656. if relative:
  1657. return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
  1658. else:
  1659. return trima(a, limits=limits, inclusive=inclusive)
  1660. if trim.__doc__:
  1661. trim.__doc__ = trim.__doc__ % trimdoc
  1662. def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
  1663. """
  1664. Trims the smallest and largest data values.
  1665. Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
  1666. ``int(proportiontocut * n)`` largest values of data along the given axis,
  1667. where n is the number of unmasked values before trimming.
  1668. Parameters
  1669. ----------
  1670. data : ndarray
  1671. Data to trim.
  1672. proportiontocut : float, optional
  1673. Percentage of trimming (as a float between 0 and 1).
  1674. If n is the number of unmasked values before trimming, the number of
  1675. values after trimming is ``(1 - 2*proportiontocut) * n``.
  1676. Default is 0.2.
  1677. inclusive : {(bool, bool) tuple}, optional
  1678. Tuple indicating whether the number of data being masked on each side
  1679. should be rounded (True) or truncated (False).
  1680. axis : int, optional
  1681. Axis along which to perform the trimming.
  1682. If None, the input array is first flattened.
  1683. """
  1684. return trimr(data, limits=(proportiontocut,proportiontocut),
  1685. inclusive=inclusive, axis=axis)
  1686. def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
  1687. axis=None):
  1688. """
  1689. Trims the data by masking values from one tail.
  1690. Parameters
  1691. ----------
  1692. data : array_like
  1693. Data to trim.
  1694. proportiontocut : float, optional
  1695. Percentage of trimming. If n is the number of unmasked values
  1696. before trimming, the number of values after trimming is
  1697. ``(1 - proportiontocut) * n``. Default is 0.2.
  1698. tail : {'left','right'}, optional
  1699. If 'left' the `proportiontocut` lowest values will be masked.
  1700. If 'right' the `proportiontocut` highest values will be masked.
  1701. Default is 'left'.
  1702. inclusive : {(bool, bool) tuple}, optional
  1703. Tuple indicating whether the number of data being masked on each side
  1704. should be rounded (True) or truncated (False). Default is
  1705. (True, True).
  1706. axis : int, optional
  1707. Axis along which to perform the trimming.
  1708. If None, the input array is first flattened. Default is None.
  1709. Returns
  1710. -------
  1711. trimtail : ndarray
  1712. Returned array of same shape as `data` with masked tail values.
  1713. """
  1714. tail = str(tail).lower()[0]
  1715. if tail == 'l':
  1716. limits = (proportiontocut,None)
  1717. elif tail == 'r':
  1718. limits = (None, proportiontocut)
  1719. else:
  1720. raise TypeError("The tail argument should be in ('left','right')")
  1721. return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
  1722. trim1 = trimtail
  1723. def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1724. axis=None):
  1725. """Returns the trimmed mean of the data along the given axis.
  1726. %s
  1727. """
  1728. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1729. limits = (limits, limits)
  1730. if relative:
  1731. return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
  1732. else:
  1733. return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
  1734. if trimmed_mean.__doc__:
  1735. trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc
  1736. def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1737. axis=None, ddof=0):
  1738. """Returns the trimmed variance of the data along the given axis.
  1739. %s
  1740. ddof : {0,integer}, optional
  1741. Means Delta Degrees of Freedom. The denominator used during computations
  1742. is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
  1743. biased estimate of the variance.
  1744. """
  1745. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1746. limits = (limits, limits)
  1747. if relative:
  1748. out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
  1749. else:
  1750. out = trima(a,limits=limits,inclusive=inclusive)
  1751. return out.var(axis=axis, ddof=ddof)
  1752. if trimmed_var.__doc__:
  1753. trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc
  1754. def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1755. axis=None, ddof=0):
  1756. """Returns the trimmed standard deviation of the data along the given axis.
  1757. %s
  1758. ddof : {0,integer}, optional
  1759. Means Delta Degrees of Freedom. The denominator used during computations
  1760. is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
  1761. biased estimate of the variance.
  1762. """
  1763. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1764. limits = (limits, limits)
  1765. if relative:
  1766. out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
  1767. else:
  1768. out = trima(a,limits=limits,inclusive=inclusive)
  1769. return out.std(axis=axis,ddof=ddof)
  1770. if trimmed_std.__doc__:
  1771. trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc
  1772. def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
  1773. """
  1774. Returns the standard error of the trimmed mean along the given axis.
  1775. Parameters
  1776. ----------
  1777. a : sequence
  1778. Input array
  1779. limits : {(0.1,0.1), tuple of float}, optional
  1780. tuple (lower percentage, upper percentage) to cut on each side of the
  1781. array, with respect to the number of unmasked data.
  1782. If n is the number of unmasked data before trimming, the values
  1783. smaller than ``n * limits[0]`` and the values larger than
  1784. ``n * `limits[1]`` are masked, and the total number of unmasked
  1785. data after trimming is ``n * (1.-sum(limits))``. In each case,
  1786. the value of one limit can be set to None to indicate an open interval.
  1787. If `limits` is None, no trimming is performed.
  1788. inclusive : {(bool, bool) tuple} optional
  1789. Tuple indicating whether the number of data being masked on each side
  1790. should be rounded (True) or truncated (False).
  1791. axis : int, optional
  1792. Axis along which to trim.
  1793. Returns
  1794. -------
  1795. trimmed_stde : scalar or ndarray
  1796. """
  1797. def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
  1798. "Returns the standard error of the trimmed mean for a 1D input data."
  1799. n = a.count()
  1800. idx = a.argsort()
  1801. if low_limit:
  1802. if low_inclusive:
  1803. lowidx = int(low_limit*n)
  1804. else:
  1805. lowidx = np.round(low_limit*n)
  1806. a[idx[:lowidx]] = masked
  1807. if up_limit is not None:
  1808. if up_inclusive:
  1809. upidx = n - int(n*up_limit)
  1810. else:
  1811. upidx = n - np.round(n*up_limit)
  1812. a[idx[upidx:]] = masked
  1813. a[idx[:lowidx]] = a[idx[lowidx]]
  1814. a[idx[upidx:]] = a[idx[upidx-1]]
  1815. winstd = a.std(ddof=1)
  1816. return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
  1817. a = ma.array(a, copy=True, subok=True)
  1818. a.unshare_mask()
  1819. if limits is None:
  1820. return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
  1821. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1822. limits = (limits, limits)
  1823. # Check the limits
  1824. (lolim, uplim) = limits
  1825. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  1826. if lolim is not None:
  1827. if lolim > 1. or lolim < 0:
  1828. raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
  1829. if uplim is not None:
  1830. if uplim > 1. or uplim < 0:
  1831. raise ValueError(errmsg % 'end' + f"(got {uplim})")
  1832. (loinc, upinc) = inclusive
  1833. if (axis is None):
  1834. return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
  1835. else:
  1836. if a.ndim > 2:
  1837. raise ValueError(f"Array 'a' must be at most two dimensional, "
  1838. f"but got a.ndim = {a.ndim}")
  1839. return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
  1840. lolim,uplim,loinc,upinc)
  1841. def _mask_to_limits(a, limits, inclusive):
  1842. """Mask an array for values outside of given limits.
  1843. This is primarily a utility function.
  1844. Parameters
  1845. ----------
  1846. a : array
  1847. limits : (float or None, float or None)
  1848. A tuple consisting of the (lower limit, upper limit). Values in the
  1849. input array less than the lower limit or greater than the upper limit
  1850. will be masked out. None implies no limit.
  1851. inclusive : (bool, bool)
  1852. A tuple consisting of the (lower flag, upper flag). These flags
  1853. determine whether values exactly equal to lower or upper are allowed.
  1854. Returns
  1855. -------
  1856. A MaskedArray.
  1857. Raises
  1858. ------
  1859. A ValueError if there are no values within the given limits.
  1860. """
  1861. lower_limit, upper_limit = limits
  1862. lower_include, upper_include = inclusive
  1863. am = ma.MaskedArray(a)
  1864. if lower_limit is not None:
  1865. if lower_include:
  1866. am = ma.masked_less(am, lower_limit)
  1867. else:
  1868. am = ma.masked_less_equal(am, lower_limit)
  1869. if upper_limit is not None:
  1870. if upper_include:
  1871. am = ma.masked_greater(am, upper_limit)
  1872. else:
  1873. am = ma.masked_greater_equal(am, upper_limit)
  1874. if am.count() == 0:
  1875. raise ValueError("No array values within given limits")
  1876. return am
  1877. def tmean(a, limits=None, inclusive=(True, True), axis=None):
  1878. """
  1879. Compute the trimmed mean.
  1880. Parameters
  1881. ----------
  1882. a : array_like
  1883. Array of values.
  1884. limits : None or (lower limit, upper limit), optional
  1885. Values in the input array less than the lower limit or greater than the
  1886. upper limit will be ignored. When limits is None (default), then all
  1887. values are used. Either of the limit values in the tuple can also be
  1888. None representing a half-open interval.
  1889. inclusive : (bool, bool), optional
  1890. A tuple consisting of the (lower flag, upper flag). These flags
  1891. determine whether values exactly equal to the lower or upper limits
  1892. are included. The default value is (True, True).
  1893. axis : int or None, optional
  1894. Axis along which to operate. If None, compute over the
  1895. whole array. Default is None.
  1896. Returns
  1897. -------
  1898. tmean : float
  1899. Notes
  1900. -----
  1901. For more details on `tmean`, see `scipy.stats.tmean`.
  1902. Examples
  1903. --------
  1904. >>> import numpy as np
  1905. >>> from scipy.stats import mstats
  1906. >>> a = np.array([[6, 8, 3, 0],
  1907. ... [3, 9, 1, 2],
  1908. ... [8, 7, 8, 2],
  1909. ... [5, 6, 0, 2],
  1910. ... [4, 5, 5, 2]])
  1911. ...
  1912. ...
  1913. >>> mstats.tmean(a, (2,5))
  1914. 3.3
  1915. >>> mstats.tmean(a, (2,5), axis=0)
  1916. masked_array(data=[4.0, 5.0, 4.0, 2.0],
  1917. mask=[False, False, False, False],
  1918. fill_value=1e+20)
  1919. """
  1920. return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
  1921. def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
  1922. """
  1923. Compute the trimmed variance
  1924. This function computes the sample variance of an array of values,
  1925. while ignoring values which are outside of given `limits`.
  1926. Parameters
  1927. ----------
  1928. a : array_like
  1929. Array of values.
  1930. limits : None or (lower limit, upper limit), optional
  1931. Values in the input array less than the lower limit or greater than the
  1932. upper limit will be ignored. When limits is None, then all values are
  1933. used. Either of the limit values in the tuple can also be None
  1934. representing a half-open interval. The default value is None.
  1935. inclusive : (bool, bool), optional
  1936. A tuple consisting of the (lower flag, upper flag). These flags
  1937. determine whether values exactly equal to the lower or upper limits
  1938. are included. The default value is (True, True).
  1939. axis : int or None, optional
  1940. Axis along which to operate. If None, compute over the
  1941. whole array. Default is zero.
  1942. ddof : int, optional
  1943. Delta degrees of freedom. Default is 1.
  1944. Returns
  1945. -------
  1946. tvar : float
  1947. Trimmed variance.
  1948. Notes
  1949. -----
  1950. For more details on `tvar`, see `scipy.stats.tvar`.
  1951. """
  1952. a = a.astype(float).ravel()
  1953. if limits is None:
  1954. n = (~a.mask).sum() # todo: better way to do that?
  1955. return np.ma.var(a) * n/(n-1.)
  1956. am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
  1957. return np.ma.var(am, axis=axis, ddof=ddof)
  1958. def tmin(a, lowerlimit=None, axis=0, inclusive=True):
  1959. """
  1960. Compute the trimmed minimum
  1961. Parameters
  1962. ----------
  1963. a : array_like
  1964. array of values
  1965. lowerlimit : None or float, optional
  1966. Values in the input array less than the given limit will be ignored.
  1967. When lowerlimit is None, then all values are used. The default value
  1968. is None.
  1969. axis : int or None, optional
  1970. Axis along which to operate. Default is 0. If None, compute over the
  1971. whole array `a`.
  1972. inclusive : {True, False}, optional
  1973. This flag determines whether values exactly equal to the lower limit
  1974. are included. The default value is True.
  1975. Returns
  1976. -------
  1977. tmin : float, int or ndarray
  1978. Notes
  1979. -----
  1980. For more details on `tmin`, see `scipy.stats.tmin`.
  1981. Examples
  1982. --------
  1983. >>> import numpy as np
  1984. >>> from scipy.stats import mstats
  1985. >>> a = np.array([[6, 8, 3, 0],
  1986. ... [3, 2, 1, 2],
  1987. ... [8, 1, 8, 2],
  1988. ... [5, 3, 0, 2],
  1989. ... [4, 7, 5, 2]])
  1990. ...
  1991. >>> mstats.tmin(a, 5)
  1992. masked_array(data=[5, 7, 5, --],
  1993. mask=[False, False, False, True],
  1994. fill_value=999999)
  1995. """
  1996. a, axis = _chk_asarray(a, axis)
  1997. am = trima(a, (lowerlimit, None), (inclusive, False))
  1998. return ma.minimum.reduce(am, axis)
  1999. def tmax(a, upperlimit=None, axis=0, inclusive=True):
  2000. """
  2001. Compute the trimmed maximum
  2002. This function computes the maximum value of an array along a given axis,
  2003. while ignoring values larger than a specified upper limit.
  2004. Parameters
  2005. ----------
  2006. a : array_like
  2007. array of values
  2008. upperlimit : None or float, optional
  2009. Values in the input array greater than the given limit will be ignored.
  2010. When upperlimit is None, then all values are used. The default value
  2011. is None.
  2012. axis : int or None, optional
  2013. Axis along which to operate. Default is 0. If None, compute over the
  2014. whole array `a`.
  2015. inclusive : {True, False}, optional
  2016. This flag determines whether values exactly equal to the upper limit
  2017. are included. The default value is True.
  2018. Returns
  2019. -------
  2020. tmax : float, int or ndarray
  2021. Notes
  2022. -----
  2023. For more details on `tmax`, see `scipy.stats.tmax`.
  2024. Examples
  2025. --------
  2026. >>> import numpy as np
  2027. >>> from scipy.stats import mstats
  2028. >>> a = np.array([[6, 8, 3, 0],
  2029. ... [3, 9, 1, 2],
  2030. ... [8, 7, 8, 2],
  2031. ... [5, 6, 0, 2],
  2032. ... [4, 5, 5, 2]])
  2033. ...
  2034. ...
  2035. >>> mstats.tmax(a, 4)
  2036. masked_array(data=[4, --, 3, 2],
  2037. mask=[False, True, False, False],
  2038. fill_value=999999)
  2039. """
  2040. a, axis = _chk_asarray(a, axis)
  2041. am = trima(a, (None, upperlimit), (False, inclusive))
  2042. return ma.maximum.reduce(am, axis)
  2043. def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
  2044. """
  2045. Compute the trimmed standard error of the mean.
  2046. This function finds the standard error of the mean for given
  2047. values, ignoring values outside the given `limits`.
  2048. Parameters
  2049. ----------
  2050. a : array_like
  2051. array of values
  2052. limits : None or (lower limit, upper limit), optional
  2053. Values in the input array less than the lower limit or greater than the
  2054. upper limit will be ignored. When limits is None, then all values are
  2055. used. Either of the limit values in the tuple can also be None
  2056. representing a half-open interval. The default value is None.
  2057. inclusive : (bool, bool), optional
  2058. A tuple consisting of the (lower flag, upper flag). These flags
  2059. determine whether values exactly equal to the lower or upper limits
  2060. are included. The default value is (True, True).
  2061. axis : int or None, optional
  2062. Axis along which to operate. If None, compute over the
  2063. whole array. Default is zero.
  2064. ddof : int, optional
  2065. Delta degrees of freedom. Default is 1.
  2066. Returns
  2067. -------
  2068. tsem : float
  2069. Notes
  2070. -----
  2071. For more details on `tsem`, see `scipy.stats.tsem`.
  2072. """
  2073. a = ma.asarray(a).ravel()
  2074. if limits is None:
  2075. n = float(a.count())
  2076. return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
  2077. am = trima(a.ravel(), limits, inclusive)
  2078. sd = np.sqrt(am.var(axis=axis, ddof=ddof))
  2079. return sd / np.sqrt(am.count())
  2080. def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
  2081. axis=None, nan_policy='propagate'):
  2082. """Returns a Winsorized version of the input array.
  2083. The (limits[0])th lowest values are set to the (limits[0])th percentile,
  2084. and the (limits[1])th highest values are set to the (1 - limits[1])th
  2085. percentile.
  2086. Masked values are skipped.
  2087. Parameters
  2088. ----------
  2089. a : sequence
  2090. Input array.
  2091. limits : {None, tuple of float}, optional
  2092. Tuple of the percentages to cut on each side of the array, with respect
  2093. to the number of unmasked data, as floats between 0. and 1.
  2094. Noting n the number of unmasked data before trimming, the
  2095. (n*limits[0])th smallest data and the (n*limits[1])th largest data are
  2096. masked, and the total number of unmasked data after trimming
  2097. is n*(1.-sum(limits)) The value of one limit can be set to None to
  2098. indicate an open interval.
  2099. inclusive : {(True, True) tuple}, optional
  2100. Tuple indicating whether the number of data being masked on each side
  2101. should be truncated (True) or rounded (False).
  2102. inplace : {False, True}, optional
  2103. Whether to winsorize in place (True) or to use a copy (False)
  2104. axis : {None, int}, optional
  2105. Axis along which to trim. If None, the whole array is trimmed, but its
  2106. shape is maintained.
  2107. nan_policy : {'propagate', 'raise', 'omit'}, optional
  2108. Defines how to handle when input contains nan.
  2109. The following options are available (default is 'propagate'):
  2110. * 'propagate': allows nan values and may overwrite or propagate them
  2111. * 'raise': throws an error
  2112. * 'omit': performs the calculations ignoring nan values
  2113. Notes
  2114. -----
  2115. This function is applied to reduce the effect of possibly spurious outliers
  2116. by limiting the extreme values.
  2117. Examples
  2118. --------
  2119. >>> import numpy as np
  2120. >>> from scipy.stats.mstats import winsorize
  2121. A shuffled array contains integers from 1 to 10.
  2122. >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
  2123. The 10% of the lowest value (i.e., ``1``) and the 20% of the highest
  2124. values (i.e., ``9`` and ``10``) are replaced.
  2125. >>> winsorize(a, limits=[0.1, 0.2])
  2126. masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6],
  2127. mask=False,
  2128. fill_value=999999)
  2129. """
  2130. def _winsorize1D(a, low_limit, up_limit, low_include, up_include,
  2131. contains_nan, nan_policy):
  2132. n = a.count()
  2133. idx = a.argsort()
  2134. if contains_nan:
  2135. nan_count = np.count_nonzero(np.isnan(a))
  2136. if low_limit:
  2137. if low_include:
  2138. lowidx = int(low_limit * n)
  2139. else:
  2140. lowidx = np.round(low_limit * n).astype(int)
  2141. if contains_nan and nan_policy == 'omit':
  2142. lowidx = min(lowidx, n-nan_count-1)
  2143. a[idx[:lowidx]] = a[idx[lowidx]]
  2144. if up_limit is not None:
  2145. if up_include:
  2146. upidx = n - int(n * up_limit)
  2147. else:
  2148. upidx = n - np.round(n * up_limit).astype(int)
  2149. if contains_nan and nan_policy == 'omit':
  2150. a[idx[upidx:-nan_count]] = a[idx[upidx - 1]]
  2151. else:
  2152. a[idx[upidx:]] = a[idx[upidx - 1]]
  2153. return a
  2154. contains_nan = _contains_nan(a, nan_policy)
  2155. # We are going to modify a: better make a copy
  2156. a = ma.array(a, copy=np.logical_not(inplace))
  2157. if limits is None:
  2158. return a
  2159. if (not isinstance(limits, tuple)) and isinstance(limits, float):
  2160. limits = (limits, limits)
  2161. # Check the limits
  2162. (lolim, uplim) = limits
  2163. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  2164. if lolim is not None:
  2165. if lolim > 1. or lolim < 0:
  2166. raise ValueError(errmsg % 'beginning' + f"(got {lolim})")
  2167. if uplim is not None:
  2168. if uplim > 1. or uplim < 0:
  2169. raise ValueError(errmsg % 'end' + f"(got {uplim})")
  2170. (loinc, upinc) = inclusive
  2171. if axis is None:
  2172. shp = a.shape
  2173. return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc,
  2174. contains_nan, nan_policy).reshape(shp)
  2175. else:
  2176. return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
  2177. upinc, contains_nan, nan_policy)
  2178. def moment(a, moment=1, axis=0):
  2179. """
  2180. Calculates the nth moment about the mean for a sample.
  2181. Parameters
  2182. ----------
  2183. a : array_like
  2184. data
  2185. moment : int, optional
  2186. order of central moment that is returned
  2187. axis : int or None, optional
  2188. Axis along which the central moment is computed. Default is 0.
  2189. If None, compute over the whole array `a`.
  2190. Returns
  2191. -------
  2192. n-th central moment : ndarray or float
  2193. The appropriate moment along the given axis or over all values if axis
  2194. is None. The denominator for the moment calculation is the number of
  2195. observations, no degrees of freedom correction is done.
  2196. Notes
  2197. -----
  2198. For more details about `moment`, see `scipy.stats.moment`.
  2199. """
  2200. a, axis = _chk_asarray(a, axis)
  2201. if a.size == 0:
  2202. moment_shape = list(a.shape)
  2203. del moment_shape[axis]
  2204. dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
  2205. # empty array, return nan(s) with shape matching `moment`
  2206. out_shape = (moment_shape if np.isscalar(moment)
  2207. else [len(moment)] + moment_shape)
  2208. if len(out_shape) == 0:
  2209. return dtype(np.nan)
  2210. else:
  2211. return ma.array(np.full(out_shape, np.nan, dtype=dtype))
  2212. # for array_like moment input, return a value for each.
  2213. if not np.isscalar(moment):
  2214. mean = a.mean(axis, keepdims=True)
  2215. mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
  2216. return ma.array(mmnt)
  2217. else:
  2218. return _moment(a, moment, axis)
  2219. # Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
  2220. def _moment(a, moment, axis, *, mean=None):
  2221. if np.abs(moment - np.round(moment)) > 0:
  2222. raise ValueError("All moment parameters must be integers")
  2223. if moment == 0 or moment == 1:
  2224. # By definition the zeroth moment about the mean is 1, and the first
  2225. # moment is 0.
  2226. shape = list(a.shape)
  2227. del shape[axis]
  2228. dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
  2229. if len(shape) == 0:
  2230. return dtype(1.0 if moment == 0 else 0.0)
  2231. else:
  2232. return (ma.ones(shape, dtype=dtype) if moment == 0
  2233. else ma.zeros(shape, dtype=dtype))
  2234. else:
  2235. # Exponentiation by squares: form exponent sequence
  2236. n_list = [moment]
  2237. current_n = moment
  2238. while current_n > 2:
  2239. if current_n % 2:
  2240. current_n = (current_n-1)/2
  2241. else:
  2242. current_n /= 2
  2243. n_list.append(current_n)
  2244. # Starting point for exponentiation by squares
  2245. mean = a.mean(axis, keepdims=True) if mean is None else mean
  2246. a_zero_mean = a - mean
  2247. if n_list[-1] == 1:
  2248. s = a_zero_mean.copy()
  2249. else:
  2250. s = a_zero_mean**2
  2251. # Perform multiplications
  2252. for n in n_list[-2::-1]:
  2253. s = s**2
  2254. if n % 2:
  2255. s *= a_zero_mean
  2256. return s.mean(axis)
  2257. def variation(a, axis=0, ddof=0):
  2258. """
  2259. Compute the coefficient of variation.
  2260. The coefficient of variation is the standard deviation divided by the
  2261. mean. This function is equivalent to::
  2262. np.std(x, axis=axis, ddof=ddof) / np.mean(x)
  2263. The default for ``ddof`` is 0, but many definitions of the coefficient
  2264. of variation use the square root of the unbiased sample variance
  2265. for the sample standard deviation, which corresponds to ``ddof=1``.
  2266. Parameters
  2267. ----------
  2268. a : array_like
  2269. Input array.
  2270. axis : int or None, optional
  2271. Axis along which to calculate the coefficient of variation. Default
  2272. is 0. If None, compute over the whole array `a`.
  2273. ddof : int, optional
  2274. Delta degrees of freedom. Default is 0.
  2275. Returns
  2276. -------
  2277. variation : ndarray
  2278. The calculated variation along the requested axis.
  2279. Notes
  2280. -----
  2281. For more details about `variation`, see `scipy.stats.variation`.
  2282. Examples
  2283. --------
  2284. >>> import numpy as np
  2285. >>> from scipy.stats.mstats import variation
  2286. >>> a = np.array([2,8,4])
  2287. >>> variation(a)
  2288. 0.5345224838248487
  2289. >>> b = np.array([2,8,3,4])
  2290. >>> c = np.ma.masked_array(b, mask=[0,0,1,0])
  2291. >>> variation(c)
  2292. 0.5345224838248487
  2293. In the example above, it can be seen that this works the same as
  2294. `scipy.stats.variation` except 'stats.mstats.variation' ignores masked
  2295. array elements.
  2296. """
  2297. a, axis = _chk_asarray(a, axis)
  2298. return a.std(axis, ddof=ddof)/a.mean(axis)
  2299. def skew(a, axis=0, bias=True):
  2300. """
  2301. Computes the skewness of a data set.
  2302. Parameters
  2303. ----------
  2304. a : ndarray
  2305. data
  2306. axis : int or None, optional
  2307. Axis along which skewness is calculated. Default is 0.
  2308. If None, compute over the whole array `a`.
  2309. bias : bool, optional
  2310. If False, then the calculations are corrected for statistical bias.
  2311. Returns
  2312. -------
  2313. skewness : ndarray
  2314. The skewness of values along an axis, returning 0 where all values are
  2315. equal.
  2316. Notes
  2317. -----
  2318. For more details about `skew`, see `scipy.stats.skew`.
  2319. """
  2320. a, axis = _chk_asarray(a,axis)
  2321. mean = a.mean(axis, keepdims=True)
  2322. m2 = _moment(a, 2, axis, mean=mean)
  2323. m3 = _moment(a, 3, axis, mean=mean)
  2324. zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
  2325. with np.errstate(all='ignore'):
  2326. vals = ma.where(zero, 0, m3 / m2**1.5)
  2327. if not bias and zero is not ma.masked and m2 is not ma.masked:
  2328. n = a.count(axis)
  2329. can_correct = ~zero & (n > 2)
  2330. if can_correct.any():
  2331. n = np.extract(can_correct, n)
  2332. m2 = np.extract(can_correct, m2)
  2333. m3 = np.extract(can_correct, m3)
  2334. nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
  2335. np.place(vals, can_correct, nval)
  2336. return vals
  2337. def kurtosis(a, axis=0, fisher=True, bias=True):
  2338. """
  2339. Computes the kurtosis (Fisher or Pearson) of a dataset.
  2340. Kurtosis is the fourth central moment divided by the square of the
  2341. variance. If Fisher's definition is used, then 3.0 is subtracted from
  2342. the result to give 0.0 for a normal distribution.
  2343. If bias is False then the kurtosis is calculated using k statistics to
  2344. eliminate bias coming from biased moment estimators
  2345. Use `kurtosistest` to see if result is close enough to normal.
  2346. Parameters
  2347. ----------
  2348. a : array
  2349. data for which the kurtosis is calculated
  2350. axis : int or None, optional
  2351. Axis along which the kurtosis is calculated. Default is 0.
  2352. If None, compute over the whole array `a`.
  2353. fisher : bool, optional
  2354. If True, Fisher's definition is used (normal ==> 0.0). If False,
  2355. Pearson's definition is used (normal ==> 3.0).
  2356. bias : bool, optional
  2357. If False, then the calculations are corrected for statistical bias.
  2358. Returns
  2359. -------
  2360. kurtosis : array
  2361. The kurtosis of values along an axis. If all values are equal,
  2362. return -3 for Fisher's definition and 0 for Pearson's definition.
  2363. Notes
  2364. -----
  2365. For more details about `kurtosis`, see `scipy.stats.kurtosis`.
  2366. """
  2367. a, axis = _chk_asarray(a, axis)
  2368. mean = a.mean(axis, keepdims=True)
  2369. m2 = _moment(a, 2, axis, mean=mean)
  2370. m4 = _moment(a, 4, axis, mean=mean)
  2371. zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
  2372. with np.errstate(all='ignore'):
  2373. vals = ma.where(zero, 0, m4 / m2**2.0)
  2374. if not bias and zero is not ma.masked and m2 is not ma.masked:
  2375. n = a.count(axis)
  2376. can_correct = ~zero & (n > 3)
  2377. if can_correct.any():
  2378. n = np.extract(can_correct, n)
  2379. m2 = np.extract(can_correct, m2)
  2380. m4 = np.extract(can_correct, m4)
  2381. nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
  2382. np.place(vals, can_correct, nval+3.0)
  2383. if fisher:
  2384. return vals - 3
  2385. else:
  2386. return vals
  2387. DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
  2388. 'variance', 'skewness',
  2389. 'kurtosis'))
  2390. def describe(a, axis=0, ddof=0, bias=True):
  2391. """
  2392. Computes several descriptive statistics of the passed array.
  2393. Parameters
  2394. ----------
  2395. a : array_like
  2396. Data array
  2397. axis : int or None, optional
  2398. Axis along which to calculate statistics. Default 0. If None,
  2399. compute over the whole array `a`.
  2400. ddof : int, optional
  2401. degree of freedom (default 0); note that default ddof is different
  2402. from the same routine in stats.describe
  2403. bias : bool, optional
  2404. If False, then the skewness and kurtosis calculations are corrected for
  2405. statistical bias.
  2406. Returns
  2407. -------
  2408. nobs : int
  2409. (size of the data (discarding missing values)
  2410. minmax : (int, int)
  2411. min, max
  2412. mean : float
  2413. arithmetic mean
  2414. variance : float
  2415. unbiased variance
  2416. skewness : float
  2417. biased skewness
  2418. kurtosis : float
  2419. biased kurtosis
  2420. Examples
  2421. --------
  2422. >>> import numpy as np
  2423. >>> from scipy.stats.mstats import describe
  2424. >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
  2425. >>> describe(ma)
  2426. DescribeResult(nobs=np.int64(3), minmax=(masked_array(data=0,
  2427. mask=False,
  2428. fill_value=999999), masked_array(data=2,
  2429. mask=False,
  2430. fill_value=999999)), mean=np.float64(1.0),
  2431. variance=np.float64(0.6666666666666666),
  2432. skewness=masked_array(data=0., mask=False, fill_value=1e+20),
  2433. kurtosis=np.float64(-1.5))
  2434. """
  2435. a, axis = _chk_asarray(a, axis)
  2436. n = a.count(axis)
  2437. mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis))
  2438. m = a.mean(axis)
  2439. v = a.var(axis, ddof=ddof)
  2440. sk = skew(a, axis, bias=bias)
  2441. kurt = kurtosis(a, axis, bias=bias)
  2442. return DescribeResult(n, mm, m, v, sk, kurt)
  2443. def stde_median(data, axis=None):
  2444. """Returns the McKean-Schrader estimate of the standard error of the sample
  2445. median along the given axis. masked values are discarded.
  2446. Parameters
  2447. ----------
  2448. data : ndarray
  2449. Data to trim.
  2450. axis : {None,int}, optional
  2451. Axis along which to perform the trimming.
  2452. If None, the input array is first flattened.
  2453. """
  2454. def _stdemed_1D(data):
  2455. data = np.sort(data.compressed())
  2456. n = len(data)
  2457. z = 2.5758293035489004
  2458. k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
  2459. return ((data[n-k] - data[k-1])/(2.*z))
  2460. data = ma.array(data, copy=False, subok=True)
  2461. if (axis is None):
  2462. return _stdemed_1D(data)
  2463. else:
  2464. if data.ndim > 2:
  2465. raise ValueError(f"Array 'data' must be at most two dimensional, "
  2466. f"but got data.ndim = {data.ndim}")
  2467. return ma.apply_along_axis(_stdemed_1D, axis, data)
  2468. SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
  2469. def skewtest(a, axis=0, alternative='two-sided'):
  2470. """
  2471. Tests whether the skew is different from the normal distribution.
  2472. Parameters
  2473. ----------
  2474. a : array_like
  2475. The data to be tested
  2476. axis : int or None, optional
  2477. Axis along which statistics are calculated. Default is 0.
  2478. If None, compute over the whole array `a`.
  2479. alternative : {'two-sided', 'less', 'greater'}, optional
  2480. Defines the alternative hypothesis. Default is 'two-sided'.
  2481. The following options are available:
  2482. * 'two-sided': the skewness of the distribution underlying the sample
  2483. is different from that of the normal distribution (i.e. 0)
  2484. * 'less': the skewness of the distribution underlying the sample
  2485. is less than that of the normal distribution
  2486. * 'greater': the skewness of the distribution underlying the sample
  2487. is greater than that of the normal distribution
  2488. .. versionadded:: 1.7.0
  2489. Returns
  2490. -------
  2491. statistic : array_like
  2492. The computed z-score for this test.
  2493. pvalue : array_like
  2494. A p-value for the hypothesis test
  2495. Notes
  2496. -----
  2497. For more details about `skewtest`, see `scipy.stats.skewtest`.
  2498. """
  2499. a, axis = _chk_asarray(a, axis)
  2500. if axis is None:
  2501. a = a.ravel()
  2502. axis = 0
  2503. b2 = skew(a,axis)
  2504. n = a.count(axis)
  2505. if np.min(n) < 8:
  2506. raise ValueError(f"skewtest is not valid with less than 8 samples; "
  2507. f"{np.min(n)} samples were given.")
  2508. y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
  2509. beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
  2510. W2 = -1 + ma.sqrt(2*(beta2-1))
  2511. delta = 1/ma.sqrt(0.5*ma.log(W2))
  2512. alpha = ma.sqrt(2.0/(W2-1))
  2513. y = ma.where(y == 0, 1, y)
  2514. Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
  2515. pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
  2516. return SkewtestResult(Z[()], pvalue[()])
  2517. KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
  2518. def kurtosistest(a, axis=0, alternative='two-sided'):
  2519. """
  2520. Tests whether a dataset has normal kurtosis
  2521. Parameters
  2522. ----------
  2523. a : array_like
  2524. array of the sample data
  2525. axis : int or None, optional
  2526. Axis along which to compute test. Default is 0. If None,
  2527. compute over the whole array `a`.
  2528. alternative : {'two-sided', 'less', 'greater'}, optional
  2529. Defines the alternative hypothesis.
  2530. The following options are available (default is 'two-sided'):
  2531. * 'two-sided': the kurtosis of the distribution underlying the sample
  2532. is different from that of the normal distribution
  2533. * 'less': the kurtosis of the distribution underlying the sample
  2534. is less than that of the normal distribution
  2535. * 'greater': the kurtosis of the distribution underlying the sample
  2536. is greater than that of the normal distribution
  2537. .. versionadded:: 1.7.0
  2538. Returns
  2539. -------
  2540. statistic : array_like
  2541. The computed z-score for this test.
  2542. pvalue : array_like
  2543. The p-value for the hypothesis test
  2544. Notes
  2545. -----
  2546. For more details about `kurtosistest`, see `scipy.stats.kurtosistest`.
  2547. """
  2548. a, axis = _chk_asarray(a, axis)
  2549. n = a.count(axis=axis)
  2550. if np.min(n) < 5:
  2551. raise ValueError(f"kurtosistest requires at least 5 observations; "
  2552. f"{np.min(n)} observations were given.")
  2553. if np.min(n) < 20:
  2554. warnings.warn(f"kurtosistest only valid for n>=20 ... continuing "
  2555. f"anyway, n={np.min(n)}", stacklevel=2)
  2556. b2 = kurtosis(a, axis, fisher=False)
  2557. E = 3.0*(n-1) / (n+1)
  2558. varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
  2559. x = (b2-E)/ma.sqrt(varb2)
  2560. sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
  2561. (n*(n-2)*(n-3)))
  2562. A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
  2563. term1 = 1 - 2./(9.0*A)
  2564. denom = 1 + x*ma.sqrt(2/(A-4.0))
  2565. if np.ma.isMaskedArray(denom):
  2566. # For multi-dimensional array input
  2567. denom[denom == 0.0] = masked
  2568. elif denom == 0.0:
  2569. denom = masked
  2570. term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
  2571. -ma.power(-(1-2.0/A)/denom, 1/3.0))
  2572. Z = (term1 - term2) / np.sqrt(2/(9.0*A))
  2573. pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
  2574. return KurtosistestResult(Z[()], pvalue[()])
  2575. NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
  2576. def normaltest(a, axis=0):
  2577. """
  2578. Tests whether a sample differs from a normal distribution.
  2579. Parameters
  2580. ----------
  2581. a : array_like
  2582. The array containing the data to be tested.
  2583. axis : int or None, optional
  2584. Axis along which to compute test. Default is 0. If None,
  2585. compute over the whole array `a`.
  2586. Returns
  2587. -------
  2588. statistic : float or array
  2589. ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
  2590. ``k`` is the z-score returned by `kurtosistest`.
  2591. pvalue : float or array
  2592. A 2-sided chi squared probability for the hypothesis test.
  2593. Notes
  2594. -----
  2595. For more details about `normaltest`, see `scipy.stats.normaltest`.
  2596. """
  2597. a, axis = _chk_asarray(a, axis)
  2598. s, _ = skewtest(a, axis)
  2599. k, _ = kurtosistest(a, axis)
  2600. k2 = s*s + k*k
  2601. return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
  2602. def mquantiles(a, prob=(.25, .5, .75), alphap=.4, betap=.4, axis=None,
  2603. limit=()):
  2604. """
  2605. Computes empirical quantiles for a data array.
  2606. Samples quantile are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
  2607. where ``x[j]`` is the j-th order statistic, and gamma is a function of
  2608. ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
  2609. ``g = n*p + m - j``.
  2610. Reinterpreting the above equations to compare to **R** lead to the
  2611. equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``
  2612. Typical values of (alphap,betap) are:
  2613. - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf
  2614. (**R** type 4)
  2615. - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
  2616. (**R** type 5)
  2617. - (0,0) : ``p(k) = k/(n+1)`` :
  2618. (**R** type 6)
  2619. - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
  2620. (**R** type 7, **R** default)
  2621. - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
  2622. The resulting quantile estimates are approximately median-unbiased
  2623. regardless of the distribution of x.
  2624. (**R** type 8)
  2625. - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
  2626. The resulting quantile estimates are approximately unbiased
  2627. if x is normally distributed
  2628. (**R** type 9)
  2629. - (.4,.4) : approximately quantile unbiased (Cunnane)
  2630. - (.35,.35): APL, used with PWM
  2631. Parameters
  2632. ----------
  2633. a : array_like
  2634. Input data, as a sequence or array of dimension at most 2.
  2635. prob : array_like, optional
  2636. List of quantiles to compute.
  2637. alphap : float, optional
  2638. Plotting positions parameter, default is 0.4.
  2639. betap : float, optional
  2640. Plotting positions parameter, default is 0.4.
  2641. axis : int, optional
  2642. Axis along which to perform the trimming.
  2643. If None (default), the input array is first flattened.
  2644. limit : tuple, optional
  2645. Tuple of (lower, upper) values.
  2646. Values of `a` outside this open interval are ignored.
  2647. Returns
  2648. -------
  2649. mquantiles : MaskedArray
  2650. An array containing the calculated quantiles.
  2651. Notes
  2652. -----
  2653. This formulation is very similar to **R** except the calculation of
  2654. ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
  2655. with each type.
  2656. References
  2657. ----------
  2658. .. [1] *R* statistical software: https://www.r-project.org/
  2659. .. [2] *R* ``quantile`` function:
  2660. http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
  2661. Examples
  2662. --------
  2663. >>> import numpy as np
  2664. >>> from scipy.stats.mstats import mquantiles
  2665. >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
  2666. >>> mquantiles(a)
  2667. array([ 19.2, 40. , 42.8])
  2668. Using a 2D array, specifying axis and limit.
  2669. >>> data = np.array([[ 6., 7., 1.],
  2670. ... [ 47., 15., 2.],
  2671. ... [ 49., 36., 3.],
  2672. ... [ 15., 39., 4.],
  2673. ... [ 42., 40., -999.],
  2674. ... [ 41., 41., -999.],
  2675. ... [ 7., -999., -999.],
  2676. ... [ 39., -999., -999.],
  2677. ... [ 43., -999., -999.],
  2678. ... [ 40., -999., -999.],
  2679. ... [ 36., -999., -999.]])
  2680. >>> print(mquantiles(data, axis=0, limit=(0, 50)))
  2681. [[19.2 14.6 1.45]
  2682. [40. 37.5 2.5 ]
  2683. [42.8 40.05 3.55]]
  2684. >>> data[:, 2] = -999.
  2685. >>> print(mquantiles(data, axis=0, limit=(0, 50)))
  2686. [[19.200000000000003 14.6 --]
  2687. [40.0 37.5 --]
  2688. [42.800000000000004 40.05 --]]
  2689. """
  2690. def _quantiles1D(data,m,p):
  2691. x = np.sort(data.compressed())
  2692. n = len(x)
  2693. if n == 0:
  2694. return ma.array(np.empty(len(p), dtype=float), mask=True)
  2695. elif n == 1:
  2696. return ma.array(np.resize(x, p.shape), mask=nomask)
  2697. aleph = (n*p + m)
  2698. k = np.floor(aleph.clip(1, n-1)).astype(int)
  2699. gamma = (aleph-k).clip(0,1)
  2700. return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
  2701. data = ma.array(a, copy=False)
  2702. if data.ndim > 2:
  2703. raise TypeError("Array should be 2D at most !")
  2704. if limit:
  2705. condition = (limit[0] < data) & (data < limit[1])
  2706. data[~condition.filled(True)] = masked
  2707. p = np.atleast_1d(np.asarray(prob))
  2708. m = alphap + p*(1.-alphap-betap)
  2709. # Computes quantiles along axis (or globally)
  2710. if (axis is None):
  2711. return _quantiles1D(data, m, p)
  2712. return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
  2713. def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
  2714. """Calculate the score at the given 'per' percentile of the
  2715. sequence a. For example, the score at per=50 is the median.
  2716. This function is a shortcut to mquantile
  2717. """
  2718. if (per < 0) or (per > 100.):
  2719. raise ValueError(f"The percentile should be between 0. and 100. ! (got {per})")
  2720. return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
  2721. limit=limit, axis=0).squeeze()
  2722. def plotting_positions(data, alpha=0.4, beta=0.4):
  2723. """
  2724. Returns plotting positions (or empirical percentile points) for the data.
  2725. Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
  2726. - i is the rank order statistics
  2727. - n is the number of unmasked values along the given axis
  2728. - `alpha` and `beta` are two parameters.
  2729. Typical values for `alpha` and `beta` are:
  2730. - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
  2731. - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function
  2732. (R, type 5)
  2733. - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6)
  2734. - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case,
  2735. ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
  2736. - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
  2737. ``p(k) ~ median[F(x[k])]``.
  2738. The resulting quantile estimates are approximately median-unbiased
  2739. regardless of the distribution of x. (R type 8)
  2740. - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
  2741. The resulting quantile estimates are approximately unbiased
  2742. if x is normally distributed (R type 9)
  2743. - (.4,.4) : approximately quantile unbiased (Cunnane)
  2744. - (.35,.35): APL, used with PWM
  2745. - (.3175, .3175): used in scipy.stats.probplot
  2746. Parameters
  2747. ----------
  2748. data : array_like
  2749. Input data, as a sequence or array of dimension at most 2.
  2750. alpha : float, optional
  2751. Plotting positions parameter. Default is 0.4.
  2752. beta : float, optional
  2753. Plotting positions parameter. Default is 0.4.
  2754. Returns
  2755. -------
  2756. positions : MaskedArray
  2757. The calculated plotting positions.
  2758. """
  2759. data = ma.array(data, copy=False).reshape(1,-1)
  2760. n = data.count()
  2761. plpos = np.empty(data.size, dtype=float)
  2762. plpos[n:] = 0
  2763. plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
  2764. (n + 1.0 - alpha - beta))
  2765. return ma.array(plpos, mask=data._mask)
  2766. meppf = plotting_positions
  2767. def obrientransform(*args):
  2768. """
  2769. Computes a transform on input data (any number of columns). Used to
  2770. test for homogeneity of variance prior to running one-way stats. Each
  2771. array in ``*args`` is one level of a factor. If an `f_oneway()` run on
  2772. the transformed data and found significant, variances are unequal. From
  2773. Maxwell and Delaney, p.112.
  2774. Returns: transformed data for use in an ANOVA
  2775. """
  2776. data = argstoarray(*args).T
  2777. v = data.var(axis=0,ddof=1)
  2778. m = data.mean(0)
  2779. n = data.count(0).astype(float)
  2780. # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
  2781. data -= m
  2782. data **= 2
  2783. data *= (n-1.5)*n
  2784. data -= 0.5*v*(n-1)
  2785. data /= (n-1.)*(n-2.)
  2786. if not ma.allclose(v,data.mean(0)):
  2787. raise ValueError("Lack of convergence in obrientransform.")
  2788. return data
  2789. def sem(a, axis=0, ddof=1):
  2790. """
  2791. Calculates the standard error of the mean of the input array.
  2792. Also sometimes called standard error of measurement.
  2793. Parameters
  2794. ----------
  2795. a : array_like
  2796. An array containing the values for which the standard error is
  2797. returned.
  2798. axis : int or None, optional
  2799. If axis is None, ravel `a` first. If axis is an integer, this will be
  2800. the axis over which to operate. Defaults to 0.
  2801. ddof : int, optional
  2802. Delta degrees-of-freedom. How many degrees of freedom to adjust
  2803. for bias in limited samples relative to the population estimate
  2804. of variance. Defaults to 1.
  2805. Returns
  2806. -------
  2807. s : ndarray or float
  2808. The standard error of the mean in the sample(s), along the input axis.
  2809. Notes
  2810. -----
  2811. The default value for `ddof` changed in scipy 0.15.0 to be consistent with
  2812. `scipy.stats.sem` as well as with the most common definition used (like in
  2813. the R documentation).
  2814. Examples
  2815. --------
  2816. Find standard error along the first axis:
  2817. >>> import numpy as np
  2818. >>> from scipy import stats
  2819. >>> a = np.arange(20).reshape(5,4)
  2820. >>> print(stats.mstats.sem(a))
  2821. [2.8284271247461903 2.8284271247461903 2.8284271247461903
  2822. 2.8284271247461903]
  2823. Find standard error across the whole array, using n degrees of freedom:
  2824. >>> print(stats.mstats.sem(a, axis=None, ddof=0))
  2825. 1.2893796958227628
  2826. """
  2827. a, axis = _chk_asarray(a, axis)
  2828. n = a.count(axis=axis)
  2829. s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n)
  2830. return s
  2831. F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
  2832. def f_oneway(*args):
  2833. """
  2834. Performs a 1-way ANOVA, returning an F-value and probability given
  2835. any number of groups. From Heiman, pp.394-7.
  2836. Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
  2837. one per treatment group.
  2838. Returns
  2839. -------
  2840. statistic : float
  2841. The computed F-value of the test.
  2842. pvalue : float
  2843. The associated p-value from the F-distribution.
  2844. """
  2845. # Construct a single array of arguments: each row is a group
  2846. data = argstoarray(*args)
  2847. ngroups = len(data)
  2848. ntot = data.count()
  2849. sstot = (data**2).sum() - (data.sum())**2/float(ntot)
  2850. ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
  2851. sswg = sstot-ssbg
  2852. dfbg = ngroups-1
  2853. dfwg = ntot - ngroups
  2854. msb = ssbg/float(dfbg)
  2855. msw = sswg/float(dfwg)
  2856. f = msb/msw
  2857. prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf
  2858. return F_onewayResult(f, prob)
  2859. FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
  2860. ('statistic', 'pvalue'))
  2861. def friedmanchisquare(*args):
  2862. """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
  2863. This function calculates the Friedman Chi-square test for repeated measures
  2864. and returns the result, along with the associated probability value.
  2865. Each input is considered a given group. Ideally, the number of treatments
  2866. among each group should be equal. If this is not the case, only the first
  2867. n treatments are taken into account, where n is the number of treatments
  2868. of the smallest group.
  2869. If a group has some missing values, the corresponding treatments are masked
  2870. in the other groups.
  2871. The test statistic is corrected for ties.
  2872. Masked values in one group are propagated to the other groups.
  2873. Returns
  2874. -------
  2875. statistic : float
  2876. the test statistic.
  2877. pvalue : float
  2878. the associated p-value.
  2879. """
  2880. data = argstoarray(*args).astype(float)
  2881. k = len(data)
  2882. if k < 3:
  2883. raise ValueError(f"Less than 3 groups ({k}): the Friedman test "
  2884. f"is NOT appropriate.")
  2885. ranked = ma.masked_values(rankdata(data, axis=0), 0)
  2886. if ranked._mask is not nomask:
  2887. ranked = ma.mask_cols(ranked)
  2888. ranked = ranked.compressed().reshape(k,-1).view(ndarray)
  2889. else:
  2890. ranked = ranked._data
  2891. (k,n) = ranked.shape
  2892. # Ties correction
  2893. repeats = [find_repeats(row) for row in ranked.T]
  2894. ties = np.array([y for x, y in repeats if x.size > 0])
  2895. tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
  2896. ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
  2897. chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
  2898. return FriedmanchisquareResult(chisq,
  2899. distributions.chi2.sf(chisq, k-1))
  2900. BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))
  2901. def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
  2902. """
  2903. Compute the Brunner-Munzel test on samples x and y.
  2904. Any missing values in `x` and/or `y` are discarded.
  2905. The Brunner-Munzel test is a nonparametric test of the null hypothesis that
  2906. when values are taken one by one from each group, the probabilities of
  2907. getting large values in both groups are equal.
  2908. Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the
  2909. assumption of equivariance of two groups. Note that this does not assume
  2910. the distributions are same. This test works on two independent samples,
  2911. which may have different sizes.
  2912. Parameters
  2913. ----------
  2914. x, y : array_like
  2915. Array of samples, should be one-dimensional.
  2916. alternative : 'less', 'two-sided', or 'greater', optional
  2917. Whether to get the p-value for the one-sided hypothesis ('less'
  2918. or 'greater') or for the two-sided hypothesis ('two-sided').
  2919. Defaults value is 'two-sided' .
  2920. distribution : 't' or 'normal', optional
  2921. Whether to get the p-value by t-distribution or by standard normal
  2922. distribution.
  2923. Defaults value is 't' .
  2924. Returns
  2925. -------
  2926. statistic : float
  2927. The Brunner-Munzer W statistic.
  2928. pvalue : float
  2929. p-value assuming an t distribution. One-sided or
  2930. two-sided, depending on the choice of `alternative` and `distribution`.
  2931. See Also
  2932. --------
  2933. mannwhitneyu : Mann-Whitney rank test on two samples.
  2934. Notes
  2935. -----
  2936. For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`.
  2937. Examples
  2938. --------
  2939. >>> from scipy.stats.mstats import brunnermunzel
  2940. >>> import numpy as np
  2941. >>> x1 = [1, 2, np.nan, np.nan, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1]
  2942. >>> x2 = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4]
  2943. >>> brunnermunzel(x1, x2)
  2944. BrunnerMunzelResult(statistic=1.4723186918922935, pvalue=0.15479415300426624) # may vary
  2945. """ # noqa: E501
  2946. x = ma.asarray(x).compressed().view(ndarray)
  2947. y = ma.asarray(y).compressed().view(ndarray)
  2948. nx = len(x)
  2949. ny = len(y)
  2950. if nx == 0 or ny == 0:
  2951. return BrunnerMunzelResult(np.nan, np.nan)
  2952. rankc = rankdata(np.concatenate((x,y)))
  2953. rankcx = rankc[0:nx]
  2954. rankcy = rankc[nx:nx+ny]
  2955. rankcx_mean = np.mean(rankcx)
  2956. rankcy_mean = np.mean(rankcy)
  2957. rankx = rankdata(x)
  2958. ranky = rankdata(y)
  2959. rankx_mean = np.mean(rankx)
  2960. ranky_mean = np.mean(ranky)
  2961. Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
  2962. Sx /= nx - 1
  2963. Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
  2964. Sy /= ny - 1
  2965. wbfn = nx * ny * (rankcy_mean - rankcx_mean)
  2966. wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
  2967. if distribution == "t":
  2968. df_numer = np.power(nx * Sx + ny * Sy, 2.0)
  2969. df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
  2970. df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
  2971. df = df_numer / df_denom
  2972. p = distributions.t.cdf(wbfn, df)
  2973. elif distribution == "normal":
  2974. p = distributions.norm.cdf(wbfn)
  2975. else:
  2976. raise ValueError(
  2977. "distribution should be 't' or 'normal'")
  2978. if alternative == "greater":
  2979. pass
  2980. elif alternative == "less":
  2981. p = 1 - p
  2982. elif alternative == "two-sided":
  2983. p = 2 * np.min([p, 1-p])
  2984. else:
  2985. raise ValueError(
  2986. "alternative should be 'less', 'greater' or 'two-sided'")
  2987. return BrunnerMunzelResult(wbfn, p)