_mstats_extras.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521
  1. """
  2. Additional statistics functions with support for masked arrays.
  3. """
  4. # Original author (2007): Pierre GF Gerard-Marchant
  5. __all__ = ['compare_medians_ms',
  6. 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
  7. 'idealfourths',
  8. 'median_cihs','mjci','mquantiles_cimj',
  9. 'rsh',
  10. 'trimmed_mean_ci',]
  11. import numpy as np
  12. from numpy import float64, ndarray
  13. import numpy.ma as ma
  14. from numpy.ma import MaskedArray
  15. from . import _mstats_basic as mstats
  16. from scipy.stats.distributions import norm, beta, t, binom
  17. def hdquantiles(data, prob=(.25, .5, .75), axis=None, var=False,):
  18. """
  19. Computes quantile estimates with the Harrell-Davis method.
  20. The quantile estimates are calculated as a weighted linear combination
  21. of order statistics.
  22. Parameters
  23. ----------
  24. data : array_like
  25. Data array.
  26. prob : sequence, optional
  27. Sequence of probabilities at which to compute the quantiles.
  28. axis : int or None, optional
  29. Axis along which to compute the quantiles. If None, use a flattened
  30. array.
  31. var : bool, optional
  32. Whether to return the variance of the estimate.
  33. Returns
  34. -------
  35. hdquantiles : MaskedArray
  36. A (p,) array of quantiles (if `var` is False), or a (2,p) array of
  37. quantiles and variances (if `var` is True), where ``p`` is the
  38. number of quantiles.
  39. See Also
  40. --------
  41. hdquantiles_sd
  42. Examples
  43. --------
  44. >>> import numpy as np
  45. >>> from scipy.stats.mstats import hdquantiles
  46. >>>
  47. >>> # Sample data
  48. >>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
  49. >>>
  50. >>> # Probabilities at which to compute quantiles
  51. >>> probabilities = [0.25, 0.5, 0.75]
  52. >>>
  53. >>> # Compute Harrell-Davis quantile estimates
  54. >>> quantile_estimates = hdquantiles(data, prob=probabilities)
  55. >>>
  56. >>> # Display the quantile estimates
  57. >>> for i, quantile in enumerate(probabilities):
  58. ... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
  59. 25th percentile: 3.1505820231763066 # may vary
  60. 50th percentile: 5.194344084883956
  61. 75th percentile: 7.430626414674935
  62. """
  63. def _hd_1D(data,prob,var):
  64. "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
  65. xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
  66. # Don't use length here, in case we have a numpy scalar
  67. n = xsorted.size
  68. hd = np.empty((2,len(prob)), float64)
  69. if n < 2:
  70. hd.flat = np.nan
  71. if var:
  72. return hd
  73. return hd[0]
  74. v = np.arange(n+1) / float(n)
  75. betacdf = beta.cdf
  76. for (i,p) in enumerate(prob):
  77. _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
  78. w = _w[1:] - _w[:-1]
  79. hd_mean = np.dot(w, xsorted)
  80. hd[0,i] = hd_mean
  81. #
  82. hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
  83. #
  84. hd[0, prob == 0] = xsorted[0]
  85. hd[0, prob == 1] = xsorted[-1]
  86. if var:
  87. hd[1, prob == 0] = hd[1, prob == 1] = np.nan
  88. return hd
  89. return hd[0]
  90. # Initialization & checks
  91. data = ma.array(data, copy=False, dtype=float64)
  92. p = np.atleast_1d(np.asarray(prob))
  93. # Computes quantiles along axis (or globally)
  94. if (axis is None) or (data.ndim == 1):
  95. result = _hd_1D(data, p, var)
  96. else:
  97. if data.ndim > 2:
  98. raise ValueError(f"Array 'data' must be at most two dimensional, "
  99. f"but got data.ndim = {data.ndim}")
  100. result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
  101. return ma.fix_invalid(result, copy=False)
  102. def hdmedian(data, axis=-1, var=False):
  103. """
  104. Returns the Harrell-Davis estimate of the median along the given axis.
  105. Parameters
  106. ----------
  107. data : ndarray
  108. Data array.
  109. axis : int, optional
  110. Axis along which to compute the quantiles. If None, use a flattened
  111. array.
  112. var : bool, optional
  113. Whether to return the variance of the estimate.
  114. Returns
  115. -------
  116. hdmedian : MaskedArray
  117. The median values. If ``var=True``, the variance is returned inside
  118. the masked array. E.g. for a 1-D array the shape change from (1,) to
  119. (2,).
  120. """
  121. result = hdquantiles(data,[0.5], axis=axis, var=var)
  122. return result.squeeze()
  123. def hdquantiles_sd(data, prob=(.25, .5, .75), axis=None):
  124. """
  125. The standard error of the Harrell-Davis quantile estimates by jackknife.
  126. Parameters
  127. ----------
  128. data : array_like
  129. Data array.
  130. prob : sequence, optional
  131. Sequence of quantiles to compute.
  132. axis : int, optional
  133. Axis along which to compute the quantiles. If None, use a flattened
  134. array.
  135. Returns
  136. -------
  137. hdquantiles_sd : MaskedArray
  138. Standard error of the Harrell-Davis quantile estimates.
  139. See Also
  140. --------
  141. hdquantiles
  142. """
  143. def _hdsd_1D(data, prob):
  144. "Computes the std error for 1D arrays."
  145. xsorted = np.sort(data.compressed())
  146. n = len(xsorted)
  147. hdsd = np.empty(len(prob), float64)
  148. if n < 2:
  149. hdsd.flat = np.nan
  150. vv = np.arange(n) / float(n-1)
  151. betacdf = beta.cdf
  152. for (i,p) in enumerate(prob):
  153. _w = betacdf(vv, n*p, n*(1-p))
  154. w = _w[1:] - _w[:-1]
  155. # cumulative sum of weights and data points if
  156. # ith point is left out for jackknife
  157. mx_ = np.zeros_like(xsorted)
  158. mx_[1:] = np.cumsum(w * xsorted[:-1])
  159. # similar but from the right
  160. mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
  161. hdsd[i] = np.sqrt(mx_.var() * (n - 1))
  162. return hdsd
  163. # Initialization & checks
  164. data = ma.array(data, copy=False, dtype=float64)
  165. p = np.atleast_1d(np.asarray(prob))
  166. # Computes quantiles along axis (or globally)
  167. if (axis is None):
  168. result = _hdsd_1D(data, p)
  169. else:
  170. if data.ndim > 2:
  171. raise ValueError(f"Array 'data' must be at most two dimensional, "
  172. f"but got data.ndim = {data.ndim}")
  173. result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
  174. return ma.fix_invalid(result, copy=False).ravel()
  175. def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
  176. alpha=0.05, axis=None):
  177. """
  178. Selected confidence interval of the trimmed mean along the given axis.
  179. Parameters
  180. ----------
  181. data : array_like
  182. Input data.
  183. limits : {None, tuple}, optional
  184. None or a two item tuple.
  185. Tuple of the percentages to cut on each side of the array, with respect
  186. to the number of unmasked data, as floats between 0. and 1. If ``n``
  187. is the number of unmasked data before trimming, then
  188. (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
  189. largest data are masked. The total number of unmasked data after
  190. trimming is ``n * (1. - sum(limits))``.
  191. The value of one limit can be set to None to indicate an open interval.
  192. Defaults to (0.2, 0.2).
  193. inclusive : (2,) tuple of boolean, optional
  194. If relative==False, tuple indicating whether values exactly equal to
  195. the absolute limits are allowed.
  196. If relative==True, tuple indicating whether the number of data being
  197. masked on each side should be rounded (True) or truncated (False).
  198. Defaults to (True, True).
  199. alpha : float, optional
  200. Confidence level of the intervals.
  201. Defaults to 0.05.
  202. axis : int, optional
  203. Axis along which to cut. If None, uses a flattened version of `data`.
  204. Defaults to None.
  205. Returns
  206. -------
  207. trimmed_mean_ci : (2,) ndarray
  208. The lower and upper confidence intervals of the trimmed data.
  209. """
  210. data = ma.array(data, copy=False)
  211. trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
  212. tmean = trimmed.mean(axis)
  213. tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
  214. df = trimmed.count(axis) - 1
  215. tppf = t.ppf(1-alpha/2.,df)
  216. return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
  217. def mjci(data, prob=(0.25, 0.5, 0.75), axis=None):
  218. """
  219. Returns the Maritz-Jarrett estimators of the standard error of selected
  220. experimental quantiles of the data.
  221. Parameters
  222. ----------
  223. data : ndarray
  224. Data array.
  225. prob : sequence, optional
  226. Sequence of quantiles to compute.
  227. axis : int or None, optional
  228. Axis along which to compute the quantiles. If None, use a flattened
  229. array.
  230. """
  231. def _mjci_1D(data, p):
  232. data = np.sort(data.compressed())
  233. n = data.size
  234. prob = (np.array(p) * n + 0.5).astype(int)
  235. betacdf = beta.cdf
  236. mj = np.empty(len(prob), float64)
  237. x = np.arange(1,n+1, dtype=float64) / n
  238. y = x - 1./n
  239. for (i,m) in enumerate(prob):
  240. W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
  241. C1 = np.dot(W,data)
  242. C2 = np.dot(W,data**2)
  243. mj[i] = np.sqrt(C2 - C1**2)
  244. return mj
  245. data = ma.array(data, copy=False)
  246. if data.ndim > 2:
  247. raise ValueError(f"Array 'data' must be at most two dimensional, "
  248. f"but got data.ndim = {data.ndim}")
  249. p = np.atleast_1d(np.asarray(prob))
  250. # Computes quantiles along axis (or globally)
  251. if (axis is None):
  252. return _mjci_1D(data, p)
  253. else:
  254. return ma.apply_along_axis(_mjci_1D, axis, data, p)
  255. def mquantiles_cimj(data, prob=(0.25, 0.50, 0.75), alpha=0.05, axis=None):
  256. """
  257. Computes the alpha confidence interval for the selected quantiles of the
  258. data, with Maritz-Jarrett estimators.
  259. Parameters
  260. ----------
  261. data : ndarray
  262. Data array.
  263. prob : sequence, optional
  264. Sequence of quantiles to compute.
  265. alpha : float, optional
  266. Confidence level of the intervals.
  267. axis : int or None, optional
  268. Axis along which to compute the quantiles.
  269. If None, use a flattened array.
  270. Returns
  271. -------
  272. ci_lower : ndarray
  273. The lower boundaries of the confidence interval. Of the same length as
  274. `prob`.
  275. ci_upper : ndarray
  276. The upper boundaries of the confidence interval. Of the same length as
  277. `prob`.
  278. """
  279. alpha = min(alpha, 1 - alpha)
  280. z = norm.ppf(1 - alpha/2.)
  281. xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
  282. smj = mjci(data, prob, axis=axis)
  283. return (xq - z * smj, xq + z * smj)
  284. def median_cihs(data, alpha=0.05, axis=None):
  285. """
  286. Computes the alpha-level confidence interval for the median of the data.
  287. Uses the Hettmasperger-Sheather method.
  288. Parameters
  289. ----------
  290. data : array_like
  291. Input data. Masked values are discarded. The input should be 1D only,
  292. or `axis` should be set to None.
  293. alpha : float, optional
  294. Confidence level of the intervals.
  295. axis : int or None, optional
  296. Axis along which to compute the quantiles. If None, use a flattened
  297. array.
  298. Returns
  299. -------
  300. median_cihs
  301. Alpha level confidence interval.
  302. """
  303. def _cihs_1D(data, alpha):
  304. data = np.sort(data.compressed())
  305. n = len(data)
  306. alpha = min(alpha, 1-alpha)
  307. k = int(binom._ppf(alpha/2., n, 0.5))
  308. gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
  309. if gk < 1-alpha:
  310. k -= 1
  311. gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
  312. gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
  313. I = (gk - 1 + alpha)/(gk - gkk)
  314. lambd = (n-k) * I / float(k + (n-2*k)*I)
  315. lims = (lambd*data[k] + (1-lambd)*data[k-1],
  316. lambd*data[n-k-1] + (1-lambd)*data[n-k])
  317. return lims
  318. data = ma.array(data, copy=False)
  319. # Computes quantiles along axis (or globally)
  320. if (axis is None):
  321. result = _cihs_1D(data, alpha)
  322. else:
  323. if data.ndim > 2:
  324. raise ValueError(f"Array 'data' must be at most two dimensional, "
  325. f"but got data.ndim = {data.ndim}")
  326. result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
  327. return result
  328. def compare_medians_ms(group_1, group_2, axis=None):
  329. """
  330. Compares the medians from two independent groups along the given axis.
  331. The comparison is performed using the McKean-Schrader estimate of the
  332. standard error of the medians.
  333. Parameters
  334. ----------
  335. group_1 : array_like
  336. First dataset. Has to be of size >=7.
  337. group_2 : array_like
  338. Second dataset. Has to be of size >=7.
  339. axis : int, optional
  340. Axis along which the medians are estimated. If None, the arrays are
  341. flattened. If `axis` is not None, then `group_1` and `group_2`
  342. should have the same shape.
  343. Returns
  344. -------
  345. compare_medians_ms : {float, ndarray}
  346. If `axis` is None, then returns a float, otherwise returns a 1-D
  347. ndarray of floats with a length equal to the length of `group_1`
  348. along `axis`.
  349. Examples
  350. --------
  351. >>> from scipy import stats
  352. >>> a = [1, 2, 3, 4, 5, 6, 7]
  353. >>> b = [8, 9, 10, 11, 12, 13, 14]
  354. >>> stats.mstats.compare_medians_ms(a, b, axis=None)
  355. 1.0693225866553746e-05
  356. The function is vectorized to compute along a given axis.
  357. >>> import numpy as np
  358. >>> rng = np.random.default_rng()
  359. >>> x = rng.random(size=(3, 7))
  360. >>> y = rng.random(size=(3, 8))
  361. >>> stats.mstats.compare_medians_ms(x, y, axis=1)
  362. array([0.36908985, 0.36092538, 0.2765313 ])
  363. References
  364. ----------
  365. .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
  366. for studentizing the sample median." Communications in
  367. Statistics-Simulation and Computation 13.6 (1984): 751-773.
  368. """
  369. (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
  370. (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
  371. mstats.stde_median(group_2, axis=axis))
  372. W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
  373. return 1 - norm.cdf(W)
  374. def idealfourths(data, axis=None):
  375. """
  376. Returns an estimate of the lower and upper quartiles.
  377. Uses the ideal fourths algorithm.
  378. Parameters
  379. ----------
  380. data : array_like
  381. Input array.
  382. axis : int, optional
  383. Axis along which the quartiles are estimated. If None, the arrays are
  384. flattened.
  385. Returns
  386. -------
  387. idealfourths : {list of floats, masked array}
  388. Returns the two internal values that divide `data` into four parts
  389. using the ideal fourths algorithm either along the flattened array
  390. (if `axis` is None) or along `axis` of `data`.
  391. """
  392. def _idf(data):
  393. x = data.compressed()
  394. n = len(x)
  395. if n < 3:
  396. return [np.nan,np.nan]
  397. (j,h) = divmod(n/4. + 5/12.,1)
  398. j = int(j)
  399. qlo = (1-h)*x[j-1] + h*x[j]
  400. k = n - j
  401. qup = (1-h)*x[k] + h*x[k-1]
  402. return [qlo, qup]
  403. data = ma.sort(data, axis=axis).view(MaskedArray)
  404. if (axis is None):
  405. return _idf(data)
  406. else:
  407. return ma.apply_along_axis(_idf, axis, data)
  408. def rsh(data, points=None):
  409. """
  410. Evaluates Rosenblatt's shifted histogram estimators for each data point.
  411. Rosenblatt's estimator is a centered finite-difference approximation to the
  412. derivative of the empirical cumulative distribution function.
  413. Parameters
  414. ----------
  415. data : sequence
  416. Input data, should be 1-D. Masked values are ignored.
  417. points : sequence or None, optional
  418. Sequence of points where to evaluate Rosenblatt shifted histogram.
  419. If None, use the data.
  420. """
  421. data = ma.array(data, copy=False)
  422. if points is None:
  423. points = data
  424. else:
  425. points = np.atleast_1d(np.asarray(points))
  426. if data.ndim != 1:
  427. raise AttributeError("The input array should be 1D only !")
  428. n = data.count()
  429. r = idealfourths(data, axis=None)
  430. h = 1.2 * (r[-1]-r[0]) / n**(1./5)
  431. nhi = (data[:,None] <= points[None,:] + h).sum(0)
  432. nlo = (data[:,None] < points[None,:] - h).sum(0)
  433. return (nhi-nlo) / (2.*n*h)