_stats_mstats_common.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. import warnings
  2. import numpy as np
  3. from . import distributions
  4. from .._lib._array_api import xp_capabilities
  5. from .._lib._bunch import _make_tuple_bunch
  6. from ._axis_nan_policy import _axis_nan_policy_factory
  7. from ._stats_pythran import siegelslopes as siegelslopes_pythran
  8. __all__ = ['_find_repeats', 'theilslopes', 'siegelslopes']
  9. # This is not a namedtuple for backwards compatibility. See PR #12983
  10. TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
  11. ['slope', 'intercept',
  12. 'low_slope', 'high_slope'])
  13. SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
  14. ['slope', 'intercept'])
  15. def _n_samples_optional_x(kwargs):
  16. return 2 if kwargs.get('x', None) is not None else 1
  17. @xp_capabilities(np_only=True)
  18. @_axis_nan_policy_factory(TheilslopesResult, default_axis=None, n_outputs=4,
  19. n_samples=_n_samples_optional_x,
  20. result_to_tuple=lambda x, _: tuple(x), paired=True,
  21. too_small=1)
  22. def theilslopes(y, x=None, alpha=0.95, method='separate'):
  23. r"""
  24. Computes the Theil-Sen estimator for a set of points (x, y).
  25. `theilslopes` implements a method for robust linear regression. It
  26. computes the slope as the median of all slopes between paired values.
  27. Parameters
  28. ----------
  29. y : array_like
  30. Dependent variable.
  31. x : array_like or None, optional
  32. Independent variable. If None, use ``arange(len(y))`` instead.
  33. alpha : float, optional
  34. Confidence degree between 0 and 1. Default is 95% confidence.
  35. Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
  36. interpreted as "find the 90% confidence interval".
  37. method : {'joint', 'separate'}, optional
  38. Method to be used for computing estimate for intercept.
  39. Following methods are supported,
  40. * 'joint': Uses np.median(y - slope * x) as intercept.
  41. * 'separate': Uses np.median(y) - slope * np.median(x)
  42. as intercept.
  43. The default is 'separate'.
  44. .. versionadded:: 1.8.0
  45. Returns
  46. -------
  47. result : ``TheilslopesResult`` instance
  48. The return value is an object with the following attributes:
  49. slope : float
  50. Theil slope.
  51. intercept : float
  52. Intercept of the Theil line.
  53. low_slope : float
  54. Lower bound of the confidence interval on `slope`.
  55. high_slope : float
  56. Upper bound of the confidence interval on `slope`.
  57. See Also
  58. --------
  59. siegelslopes : a similar technique using repeated medians
  60. Notes
  61. -----
  62. The implementation of `theilslopes` follows [1]_. The intercept is
  63. not defined in [1]_, and here it is defined as ``median(y) -
  64. slope*median(x)``, which is given in [3]_. Other definitions of
  65. the intercept exist in the literature such as ``median(y - slope*x)``
  66. in [4]_. The approach to compute the intercept can be determined by the
  67. parameter ``method``. A confidence interval for the intercept is not
  68. given as this question is not addressed in [1]_.
  69. For compatibility with older versions of SciPy, the return value acts
  70. like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
  71. ``low_slope``, and ``high_slope``, so one can continue to write::
  72. slope, intercept, low_slope, high_slope = theilslopes(y, x)
  73. References
  74. ----------
  75. .. [1] P.K. Sen, "Estimates of the regression coefficient based on
  76. Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
  77. .. [2] H. Theil, "A rank-invariant method of linear and polynomial
  78. regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
  79. 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
  80. .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
  81. John Wiley and Sons, New York, pp. 493.
  82. .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
  83. Examples
  84. --------
  85. >>> import numpy as np
  86. >>> from scipy import stats
  87. >>> import matplotlib.pyplot as plt
  88. >>> x = np.linspace(-5, 5, num=150)
  89. >>> y = x + np.random.normal(size=x.size)
  90. >>> y[11:15] += 10 # add outliers
  91. >>> y[-5:] -= 7
  92. Compute the slope, intercept and 90% confidence interval. For comparison,
  93. also compute the least-squares fit with `linregress`:
  94. >>> res = stats.theilslopes(y, x, 0.90, method='separate')
  95. >>> lsq_res = stats.linregress(x, y)
  96. Plot the results. The Theil-Sen regression line is shown in red, with the
  97. dashed red lines illustrating the confidence interval of the slope (note
  98. that the dashed red lines are not the confidence interval of the regression
  99. as the confidence interval of the intercept is not included). The green
  100. line shows the least-squares fit for comparison.
  101. >>> fig = plt.figure()
  102. >>> ax = fig.add_subplot(111)
  103. >>> ax.plot(x, y, 'b.')
  104. >>> ax.plot(x, res[1] + res[0] * x, 'r-')
  105. >>> ax.plot(x, res[1] + res[2] * x, 'r--')
  106. >>> ax.plot(x, res[1] + res[3] * x, 'r--')
  107. >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
  108. >>> plt.show()
  109. """
  110. if method not in ['joint', 'separate']:
  111. raise ValueError("method must be either 'joint' or 'separate'."
  112. f"'{method}' is invalid.")
  113. # We copy both x and y so we can use _find_repeats.
  114. y = np.array(y, dtype=float, copy=True).ravel()
  115. if x is None:
  116. x = np.arange(len(y), dtype=float)
  117. else:
  118. x = np.array(x, dtype=float, copy=True).ravel()
  119. if len(x) != len(y):
  120. raise ValueError("Array shapes are incompatible for broadcasting.")
  121. if len(x) < 2:
  122. raise ValueError("`x` and `y` must have length at least 2.")
  123. # Compute sorted slopes only when deltax > 0
  124. deltax = x[:, np.newaxis] - x
  125. deltay = y[:, np.newaxis] - y
  126. slopes = deltay[deltax > 0] / deltax[deltax > 0]
  127. if not slopes.size:
  128. msg = "All `x` coordinates are identical."
  129. warnings.warn(msg, RuntimeWarning, stacklevel=2)
  130. slopes.sort()
  131. medslope = np.median(slopes)
  132. if method == 'joint':
  133. medinter = np.median(y - medslope * x)
  134. else:
  135. medinter = np.median(y) - medslope * np.median(x)
  136. # Now compute confidence intervals
  137. if alpha > 0.5:
  138. alpha = 1. - alpha
  139. z = distributions.norm.ppf(alpha / 2.)
  140. # This implements (2.6) from Sen (1968)
  141. _, nxreps = _find_repeats(x)
  142. _, nyreps = _find_repeats(y)
  143. nt = len(slopes) # N in Sen (1968)
  144. ny = len(y) # n in Sen (1968)
  145. # Equation 2.6 in Sen (1968):
  146. sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
  147. sum(k * (k-1) * (2*k + 5) for k in nxreps) -
  148. sum(k * (k-1) * (2*k + 5) for k in nyreps))
  149. # Find the confidence interval indices in `slopes`
  150. try:
  151. sigma = np.sqrt(sigsq)
  152. Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
  153. Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
  154. delta = slopes[[Rl, Ru]]
  155. except (ValueError, IndexError):
  156. delta = (np.nan, np.nan)
  157. return TheilslopesResult(slope=medslope, intercept=medinter,
  158. low_slope=delta[0], high_slope=delta[1])
  159. def _find_repeats(arr):
  160. # This function assumes it may clobber its input.
  161. if len(arr) == 0:
  162. return np.array(0, np.float64), np.array(0, np.intp)
  163. # XXX This cast was previously needed for the Fortran implementation,
  164. # should we ditch it?
  165. arr = np.asarray(arr, np.float64).ravel()
  166. arr.sort()
  167. # Taken from NumPy 1.9's np.unique.
  168. change = np.concatenate(([True], arr[1:] != arr[:-1]))
  169. unique = arr[change]
  170. change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
  171. freq = np.diff(change_idx)
  172. atleast2 = freq > 1
  173. return unique[atleast2], freq[atleast2]
  174. @xp_capabilities(np_only=True)
  175. @_axis_nan_policy_factory(SiegelslopesResult, default_axis=None, n_outputs=2,
  176. n_samples=_n_samples_optional_x,
  177. result_to_tuple=lambda x, _: tuple(x), paired=True,
  178. too_small=1)
  179. def siegelslopes(y, x=None, method="hierarchical"):
  180. r"""
  181. Computes the Siegel estimator for a set of points (x, y).
  182. `siegelslopes` implements a method for robust linear regression
  183. using repeated medians (see [1]_) to fit a line to the points (x, y).
  184. The method is robust to outliers with an asymptotic breakdown point
  185. of 50%.
  186. Parameters
  187. ----------
  188. y : array_like
  189. Dependent variable.
  190. x : array_like or None, optional
  191. Independent variable. If None, use ``arange(len(y))`` instead.
  192. method : {'hierarchical', 'separate'}
  193. If 'hierarchical', estimate the intercept using the estimated
  194. slope ``slope`` (default option).
  195. If 'separate', estimate the intercept independent of the estimated
  196. slope. See Notes for details.
  197. Returns
  198. -------
  199. result : ``SiegelslopesResult`` instance
  200. The return value is an object with the following attributes:
  201. slope : float
  202. Estimate of the slope of the regression line.
  203. intercept : float
  204. Estimate of the intercept of the regression line.
  205. See Also
  206. --------
  207. theilslopes : a similar technique without repeated medians
  208. Notes
  209. -----
  210. With ``n = len(y)``, compute ``m_j`` as the median of
  211. the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
  212. ``slope`` is then the median of all slopes ``m_j``.
  213. Two ways are given to estimate the intercept in [1]_ which can be chosen
  214. via the parameter ``method``.
  215. The hierarchical approach uses the estimated slope ``slope``
  216. and computes ``intercept`` as the median of ``y - slope*x``.
  217. The other approach estimates the intercept separately as follows: for
  218. each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
  219. lines through the remaining points and take the median ``i_j``.
  220. ``intercept`` is the median of the ``i_j``.
  221. The implementation computes `n` times the median of a vector of size `n`
  222. which can be slow for large vectors. There are more efficient algorithms
  223. (see [2]_) which are not implemented here.
  224. For compatibility with older versions of SciPy, the return value acts
  225. like a ``namedtuple`` of length 2, with fields ``slope`` and
  226. ``intercept``, so one can continue to write::
  227. slope, intercept = siegelslopes(y, x)
  228. References
  229. ----------
  230. .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
  231. Biometrika, Vol. 69, pp. 242-244, 1982.
  232. .. [2] A. Stein and M. Werman, "Finding the repeated median regression
  233. line", Proceedings of the Third Annual ACM-SIAM Symposium on
  234. Discrete Algorithms, pp. 409-413, 1992.
  235. Examples
  236. --------
  237. >>> import numpy as np
  238. >>> from scipy import stats
  239. >>> import matplotlib.pyplot as plt
  240. >>> x = np.linspace(-5, 5, num=150)
  241. >>> y = x + np.random.normal(size=x.size)
  242. >>> y[11:15] += 10 # add outliers
  243. >>> y[-5:] -= 7
  244. Compute the slope and intercept. For comparison, also compute the
  245. least-squares fit with `linregress`:
  246. >>> res = stats.siegelslopes(y, x)
  247. >>> lsq_res = stats.linregress(x, y)
  248. Plot the results. The Siegel regression line is shown in red. The green
  249. line shows the least-squares fit for comparison.
  250. >>> fig = plt.figure()
  251. >>> ax = fig.add_subplot(111)
  252. >>> ax.plot(x, y, 'b.')
  253. >>> ax.plot(x, res[1] + res[0] * x, 'r-')
  254. >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
  255. >>> plt.show()
  256. """
  257. if method not in ['hierarchical', 'separate']:
  258. raise ValueError("method can only be 'hierarchical' or 'separate'")
  259. y = np.asarray(y).ravel()
  260. if x is None:
  261. x = np.arange(len(y), dtype=float)
  262. else:
  263. x = np.asarray(x, dtype=float).ravel()
  264. if len(x) != len(y):
  265. raise ValueError("Array shapes are incompatible for broadcasting.")
  266. if len(x) < 2:
  267. raise ValueError("`x` and `y` must have length at least 2.")
  268. dtype = np.result_type(x, y, np.float32) # use at least float32
  269. y, x = y.astype(dtype), x.astype(dtype)
  270. medslope, medinter = siegelslopes_pythran(y, x, method)
  271. medslope, medinter = np.asarray(medslope)[()], np.asarray(medinter)[()]
  272. return SiegelslopesResult(slope=medslope, intercept=medinter)