_new_distributions.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. import sys
  2. import numpy as np
  3. from numpy import inf
  4. from scipy._lib import array_api_extra as xpx
  5. from scipy import special
  6. from scipy.special import _ufuncs as scu
  7. from scipy.stats._distribution_infrastructure import (
  8. ContinuousDistribution, DiscreteDistribution, _RealInterval, _IntegerInterval,
  9. _RealParameter, _Parameterization, _combine_docs)
  10. __all__ = ['Normal', 'Logistic', 'Uniform', 'Binomial']
  11. class Normal(ContinuousDistribution):
  12. r"""Normal distribution with prescribed mean and standard deviation.
  13. The probability density function of the normal distribution is:
  14. .. math::
  15. f(x) = \frac{1}{\sigma \sqrt{2 \pi}} \exp {
  16. \left( -\frac{1}{2}\left( \frac{x - \mu}{\sigma} \right)^2 \right)}
  17. """
  18. # `ShiftedScaledDistribution` allows this to be generated automatically from
  19. # an instance of `StandardNormal`, but the normal distribution is so frequently
  20. # used that it's worth a bit of code duplication to get better performance.
  21. _mu_domain = _RealInterval(endpoints=(-inf, inf))
  22. _sigma_domain = _RealInterval(endpoints=(0, inf))
  23. _x_support = _RealInterval(endpoints=(-inf, inf))
  24. _mu_param = _RealParameter('mu', symbol=r'\mu', domain=_mu_domain,
  25. typical=(-1, 1))
  26. _sigma_param = _RealParameter('sigma', symbol=r'\sigma', domain=_sigma_domain,
  27. typical=(0.5, 1.5))
  28. _x_param = _RealParameter('x', domain=_x_support, typical=(-1, 1))
  29. _parameterizations = [_Parameterization(_mu_param, _sigma_param)]
  30. _variable = _x_param
  31. _normalization = 1/np.sqrt(2*np.pi)
  32. _log_normalization = np.log(2*np.pi)/2
  33. def __new__(cls, mu=None, sigma=None, **kwargs):
  34. if mu is None and sigma is None:
  35. return super().__new__(StandardNormal)
  36. return super().__new__(cls)
  37. def __init__(self, *, mu=0., sigma=1., **kwargs):
  38. super().__init__(mu=mu, sigma=sigma, **kwargs)
  39. def _logpdf_formula(self, x, *, mu, sigma, **kwargs):
  40. return StandardNormal._logpdf_formula(self, (x - mu)/sigma) - np.log(sigma)
  41. def _pdf_formula(self, x, *, mu, sigma, **kwargs):
  42. return StandardNormal._pdf_formula(self, (x - mu)/sigma) / sigma
  43. def _logcdf_formula(self, x, *, mu, sigma, **kwargs):
  44. return StandardNormal._logcdf_formula(self, (x - mu)/sigma)
  45. def _cdf_formula(self, x, *, mu, sigma, **kwargs):
  46. return StandardNormal._cdf_formula(self, (x - mu)/sigma)
  47. def _logccdf_formula(self, x, *, mu, sigma, **kwargs):
  48. return StandardNormal._logccdf_formula(self, (x - mu)/sigma)
  49. def _ccdf_formula(self, x, *, mu, sigma, **kwargs):
  50. return StandardNormal._ccdf_formula(self, (x - mu)/sigma)
  51. def _icdf_formula(self, x, *, mu, sigma, **kwargs):
  52. return StandardNormal._icdf_formula(self, x) * sigma + mu
  53. def _ilogcdf_formula(self, x, *, mu, sigma, **kwargs):
  54. return StandardNormal._ilogcdf_formula(self, x) * sigma + mu
  55. def _iccdf_formula(self, x, *, mu, sigma, **kwargs):
  56. return StandardNormal._iccdf_formula(self, x) * sigma + mu
  57. def _ilogccdf_formula(self, x, *, mu, sigma, **kwargs):
  58. return StandardNormal._ilogccdf_formula(self, x) * sigma + mu
  59. def _entropy_formula(self, *, mu, sigma, **kwargs):
  60. return StandardNormal._entropy_formula(self) + np.log(abs(sigma))
  61. def _logentropy_formula(self, *, mu, sigma, **kwargs):
  62. lH0 = StandardNormal._logentropy_formula(self)
  63. with np.errstate(divide='ignore'):
  64. # sigma = 1 -> log(sigma) = 0 -> log(log(sigma)) = -inf
  65. # Silence the unnecessary runtime warning
  66. lls = np.log(np.log(abs(sigma))+0j)
  67. return special.logsumexp(np.broadcast_arrays(lH0, lls), axis=0)
  68. def _median_formula(self, *, mu, sigma, **kwargs):
  69. return mu
  70. def _mode_formula(self, *, mu, sigma, **kwargs):
  71. return mu
  72. def _moment_raw_formula(self, order, *, mu, sigma, **kwargs):
  73. if order == 0:
  74. return np.ones_like(mu)
  75. elif order == 1:
  76. return mu
  77. else:
  78. return None
  79. _moment_raw_formula.orders = [0, 1] # type: ignore[attr-defined]
  80. def _moment_central_formula(self, order, *, mu, sigma, **kwargs):
  81. if order == 0:
  82. return np.ones_like(mu)
  83. elif order % 2:
  84. return np.zeros_like(mu)
  85. else:
  86. # exact is faster (and obviously more accurate) for reasonable orders
  87. return sigma**order * special.factorial2(int(order) - 1, exact=True)
  88. def _sample_formula(self, full_shape, rng, *, mu, sigma, **kwargs):
  89. return rng.normal(loc=mu, scale=sigma, size=full_shape)[()]
  90. def _log_diff(log_p, log_q):
  91. return special.logsumexp([log_p, log_q+np.pi*1j], axis=0)
  92. class StandardNormal(Normal):
  93. r"""Standard normal distribution.
  94. The probability density function of the standard normal distribution is:
  95. .. math::
  96. f(x) = \frac{1}{\sqrt{2 \pi}} \exp \left( -\frac{1}{2} x^2 \right)
  97. """
  98. _x_support = _RealInterval(endpoints=(-inf, inf))
  99. _x_param = _RealParameter('x', domain=_x_support, typical=(-5, 5))
  100. _variable = _x_param
  101. _parameterizations = []
  102. _normalization = 1/np.sqrt(2*np.pi)
  103. _log_normalization = np.log(2*np.pi)/2
  104. mu = np.float64(0.)
  105. sigma = np.float64(1.)
  106. def __init__(self, **kwargs):
  107. ContinuousDistribution.__init__(self, **kwargs)
  108. def _logpdf_formula(self, x, **kwargs):
  109. return -(self._log_normalization + x**2/2)
  110. def _pdf_formula(self, x, **kwargs):
  111. return self._normalization * np.exp(-x**2/2)
  112. def _logcdf_formula(self, x, **kwargs):
  113. return special.log_ndtr(x)
  114. def _cdf_formula(self, x, **kwargs):
  115. return special.ndtr(x)
  116. def _logccdf_formula(self, x, **kwargs):
  117. return special.log_ndtr(-x)
  118. def _ccdf_formula(self, x, **kwargs):
  119. return special.ndtr(-x)
  120. def _icdf_formula(self, x, **kwargs):
  121. return special.ndtri(x)
  122. def _ilogcdf_formula(self, x, **kwargs):
  123. return special.ndtri_exp(x)
  124. def _iccdf_formula(self, x, **kwargs):
  125. return -special.ndtri(x)
  126. def _ilogccdf_formula(self, x, **kwargs):
  127. return -special.ndtri_exp(x)
  128. def _entropy_formula(self, **kwargs):
  129. return (1 + np.log(2*np.pi))/2
  130. def _logentropy_formula(self, **kwargs):
  131. return np.log1p(np.log(2*np.pi)) - np.log(2)
  132. def _median_formula(self, **kwargs):
  133. return 0
  134. def _mode_formula(self, **kwargs):
  135. return 0
  136. def _moment_raw_formula(self, order, **kwargs):
  137. raw_moments = {0: 1, 1: 0, 2: 1, 3: 0, 4: 3, 5: 0}
  138. return raw_moments.get(order, None)
  139. def _moment_central_formula(self, order, **kwargs):
  140. return self._moment_raw_formula(order, **kwargs)
  141. def _moment_standardized_formula(self, order, **kwargs):
  142. return self._moment_raw_formula(order, **kwargs)
  143. def _sample_formula(self, full_shape, rng, **kwargs):
  144. return rng.normal(size=full_shape)[()]
  145. class Logistic(ContinuousDistribution):
  146. r"""Standard logistic distribution.
  147. The probability density function of the standard logistic distribution is:
  148. .. math::
  149. f(x) = \frac{1}{\left( e^{x / 2} + e^{-x / 2} \right)^2}
  150. """
  151. _x_support = _RealInterval(endpoints=(-inf, inf))
  152. _variable = _x_param = _RealParameter('x', domain=_x_support, typical=(-9, 9))
  153. _parameterizations = ()
  154. _scale = np.pi / np.sqrt(3)
  155. def _logpdf_formula(self, x, **kwargs):
  156. y = -np.abs(x)
  157. return y - 2 * special.log1p(np.exp(y))
  158. def _pdf_formula(self, x, **kwargs):
  159. # f(x) = sech(x / 2)**2 / 4
  160. return (.5 / np.cosh(x / 2))**2
  161. def _logcdf_formula(self, x, **kwargs):
  162. return special.log_expit(x)
  163. def _cdf_formula(self, x, **kwargs):
  164. return special.expit(x)
  165. def _logccdf_formula(self, x, **kwargs):
  166. return special.log_expit(-x)
  167. def _ccdf_formula(self, x, **kwargs):
  168. return special.expit(-x)
  169. def _icdf_formula(self, x, **kwargs):
  170. return special.logit(x)
  171. def _iccdf_formula(self, x, **kwargs):
  172. return -special.logit(x)
  173. def _entropy_formula(self, **kwargs):
  174. return 2.0
  175. def _logentropy_formula(self, **kwargs):
  176. return np.log(2)
  177. def _median_formula(self, **kwargs):
  178. return 0
  179. def _mode_formula(self, **kwargs):
  180. return 0
  181. def _moment_raw_formula(self, order, **kwargs):
  182. n = int(order)
  183. if n % 2:
  184. return 0.0
  185. return np.pi**n * abs((2**n - 2) * float(special.bernoulli(n)[-1]))
  186. def _moment_central_formula(self, order, **kwargs):
  187. return self._moment_raw_formula(order, **kwargs)
  188. def _moment_standardized_formula(self, order, **kwargs):
  189. return self._moment_raw_formula(order, **kwargs) / self._scale**order
  190. def _sample_formula(self, full_shape, rng, **kwargs):
  191. return rng.logistic(size=full_shape)[()]
  192. # currently for testing only
  193. class _LogUniform(ContinuousDistribution):
  194. r"""Log-uniform distribution.
  195. The probability density function of the log-uniform distribution is:
  196. .. math::
  197. f(x; a, b) = \frac{1}
  198. {x (\log(b) - \log(a))}
  199. If :math:`\log(X)` is a random variable that follows a uniform distribution
  200. between :math:`\log(a)` and :math:`\log(b)`, then :math:`X` is log-uniformly
  201. distributed with shape parameters :math:`a` and :math:`b`.
  202. """
  203. _a_domain = _RealInterval(endpoints=(0, inf))
  204. _b_domain = _RealInterval(endpoints=('a', inf))
  205. _log_a_domain = _RealInterval(endpoints=(-inf, inf))
  206. _log_b_domain = _RealInterval(endpoints=('log_a', inf))
  207. _x_support = _RealInterval(endpoints=('a', 'b'), inclusive=(True, True))
  208. _a_param = _RealParameter('a', domain=_a_domain, typical=(1e-3, 0.9))
  209. _b_param = _RealParameter('b', domain=_b_domain, typical=(1.1, 1e3))
  210. _log_a_param = _RealParameter('log_a', symbol=r'\log(a)',
  211. domain=_log_a_domain, typical=(-3, -0.1))
  212. _log_b_param = _RealParameter('log_b', symbol=r'\log(b)',
  213. domain=_log_b_domain, typical=(0.1, 3))
  214. _x_param = _RealParameter('x', domain=_x_support, typical=('a', 'b'))
  215. _b_domain.define_parameters(_a_param)
  216. _log_b_domain.define_parameters(_log_a_param)
  217. _x_support.define_parameters(_a_param, _b_param)
  218. _parameterizations = [_Parameterization(_log_a_param, _log_b_param),
  219. _Parameterization(_a_param, _b_param)]
  220. _variable = _x_param
  221. def __init__(self, *, a=None, b=None, log_a=None, log_b=None, **kwargs):
  222. super().__init__(a=a, b=b, log_a=log_a, log_b=log_b, **kwargs)
  223. def _process_parameters(self, a=None, b=None, log_a=None, log_b=None, **kwargs):
  224. a = np.exp(log_a) if a is None else a
  225. b = np.exp(log_b) if b is None else b
  226. log_a = np.log(a) if log_a is None else log_a
  227. log_b = np.log(b) if log_b is None else log_b
  228. kwargs.update(dict(a=a, b=b, log_a=log_a, log_b=log_b))
  229. return kwargs
  230. # def _logpdf_formula(self, x, *, log_a, log_b, **kwargs):
  231. # return -np.log(x) - np.log(log_b - log_a)
  232. def _pdf_formula(self, x, *, log_a, log_b, **kwargs):
  233. return ((log_b - log_a)*x)**-1
  234. # def _cdf_formula(self, x, *, log_a, log_b, **kwargs):
  235. # return (np.log(x) - log_a)/(log_b - log_a)
  236. def _moment_raw_formula(self, order, log_a, log_b, **kwargs):
  237. if order == 0:
  238. return self._one
  239. t1 = self._one / (log_b - log_a) / order
  240. t2 = np.real(np.exp(_log_diff(order * log_b, order * log_a)))
  241. return t1 * t2
  242. class Uniform(ContinuousDistribution):
  243. r"""Uniform distribution.
  244. The probability density function of the uniform distribution is:
  245. .. math::
  246. f(x; a, b) = \frac{1}
  247. {b - a}
  248. """
  249. _a_domain = _RealInterval(endpoints=(-inf, inf))
  250. _b_domain = _RealInterval(endpoints=('a', inf))
  251. _x_support = _RealInterval(endpoints=('a', 'b'), inclusive=(True, True))
  252. _a_param = _RealParameter('a', domain=_a_domain, typical=(1e-3, 0.9))
  253. _b_param = _RealParameter('b', domain=_b_domain, typical=(1.1, 1e3))
  254. _x_param = _RealParameter('x', domain=_x_support, typical=('a', 'b'))
  255. _b_domain.define_parameters(_a_param)
  256. _x_support.define_parameters(_a_param, _b_param)
  257. _parameterizations = [_Parameterization(_a_param, _b_param)]
  258. _variable = _x_param
  259. def __init__(self, *, a=None, b=None, **kwargs):
  260. super().__init__(a=a, b=b, **kwargs)
  261. def _process_parameters(self, a=None, b=None, ab=None, **kwargs):
  262. ab = b - a
  263. kwargs.update(dict(a=a, b=b, ab=ab))
  264. return kwargs
  265. def _logpdf_formula(self, x, *, ab, **kwargs):
  266. return np.where(np.isnan(x), np.nan, -np.log(ab))
  267. def _pdf_formula(self, x, *, ab, **kwargs):
  268. return np.where(np.isnan(x), np.nan, 1/ab)
  269. def _logcdf_formula(self, x, *, a, ab, **kwargs):
  270. with np.errstate(divide='ignore'):
  271. return np.log(x - a) - np.log(ab)
  272. def _cdf_formula(self, x, *, a, ab, **kwargs):
  273. return (x - a) / ab
  274. def _logccdf_formula(self, x, *, b, ab, **kwargs):
  275. with np.errstate(divide='ignore'):
  276. return np.log(b - x) - np.log(ab)
  277. def _ccdf_formula(self, x, *, b, ab, **kwargs):
  278. return (b - x) / ab
  279. def _icdf_formula(self, p, *, a, ab, **kwargs):
  280. return a + ab*p
  281. def _iccdf_formula(self, p, *, b, ab, **kwargs):
  282. return b - ab*p
  283. def _entropy_formula(self, *, ab, **kwargs):
  284. return np.log(ab)
  285. def _mode_formula(self, *, a, b, ab, **kwargs):
  286. return a + 0.5*ab
  287. def _median_formula(self, *, a, b, ab, **kwargs):
  288. return a + 0.5*ab
  289. def _moment_raw_formula(self, order, a, b, ab, **kwargs):
  290. np1 = order + 1
  291. return (b**np1 - a**np1) / (np1 * ab)
  292. def _moment_central_formula(self, order, ab, **kwargs):
  293. return ab**2/12 if order == 2 else None
  294. _moment_central_formula.orders = [2] # type: ignore[attr-defined]
  295. def _sample_formula(self, full_shape, rng, a, b, ab, **kwargs):
  296. try:
  297. return rng.uniform(a, b, size=full_shape)[()]
  298. except OverflowError: # happens when there are NaNs
  299. return rng.uniform(0, 1, size=full_shape)*ab + a
  300. class _Gamma(ContinuousDistribution):
  301. # Gamma distribution for testing only
  302. _a_domain = _RealInterval(endpoints=(0, inf))
  303. _x_support = _RealInterval(endpoints=(0, inf), inclusive=(False, False))
  304. _a_param = _RealParameter('a', domain=_a_domain, typical=(0.1, 10))
  305. _x_param = _RealParameter('x', domain=_x_support, typical=(0.1, 10))
  306. _parameterizations = [_Parameterization(_a_param)]
  307. _variable = _x_param
  308. def _pdf_formula(self, x, *, a, **kwargs):
  309. return x ** (a - 1) * np.exp(-x) / special.gamma(a)
  310. class Binomial(DiscreteDistribution):
  311. r"""Binomial distribution with prescribed success probability and number of trials
  312. The probability density function of the binomial distribution is:
  313. .. math::
  314. f(x) = {n \choose x} p^x (1 - p)^{n-x}
  315. """
  316. _n_domain = _IntegerInterval(endpoints=(0, inf), inclusive=(False, False))
  317. _p_domain = _RealInterval(endpoints=(0, 1), inclusive=(False, False))
  318. _x_support = _IntegerInterval(endpoints=(0, 'n'), inclusive=(True, True))
  319. _n_param = _RealParameter('n', domain=_n_domain, typical=(10, 20))
  320. _p_param = _RealParameter('p', domain=_p_domain, typical=(0.25, 0.75))
  321. _x_param = _RealParameter('x', domain=_x_support, typical=(0, 10))
  322. _parameterizations = [_Parameterization(_n_param, _p_param)]
  323. _variable = _x_param
  324. def __init__(self, *, n, p, **kwargs):
  325. super().__init__(n=n, p=p, **kwargs)
  326. def _pmf_formula(self, x, *, n, p, **kwargs):
  327. return scu._binom_pmf(x, n, p)
  328. def _logpmf_formula(self, x, *, n, p, **kwargs):
  329. # This implementation is from the ``scipy.stats.binom`` and could be improved
  330. # by using a more numerically sound implementation of the absolute value of
  331. # the binomial coefficient.
  332. combiln = (
  333. special.gammaln(n+1) - (special.gammaln(x+1) + special.gammaln(n-x+1))
  334. )
  335. return combiln + special.xlogy(x, p) + special.xlog1py(n-x, -p)
  336. def _cdf_formula(self, x, *, n, p, **kwargs):
  337. return scu._binom_cdf(x, n, p)
  338. def _logcdf_formula(self, x, *, n, p, **kwargs):
  339. # todo: add this strategy to infrastructure more generally, but allow dist
  340. # author to specify threshold other than median in case median is expensive
  341. median = self._icdf_formula(0.5, n=n, p=p)
  342. return xpx.apply_where(x < median, (x, n, p),
  343. lambda *args: np.log(scu._binom_cdf(*args)),
  344. lambda *args: np.log1p(-scu._binom_sf(*args))
  345. )
  346. def _ccdf_formula(self, x, *, n, p, **kwargs):
  347. return scu._binom_sf(x, n, p)
  348. def _logccdf_formula(self, x, *, n, p, **kwargs):
  349. median = self._icdf_formula(0.5, n=n, p=p)
  350. return xpx.apply_where(x < median, (x, n, p),
  351. lambda *args: np.log1p(-scu._binom_cdf(*args)),
  352. lambda *args: np.log(scu._binom_sf(*args))
  353. )
  354. def _icdf_formula(self, x, *, n, p, **kwargs):
  355. return scu._binom_ppf(x, n, p)
  356. def _iccdf_formula(self, x, *, n, p, **kwargs):
  357. return scu._binom_isf(x, n, p)
  358. def _mode_formula(self, *, n, p, **kwargs):
  359. # https://en.wikipedia.org/wiki/Binomial_distribution#Mode
  360. mode = np.floor((n+1)*p)
  361. mode = np.where(p == 1, mode - 1, mode)
  362. return mode[()]
  363. def _moment_raw_formula(self, order, *, n, p, **kwargs):
  364. # https://en.wikipedia.org/wiki/Binomial_distribution#Higher_moments
  365. if order == 1:
  366. return n*p
  367. if order == 2:
  368. return n*p*(1 - p + n*p)
  369. return None
  370. _moment_raw_formula.orders = [1, 2] # type: ignore[attr-defined]
  371. def _moment_central_formula(self, order, *, n, p, **kwargs):
  372. # https://en.wikipedia.org/wiki/Binomial_distribution#Higher_moments
  373. if order == 1:
  374. return np.zeros_like(n)
  375. if order == 2:
  376. return n*p*(1 - p)
  377. if order == 3:
  378. return n*p*(1 - p)*(1 - 2*p)
  379. if order == 4:
  380. return n*p*(1 - p)*(1 + (3*n - 6)*p*(1 - p))
  381. return None
  382. _moment_central_formula.orders = [1, 2, 3, 4] # type: ignore[attr-defined]
  383. # Distribution classes need only define the summary and beginning of the extended
  384. # summary portion of the class documentation. All other documentation, including
  385. # examples, is generated automatically.
  386. _module = sys.modules[__name__].__dict__
  387. for dist_name in __all__:
  388. _module[dist_name].__doc__ = _combine_docs(_module[dist_name])