_entropy.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. """
  2. Created on Fri Apr 2 09:06:05 2021
  3. @author: matth
  4. """
  5. import math
  6. import numpy as np
  7. from scipy import special
  8. from ._axis_nan_policy import _axis_nan_policy_factory
  9. from scipy._lib._array_api import (array_namespace, xp_promote, xp_device,
  10. is_marray, _share_masks, xp_capabilities)
  11. __all__ = ['entropy', 'differential_entropy']
  12. @xp_capabilities()
  13. @_axis_nan_policy_factory(
  14. lambda x: x,
  15. n_samples=lambda kwgs: (
  16. 2 if ("qk" in kwgs and kwgs["qk"] is not None)
  17. else 1
  18. ),
  19. n_outputs=1, result_to_tuple=lambda x, _: (x,), paired=True,
  20. too_small=-1 # entropy doesn't have too small inputs
  21. )
  22. def entropy(pk: np.typing.ArrayLike,
  23. qk: np.typing.ArrayLike | None = None,
  24. base: float | None = None,
  25. axis: int = 0
  26. ) -> np.number | np.ndarray:
  27. """
  28. Calculate the Shannon entropy/relative entropy of given distribution(s).
  29. If only probabilities `pk` are given, the Shannon entropy is calculated as
  30. ``H = -sum(pk * log(pk))``.
  31. If `qk` is not None, then compute the relative entropy
  32. ``D = sum(pk * log(pk / qk))``. This quantity is also known
  33. as the Kullback-Leibler divergence.
  34. This routine will normalize `pk` and `qk` if they don't sum to 1.
  35. Parameters
  36. ----------
  37. pk : array_like
  38. Defines the (discrete) distribution. Along each axis-slice of ``pk``,
  39. element ``i`` is the (possibly unnormalized) probability of event
  40. ``i``.
  41. qk : array_like, optional
  42. Sequence against which the relative entropy is computed. Should be in
  43. the same format as `pk`.
  44. base : float, optional
  45. The logarithmic base to use, defaults to ``e`` (natural logarithm).
  46. axis : int, optional
  47. The axis along which the entropy is calculated. Default is 0.
  48. Returns
  49. -------
  50. S : {float, array_like}
  51. The calculated entropy.
  52. Notes
  53. -----
  54. Informally, the Shannon entropy quantifies the expected uncertainty
  55. inherent in the possible outcomes of a discrete random variable.
  56. For example,
  57. if messages consisting of sequences of symbols from a set are to be
  58. encoded and transmitted over a noiseless channel, then the Shannon entropy
  59. ``H(pk)`` gives a tight lower bound for the average number of units of
  60. information needed per symbol if the symbols occur with frequencies
  61. governed by the discrete distribution `pk` [1]_. The choice of base
  62. determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
  63. The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
  64. number of units of information needed per symbol if the encoding is
  65. optimized for the probability distribution `qk` instead of the true
  66. distribution `pk`. Informally, the relative entropy quantifies the expected
  67. excess in surprise experienced if one believes the true distribution is
  68. `qk` when it is actually `pk`.
  69. A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
  70. equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
  71. the formula ``CE = -sum(pk * log(qk))``. It gives the average
  72. number of units of information needed per symbol if an encoding is
  73. optimized for the probability distribution `qk` when the true distribution
  74. is `pk`. It is not computed directly by `entropy`, but it can be computed
  75. using two calls to the function (see Examples).
  76. See [2]_ for more information.
  77. References
  78. ----------
  79. .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
  80. Bell System Technical Journal, 27: 379-423.
  81. https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
  82. .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
  83. Theory (Wiley Series in Telecommunications and Signal Processing).
  84. Wiley-Interscience, USA.
  85. Examples
  86. --------
  87. The outcome of a fair coin is the most uncertain:
  88. >>> import numpy as np
  89. >>> from scipy.stats import entropy
  90. >>> base = 2 # work in units of bits
  91. >>> pk = np.array([1/2, 1/2]) # fair coin
  92. >>> H = entropy(pk, base=base)
  93. >>> H
  94. 1.0
  95. >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
  96. True
  97. The outcome of a biased coin is less uncertain:
  98. >>> qk = np.array([9/10, 1/10]) # biased coin
  99. >>> entropy(qk, base=base)
  100. 0.46899559358928117
  101. The relative entropy between the fair coin and biased coin is calculated
  102. as:
  103. >>> D = entropy(pk, qk, base=base)
  104. >>> D
  105. 0.7369655941662062
  106. >>> np.isclose(D, np.sum(pk * np.log(pk/qk)) / np.log(base), rtol=4e-16, atol=0)
  107. True
  108. The cross entropy can be calculated as the sum of the entropy and
  109. relative entropy`:
  110. >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
  111. >>> CE
  112. 1.736965594166206
  113. >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
  114. True
  115. """
  116. if base is not None and base <= 0:
  117. raise ValueError("`base` must be a positive number or `None`.")
  118. xp = array_namespace(pk, qk)
  119. pk, qk = xp_promote(pk, qk, broadcast=True, xp=xp)
  120. with np.errstate(invalid='ignore'):
  121. if qk is not None:
  122. pk, qk = _share_masks(pk, qk, xp=xp)
  123. qk = qk / xp.sum(qk, axis=axis, keepdims=True)
  124. pk = pk / xp.sum(pk, axis=axis, keepdims=True)
  125. if qk is None:
  126. vec = special.entr(pk)
  127. else:
  128. if is_marray(xp): # compensate for mdhaber/marray#97
  129. vec = special.rel_entr(pk.data, qk.data) # type: ignore[union-attr]
  130. vec = xp.asarray(vec, mask=pk.mask) # type: ignore[union-attr]
  131. else:
  132. vec = special.rel_entr(pk, qk)
  133. S = xp.sum(vec, axis=axis)
  134. if base is not None:
  135. S /= math.log(base)
  136. return S
  137. def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
  138. values = samples[0]
  139. n = values.shape[axis]
  140. window_length = kwargs.get("window_length")
  141. if window_length is None:
  142. window_length = math.floor(math.sqrt(n) + 0.5)
  143. if not 2 <= 2 * window_length < n:
  144. return True
  145. return False
  146. @xp_capabilities()
  147. @_axis_nan_policy_factory(
  148. lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,),
  149. too_small=_differential_entropy_is_too_small
  150. )
  151. def differential_entropy(
  152. values: np.typing.ArrayLike,
  153. *,
  154. window_length: int | None = None,
  155. base: float | None = None,
  156. axis: int = 0,
  157. method: str = "auto",
  158. ) -> np.number | np.ndarray:
  159. r"""Given a sample of a distribution, estimate the differential entropy.
  160. Several estimation methods are available using the `method` parameter. By
  161. default, a method is selected based the size of the sample.
  162. Parameters
  163. ----------
  164. values : sequence
  165. Sample from a continuous distribution.
  166. window_length : int, optional
  167. Window length for computing Vasicek estimate. Must be an integer
  168. between 1 and half of the sample size. If ``None`` (the default), it
  169. uses the heuristic value
  170. .. math::
  171. \left \lfloor \sqrt{n} + 0.5 \right \rfloor
  172. where :math:`n` is the sample size. This heuristic was originally
  173. proposed in [2]_ and has become common in the literature.
  174. base : float, optional
  175. The logarithmic base to use, defaults to ``e`` (natural logarithm).
  176. axis : int, optional
  177. The axis along which the differential entropy is calculated.
  178. Default is 0.
  179. method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
  180. The method used to estimate the differential entropy from the sample.
  181. Default is ``'auto'``. See Notes for more information.
  182. Returns
  183. -------
  184. entropy : float
  185. The calculated differential entropy.
  186. Notes
  187. -----
  188. This function will converge to the true differential entropy in the limit
  189. .. math::
  190. n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
  191. The optimal choice of ``window_length`` for a given sample size depends on
  192. the (unknown) distribution. Typically, the smoother the density of the
  193. distribution, the larger the optimal value of ``window_length`` [1]_.
  194. The following options are available for the `method` parameter.
  195. * ``'vasicek'`` uses the estimator presented in [1]_. This is
  196. one of the first and most influential estimators of differential entropy.
  197. * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
  198. is not only consistent but, under some conditions, asymptotically normal.
  199. * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
  200. in simulation to have smaller bias and mean squared error than
  201. the Vasicek estimator.
  202. * ``'correa'`` uses the estimator presented in [5]_ based on local linear
  203. regression. In a simulation study, it had consistently smaller mean
  204. square error than the Vasiceck estimator, but it is more expensive to
  205. compute.
  206. * ``'auto'`` selects the method automatically (default). Currently,
  207. this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
  208. for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
  209. samples, but this behavior is subject to change in future versions.
  210. All estimators are implemented as described in [6]_.
  211. References
  212. ----------
  213. .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
  214. Journal of the Royal Statistical Society:
  215. Series B (Methodological), 38(1), 54-59.
  216. .. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
  217. goodness-of-fit test for exponentiality. Communications in
  218. Statistics-Theory and Methods, 28(5), 1183-1202.
  219. .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
  220. class of statistics based on spacings. Scandinavian Journal of
  221. Statistics, 61-72.
  222. .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
  223. of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
  224. .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
  225. in Statistics-Theory and Methods, 24(10), 2439-2449.
  226. .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
  227. Annals of Data Science, 2(2), 231-241.
  228. https://link.springer.com/article/10.1007/s40745-015-0045-9
  229. Examples
  230. --------
  231. >>> import numpy as np
  232. >>> from scipy.stats import differential_entropy, norm
  233. Entropy of a standard normal distribution:
  234. >>> rng = np.random.default_rng()
  235. >>> values = rng.standard_normal(100)
  236. >>> differential_entropy(values)
  237. 1.3407817436640392
  238. Compare with the true entropy:
  239. >>> float(norm.entropy())
  240. 1.4189385332046727
  241. For several sample sizes between 5 and 1000, compare the accuracy of
  242. the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
  243. compare the root mean squared error (over 1000 trials) between the estimate
  244. and the true differential entropy of the distribution.
  245. >>> from scipy import stats
  246. >>> import matplotlib.pyplot as plt
  247. >>>
  248. >>>
  249. >>> def rmse(res, expected):
  250. ... '''Root mean squared error'''
  251. ... return np.sqrt(np.mean((res - expected)**2))
  252. >>>
  253. >>>
  254. >>> a, b = np.log10(5), np.log10(1000)
  255. >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
  256. >>> reps = 1000 # number of repetitions for each sample size
  257. >>> expected = stats.expon.entropy()
  258. >>>
  259. >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
  260. >>> for method in method_errors:
  261. ... for n in ns:
  262. ... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
  263. ... res = stats.differential_entropy(rvs, method=method, axis=-1)
  264. ... error = rmse(res, expected)
  265. ... method_errors[method].append(error)
  266. >>>
  267. >>> for method, errors in method_errors.items():
  268. ... plt.loglog(ns, errors, label=method)
  269. >>>
  270. >>> plt.legend()
  271. >>> plt.xlabel('sample size')
  272. >>> plt.ylabel('RMSE (1000 trials)')
  273. >>> plt.title('Entropy Estimator Error (Exponential Distribution)')
  274. """
  275. xp = array_namespace(values)
  276. values = xp_promote(values, force_floating=True, xp=xp)
  277. values = xp.moveaxis(values, axis, -1)
  278. n = values.shape[-1] # type: ignore[union-attr]
  279. if window_length is None:
  280. window_length = math.floor(math.sqrt(n) + 0.5)
  281. if not 2 <= 2 * window_length < n:
  282. raise ValueError(
  283. f"Window length ({window_length}) must be positive and less "
  284. f"than half the sample size ({n}).",
  285. )
  286. if base is not None and base <= 0:
  287. raise ValueError("`base` must be a positive number or `None`.")
  288. sorted_data = xp.sort(values, axis=-1)
  289. methods = {"vasicek": _vasicek_entropy,
  290. "van es": _van_es_entropy,
  291. "correa": _correa_entropy,
  292. "ebrahimi": _ebrahimi_entropy,
  293. "auto": _vasicek_entropy}
  294. method = method.lower()
  295. if method not in methods:
  296. message = f"`method` must be one of {set(methods)}"
  297. raise ValueError(message)
  298. if method == "auto":
  299. if n <= 10:
  300. method = 'van es'
  301. elif n <= 1000:
  302. method = 'ebrahimi'
  303. else:
  304. method = 'vasicek'
  305. res = methods[method](sorted_data, window_length, xp=xp)
  306. if base is not None:
  307. res /= math.log(base)
  308. # avoid dtype changes due to data-apis/array-api-compat#152
  309. # can be removed when data-apis/array-api-compat#152 is resolved
  310. return xp.astype(res, values.dtype) # type: ignore[union-attr]
  311. def _pad_along_last_axis(X, m, *, xp):
  312. """Pad the data for computing the rolling window difference."""
  313. # scales a bit better than method in _vasicek_like_entropy
  314. shape = X.shape[:-1] + (m,)
  315. Xl = xp.broadcast_to(X[..., :1], shape) # :1 vs 0 to maintain shape
  316. Xr = xp.broadcast_to(X[..., -1:], shape)
  317. return xp.concat((Xl, X, Xr), axis=-1)
  318. def _vasicek_entropy(X, m, *, xp):
  319. """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
  320. n = X.shape[-1]
  321. X = _pad_along_last_axis(X, m, xp=xp)
  322. differences = X[..., 2 * m:] - X[..., : -2 * m:]
  323. logs = xp.log(n/(2*m) * differences)
  324. return xp.mean(logs, axis=-1)
  325. def _van_es_entropy(X, m, *, xp):
  326. """Compute the van Es estimator as described in [6]."""
  327. # No equation number, but referred to as HVE_mn.
  328. # Typo: there should be a log within the summation.
  329. n = X.shape[-1]
  330. difference = X[..., m:] - X[..., :-m]
  331. term1 = 1/(n-m) * xp.sum(xp.log((n+1)/m * difference), axis=-1)
  332. k = xp.arange(m, n+1, dtype=term1.dtype, device=xp_device(X))
  333. return term1 + xp.sum(1/k) + math.log(m) - math.log(n+1)
  334. def _ebrahimi_entropy(X, m, *, xp):
  335. """Compute the Ebrahimi estimator as described in [6]."""
  336. # No equation number, but referred to as HE_mn
  337. n = X.shape[-1]
  338. X = _pad_along_last_axis(X, m, xp=xp)
  339. differences = X[..., 2 * m:] - X[..., : -2 * m:]
  340. i = xp.arange(1, n+1, dtype=X.dtype, device=xp_device(X))
  341. ci = xp.where(i <= m, 1 + (i - 1)/m, 2.)
  342. ci = xp.where(i >= n - m + 1, 1 + (n - i)/m, ci)
  343. logs = xp.log(n * differences / (ci * m))
  344. return xp.mean(logs, axis=-1)
  345. def _correa_entropy(X, m, *, xp):
  346. """Compute the Correa estimator as described in [6]."""
  347. # No equation number, but referred to as HC_mn
  348. n = X.shape[-1]
  349. X = _pad_along_last_axis(X, m, xp=xp)
  350. i = xp.arange(1, n+1, device=xp_device(X))
  351. dj = xp.arange(-m, m+1, device=xp_device(X))[:, None]
  352. j = i + dj
  353. j0 = j + m - 1 # 0-indexed version of j
  354. Xibar = xp.mean(X[..., j0], axis=-2, keepdims=True)
  355. difference = X[..., j0] - Xibar
  356. num = xp.sum(difference*dj, axis=-2) # dj is d-i
  357. den = n*xp.sum(difference**2, axis=-2)
  358. return -xp.mean(xp.log(num/den), axis=-1)