_kde.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733
  1. #-------------------------------------------------------------------------------
  2. #
  3. # Define classes for (uni/multi)-variate kernel density estimation.
  4. #
  5. # Currently, only Gaussian kernels are implemented.
  6. #
  7. # Written by: Robert Kern
  8. #
  9. # Date: 2004-08-09
  10. #
  11. # Modified: 2005-02-10 by Robert Kern.
  12. # Contributed to SciPy
  13. # 2005-10-07 by Robert Kern.
  14. # Some fixes to match the new scipy_core
  15. #
  16. # Copyright 2004-2005 by Enthought, Inc.
  17. #
  18. #-------------------------------------------------------------------------------
  19. # SciPy imports.
  20. from scipy import linalg, special
  21. from scipy._lib._util import check_random_state, np_vecdot
  22. from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
  23. sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
  24. ones, cov)
  25. import numpy as np
  26. # Local imports.
  27. from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
  28. from ._multivariate import multivariate_normal
  29. __all__ = ['gaussian_kde']
  30. class gaussian_kde:
  31. """Representation of a kernel-density estimate using Gaussian kernels.
  32. Kernel density estimation is a way to estimate the probability density
  33. function (PDF) of a random variable in a non-parametric way.
  34. `gaussian_kde` works for both uni-variate and multi-variate data. It
  35. includes automatic bandwidth determination. The estimation works best for
  36. a unimodal distribution; bimodal or multi-modal distributions tend to be
  37. oversmoothed.
  38. Parameters
  39. ----------
  40. dataset : array_like
  41. Datapoints to estimate from. In case of univariate data this is a 1-D
  42. array, otherwise a 2-D array with shape (# of dims, # of data).
  43. bw_method : str, scalar or callable, optional
  44. The method used to calculate the bandwidth factor. This can be
  45. 'scott', 'silverman', a scalar constant or a callable. If a scalar,
  46. this will be used directly as `factor`. If a callable, it should
  47. take a `gaussian_kde` instance as only parameter and return a scalar.
  48. If None (default), 'scott' is used. See Notes for more details.
  49. weights : array_like, optional
  50. weights of datapoints. This must be the same shape as dataset.
  51. If None (default), the samples are assumed to be equally weighted
  52. Attributes
  53. ----------
  54. dataset : ndarray
  55. The dataset with which `gaussian_kde` was initialized.
  56. d : int
  57. Number of dimensions.
  58. n : int
  59. Number of datapoints.
  60. neff : int
  61. Effective number of datapoints.
  62. .. versionadded:: 1.2.0
  63. factor : float
  64. The bandwidth factor obtained from `covariance_factor`.
  65. covariance : ndarray
  66. The kernel covariance matrix; this is the data covariance matrix
  67. multiplied by the square of the bandwidth factor, e.g.
  68. ``np.cov(dataset) * factor**2``.
  69. inv_cov : ndarray
  70. The inverse of `covariance`.
  71. Methods
  72. -------
  73. evaluate
  74. __call__
  75. integrate_gaussian
  76. integrate_box_1d
  77. integrate_box
  78. integrate_kde
  79. pdf
  80. logpdf
  81. resample
  82. set_bandwidth
  83. covariance_factor
  84. marginal
  85. Notes
  86. -----
  87. Bandwidth selection strongly influences the estimate obtained from the KDE
  88. (much more so than the actual shape of the kernel). Bandwidth selection
  89. can be done by a "rule of thumb", by cross-validation, by "plug-in
  90. methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
  91. uses a rule of thumb, the default is Scott's Rule.
  92. Scott's Rule [1]_, implemented as `scotts_factor`, is::
  93. n**(-1./(d+4)),
  94. with ``n`` the number of data points and ``d`` the number of dimensions.
  95. In the case of unequally weighted points, `scotts_factor` becomes::
  96. neff**(-1./(d+4)),
  97. with ``neff`` the effective number of datapoints.
  98. Silverman's suggestion for *multivariate* data [2]_, implemented as
  99. `silverman_factor`, is::
  100. (n * (d + 2) / 4.)**(-1. / (d + 4)).
  101. or in the case of unequally weighted points::
  102. (neff * (d + 2) / 4.)**(-1. / (d + 4)).
  103. Note that this is not the same as "Silverman's rule of thumb" [6]_, which
  104. may be more robust in the univariate case; see documentation of the
  105. ``set_bandwidth`` method for implementing a custom bandwidth rule.
  106. Good general descriptions of kernel density estimation can be found in [1]_
  107. and [2]_, the mathematics for this multi-dimensional implementation can be
  108. found in [1]_.
  109. With a set of weighted samples, the effective number of datapoints ``neff``
  110. is defined by::
  111. neff = sum(weights)^2 / sum(weights^2)
  112. as detailed in [5]_.
  113. `gaussian_kde` does not currently support data that lies in a
  114. lower-dimensional subspace of the space in which it is expressed. For such
  115. data, consider performing principal component analysis / dimensionality
  116. reduction and using `gaussian_kde` with the transformed data.
  117. References
  118. ----------
  119. .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
  120. Visualization", John Wiley & Sons, New York, Chicester, 1992.
  121. .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
  122. Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
  123. Chapman and Hall, London, 1986.
  124. .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
  125. Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
  126. .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
  127. conditional density estimation", Computational Statistics & Data
  128. Analysis, Vol. 36, pp. 279-298, 2001.
  129. .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
  130. Series A (General), 132, 272
  131. .. [6] Kernel density estimation. *Wikipedia.*
  132. https://en.wikipedia.org/wiki/Kernel_density_estimation
  133. Examples
  134. --------
  135. Generate some random two-dimensional data:
  136. >>> import numpy as np
  137. >>> from scipy import stats
  138. >>> def measure(n):
  139. ... "Measurement model, return two coupled measurements."
  140. ... m1 = np.random.normal(size=n)
  141. ... m2 = np.random.normal(scale=0.5, size=n)
  142. ... return m1+m2, m1-m2
  143. >>> m1, m2 = measure(2000)
  144. >>> xmin = m1.min()
  145. >>> xmax = m1.max()
  146. >>> ymin = m2.min()
  147. >>> ymax = m2.max()
  148. Perform a kernel density estimate on the data:
  149. >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
  150. >>> positions = np.vstack([X.ravel(), Y.ravel()])
  151. >>> values = np.vstack([m1, m2])
  152. >>> kernel = stats.gaussian_kde(values)
  153. >>> Z = np.reshape(kernel(positions).T, X.shape)
  154. Plot the results:
  155. >>> import matplotlib.pyplot as plt
  156. >>> fig, ax = plt.subplots()
  157. >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
  158. ... extent=[xmin, xmax, ymin, ymax])
  159. >>> ax.plot(m1, m2, 'k.', markersize=2)
  160. >>> ax.set_xlim([xmin, xmax])
  161. >>> ax.set_ylim([ymin, ymax])
  162. >>> plt.show()
  163. Compare against manual KDE at a point:
  164. >>> point = [1, 2]
  165. >>> mean = values.T
  166. >>> cov = kernel.factor**2 * np.cov(values)
  167. >>> X = stats.multivariate_normal(cov=cov)
  168. >>> res = kernel.pdf(point)
  169. >>> ref = X.pdf(point - mean).sum() / len(mean)
  170. >>> np.allclose(res, ref)
  171. True
  172. """
  173. def __init__(self, dataset, bw_method=None, weights=None):
  174. self.dataset = atleast_2d(asarray(dataset))
  175. if not self.dataset.size > 1:
  176. raise ValueError("`dataset` input should have multiple elements.")
  177. self.d, self.n = self.dataset.shape
  178. if weights is not None:
  179. self._weights = atleast_1d(weights).astype(float)
  180. self._weights /= sum(self._weights)
  181. if self.weights.ndim != 1:
  182. raise ValueError("`weights` input should be one-dimensional.")
  183. if len(self._weights) != self.n:
  184. raise ValueError("`weights` input should be of length n")
  185. self._neff = 1/np_vecdot(self._weights, self._weights)
  186. # This can be converted to a warning once gh-10205 is resolved
  187. if self.d > self.n:
  188. msg = ("Number of dimensions is greater than number of samples. "
  189. "This results in a singular data covariance matrix, which "
  190. "cannot be treated using the algorithms implemented in "
  191. "`gaussian_kde`. Note that `gaussian_kde` interprets each "
  192. "*column* of `dataset` to be a point; consider transposing "
  193. "the input to `dataset`.")
  194. raise ValueError(msg)
  195. try:
  196. self.set_bandwidth(bw_method=bw_method)
  197. except linalg.LinAlgError as e:
  198. msg = ("The data appears to lie in a lower-dimensional subspace "
  199. "of the space in which it is expressed. This has resulted "
  200. "in a singular data covariance matrix, which cannot be "
  201. "treated using the algorithms implemented in "
  202. "`gaussian_kde`. Consider performing principal component "
  203. "analysis / dimensionality reduction and using "
  204. "`gaussian_kde` with the transformed data.")
  205. raise linalg.LinAlgError(msg) from e
  206. def evaluate(self, points):
  207. """Evaluate the estimated pdf on a set of points.
  208. Parameters
  209. ----------
  210. points : (# of dimensions, # of points)-array
  211. Alternatively, a (# of dimensions,) vector can be passed in and
  212. treated as a single point.
  213. Returns
  214. -------
  215. values : (# of points,)-array
  216. The values at each point.
  217. Raises
  218. ------
  219. ValueError : if the dimensionality of the input points is different than
  220. the dimensionality of the KDE.
  221. """
  222. points = atleast_2d(asarray(points))
  223. d, m = points.shape
  224. if d != self.d:
  225. if d == 1 and m == self.d:
  226. # points was passed in as a row vector
  227. points = reshape(points, (self.d, 1))
  228. m = 1
  229. else:
  230. msg = (f"points have dimension {d}, "
  231. f"dataset has dimension {self.d}")
  232. raise ValueError(msg)
  233. output_dtype, spec = _get_output_dtype(self.covariance, points)
  234. result = gaussian_kernel_estimate[spec](
  235. self.dataset.T, self.weights[:, None],
  236. points.T, self.cho_cov, output_dtype)
  237. return result[:, 0]
  238. __call__ = evaluate
  239. def integrate_gaussian(self, mean, cov):
  240. """
  241. Multiply estimated density by a multivariate Gaussian and integrate
  242. over the whole space.
  243. Parameters
  244. ----------
  245. mean : aray_like
  246. A 1-D array, specifying the mean of the Gaussian.
  247. cov : array_like
  248. A 2-D array, specifying the covariance matrix of the Gaussian.
  249. Returns
  250. -------
  251. result : scalar
  252. The value of the integral.
  253. Raises
  254. ------
  255. ValueError
  256. If the mean or covariance of the input Gaussian differs from
  257. the KDE's dimensionality.
  258. """
  259. mean = atleast_1d(squeeze(mean))
  260. cov = atleast_2d(cov)
  261. if mean.shape != (self.d,):
  262. raise ValueError(f"mean does not have dimension {self.d}")
  263. if cov.shape != (self.d, self.d):
  264. raise ValueError(f"covariance does not have dimension {self.d}")
  265. # make mean a column vector
  266. mean = mean[:, newaxis]
  267. sum_cov = self.covariance + cov
  268. # This will raise LinAlgError if the new cov matrix is not s.p.d
  269. # cho_factor returns (ndarray, bool) where bool is a flag for whether
  270. # or not ndarray is upper or lower triangular
  271. sum_cov_chol = linalg.cho_factor(sum_cov)
  272. diff = self.dataset - mean
  273. tdiff = linalg.cho_solve(sum_cov_chol, diff)
  274. sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
  275. norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
  276. energies = np_vecdot(diff, tdiff, axis=0) / 2.0
  277. result = np_vecdot(exp(-energies), self.weights, axis=0) / norm_const
  278. return result
  279. def integrate_box_1d(self, low, high):
  280. """
  281. Computes the integral of a 1D pdf between two bounds.
  282. Parameters
  283. ----------
  284. low : scalar
  285. Lower bound of integration.
  286. high : scalar
  287. Upper bound of integration.
  288. Returns
  289. -------
  290. value : scalar
  291. The result of the integral.
  292. Raises
  293. ------
  294. ValueError
  295. If the KDE is over more than one dimension.
  296. """
  297. if self.d != 1:
  298. raise ValueError("integrate_box_1d() only handles 1D pdfs")
  299. stdev = ravel(sqrt(self.covariance))[0]
  300. normalized_low = ravel((low - self.dataset) / stdev)
  301. normalized_high = ravel((high - self.dataset) / stdev)
  302. delta = special.ndtr(normalized_high) - special.ndtr(normalized_low)
  303. value = np_vecdot(self.weights, delta)
  304. return value
  305. def integrate_box(self, low_bounds, high_bounds, maxpts=None, *, rng=None):
  306. """Computes the integral of a pdf over a rectangular interval.
  307. Parameters
  308. ----------
  309. low_bounds : array_like
  310. A 1-D array containing the lower bounds of integration.
  311. high_bounds : array_like
  312. A 1-D array containing the upper bounds of integration.
  313. maxpts : int, optional
  314. The maximum number of points to use for integration.
  315. rng : `numpy.random.Generator`, optional
  316. Pseudorandom number generator state. When `rng` is None, a new
  317. generator is created using entropy from the operating system. Types
  318. other than `numpy.random.Generator` are passed to
  319. `numpy.random.default_rng` to instantiate a ``Generator``.
  320. Returns
  321. -------
  322. value : scalar
  323. The result of the integral.
  324. """
  325. low, high = low_bounds - self.dataset.T, high_bounds - self.dataset.T
  326. values = multivariate_normal.cdf(
  327. high, lower_limit=low, cov=self.covariance, maxpts=maxpts,
  328. rng=rng
  329. )
  330. return np_vecdot(values, self.weights, axis=-1)
  331. def integrate_kde(self, other):
  332. """
  333. Computes the integral of the product of this kernel density estimate
  334. with another.
  335. Parameters
  336. ----------
  337. other : gaussian_kde instance
  338. The other kde.
  339. Returns
  340. -------
  341. value : scalar
  342. The result of the integral.
  343. Raises
  344. ------
  345. ValueError
  346. If the KDEs have different dimensionality.
  347. """
  348. if other.d != self.d:
  349. raise ValueError("KDEs are not the same dimensionality")
  350. # we want to iterate over the smallest number of points
  351. if other.n < self.n:
  352. small = other
  353. large = self
  354. else:
  355. small = self
  356. large = other
  357. sum_cov = small.covariance + large.covariance
  358. sum_cov_chol = linalg.cho_factor(sum_cov)
  359. result = 0.0
  360. for i in range(small.n):
  361. mean = small.dataset[:, i, newaxis]
  362. diff = large.dataset - mean
  363. tdiff = linalg.cho_solve(sum_cov_chol, diff)
  364. energies = np_vecdot(diff, tdiff, axis=0) / 2.0
  365. result += np_vecdot(exp(-energies), large.weights, axis=0)*small.weights[i]
  366. sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
  367. norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
  368. result /= norm_const
  369. return result
  370. def resample(self, size=None, seed=None):
  371. """Randomly sample a dataset from the estimated pdf.
  372. Parameters
  373. ----------
  374. size : int, optional
  375. The number of samples to draw. If not provided, then the size is
  376. the same as the effective number of samples in the underlying
  377. dataset.
  378. seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
  379. If `seed` is None (or `np.random`), the `numpy.random.RandomState`
  380. singleton is used.
  381. If `seed` is an int, a new ``RandomState`` instance is used,
  382. seeded with `seed`.
  383. If `seed` is already a ``Generator`` or ``RandomState`` instance then
  384. that instance is used.
  385. Returns
  386. -------
  387. resample : (self.d, `size`) ndarray
  388. The sampled dataset.
  389. """ # numpy/numpydoc#87 # noqa: E501
  390. if size is None:
  391. size = int(self.neff)
  392. random_state = check_random_state(seed)
  393. norm = transpose(random_state.multivariate_normal(
  394. zeros((self.d,), float), self.covariance, size=size
  395. ))
  396. indices = random_state.choice(self.n, size=size, p=self.weights)
  397. means = self.dataset[:, indices]
  398. return means + norm
  399. def scotts_factor(self):
  400. """Compute Scott's factor.
  401. Returns
  402. -------
  403. s : float
  404. Scott's factor.
  405. """
  406. return power(self.neff, -1./(self.d+4))
  407. def silverman_factor(self):
  408. """Compute the Silverman factor.
  409. Returns
  410. -------
  411. s : float
  412. The silverman factor.
  413. """
  414. return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
  415. # Default method to calculate bandwidth, can be overwritten by subclass
  416. covariance_factor = scotts_factor
  417. covariance_factor.__doc__ = """Computes the bandwidth factor `factor`.
  418. The default is `scotts_factor`. A subclass can overwrite this
  419. method to provide a different method, or set it through a call to
  420. `set_bandwidth`."""
  421. def set_bandwidth(self, bw_method=None):
  422. """Compute the bandwidth factor with given method.
  423. The new bandwidth calculated after a call to `set_bandwidth` is used
  424. for subsequent evaluations of the estimated density.
  425. Parameters
  426. ----------
  427. bw_method : str, scalar or callable, optional
  428. The method used to calculate the bandwidth factor. This can be
  429. 'scott', 'silverman', a scalar constant or a callable. If a
  430. scalar, this will be used directly as `factor`. If a callable,
  431. it should take a `gaussian_kde` instance as only parameter and
  432. return a scalar. If None (default), nothing happens; the current
  433. `covariance_factor` method is kept.
  434. Notes
  435. -----
  436. .. versionadded:: 0.11
  437. Examples
  438. --------
  439. >>> import numpy as np
  440. >>> import scipy.stats as stats
  441. >>> x1 = np.array([-7, -5, 1, 4, 5.])
  442. >>> kde = stats.gaussian_kde(x1)
  443. >>> xs = np.linspace(-10, 10, num=50)
  444. >>> y1 = kde(xs)
  445. >>> kde.set_bandwidth(bw_method='silverman')
  446. >>> y2 = kde(xs)
  447. >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
  448. >>> y3 = kde(xs)
  449. >>> import matplotlib.pyplot as plt
  450. >>> fig, ax = plt.subplots()
  451. >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
  452. ... label='Data points (rescaled)')
  453. >>> ax.plot(xs, y1, label='Scott (default)')
  454. >>> ax.plot(xs, y2, label='Silverman')
  455. >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
  456. >>> ax.legend()
  457. >>> plt.show()
  458. """
  459. if bw_method is None:
  460. pass
  461. elif bw_method == 'scott':
  462. self.covariance_factor = self.scotts_factor
  463. elif bw_method == 'silverman':
  464. self.covariance_factor = self.silverman_factor
  465. elif np.isscalar(bw_method) and not isinstance(bw_method, str):
  466. self._bw_method = 'use constant'
  467. self.covariance_factor = lambda: bw_method
  468. elif callable(bw_method):
  469. self._bw_method = bw_method
  470. self.covariance_factor = lambda: self._bw_method(self)
  471. else:
  472. msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
  473. "or a callable."
  474. raise ValueError(msg)
  475. self._compute_covariance()
  476. def _compute_covariance(self):
  477. """Computes the covariance matrix for each Gaussian kernel using
  478. covariance_factor().
  479. """
  480. self.factor = self.covariance_factor()
  481. # Cache covariance and Cholesky decomp of covariance
  482. if not hasattr(self, '_data_cho_cov'):
  483. self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
  484. bias=False,
  485. aweights=self.weights))
  486. self._data_cho_cov = linalg.cholesky(self._data_covariance,
  487. lower=True)
  488. self.covariance = self._data_covariance * self.factor**2
  489. self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
  490. self.log_det = 2*np.log(np.diag(self.cho_cov
  491. * np.sqrt(2*pi))).sum()
  492. @property
  493. def inv_cov(self):
  494. # Re-compute from scratch each time because I'm not sure how this is
  495. # used in the wild. (Perhaps users change the `dataset`, since it's
  496. # not a private attribute?) `_compute_covariance` used to recalculate
  497. # all these, so we'll recalculate everything now that this is a
  498. # a property.
  499. self.factor = self.covariance_factor()
  500. self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
  501. bias=False, aweights=self.weights))
  502. return linalg.inv(self._data_covariance) / self.factor**2
  503. def pdf(self, x):
  504. """
  505. Evaluate the estimated pdf on a provided set of points.
  506. Notes
  507. -----
  508. This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
  509. docstring for more details.
  510. """
  511. return self.evaluate(x)
  512. def logpdf(self, x):
  513. """
  514. Evaluate the log of the estimated pdf on a provided set of points.
  515. """
  516. points = atleast_2d(x)
  517. d, m = points.shape
  518. if d != self.d:
  519. if d == 1 and m == self.d:
  520. # points was passed in as a row vector
  521. points = reshape(points, (self.d, 1))
  522. m = 1
  523. else:
  524. msg = (f"points have dimension {d}, "
  525. f"dataset has dimension {self.d}")
  526. raise ValueError(msg)
  527. output_dtype, spec = _get_output_dtype(self.covariance, points)
  528. result = gaussian_kernel_estimate_log[spec](
  529. self.dataset.T, self.weights[:, None],
  530. points.T, self.cho_cov, output_dtype)
  531. return result[:, 0]
  532. def marginal(self, dimensions):
  533. """Return a marginal KDE distribution
  534. Parameters
  535. ----------
  536. dimensions : int or 1-d array_like
  537. The dimensions of the multivariate distribution corresponding
  538. with the marginal variables, that is, the indices of the dimensions
  539. that are being retained. The other dimensions are marginalized out.
  540. Returns
  541. -------
  542. marginal_kde : gaussian_kde
  543. An object representing the marginal distribution.
  544. Notes
  545. -----
  546. .. versionadded:: 1.10.0
  547. """
  548. dims = np.atleast_1d(dimensions)
  549. if not np.issubdtype(dims.dtype, np.integer):
  550. msg = ("Elements of `dimensions` must be integers - the indices "
  551. "of the marginal variables being retained.")
  552. raise ValueError(msg)
  553. n = len(self.dataset) # number of dimensions
  554. original_dims = dims.copy()
  555. dims[dims < 0] = n + dims[dims < 0]
  556. if len(np.unique(dims)) != len(dims):
  557. msg = ("All elements of `dimensions` must be unique.")
  558. raise ValueError(msg)
  559. i_invalid = (dims < 0) | (dims >= n)
  560. if np.any(i_invalid):
  561. msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
  562. f"for a distribution in {n} dimensions.")
  563. raise ValueError(msg)
  564. dataset = self.dataset[dims]
  565. weights = self.weights
  566. return gaussian_kde(dataset, bw_method=self.covariance_factor(),
  567. weights=weights)
  568. @property
  569. def weights(self):
  570. try:
  571. return self._weights
  572. except AttributeError:
  573. self._weights = ones(self.n)/self.n
  574. return self._weights
  575. @property
  576. def neff(self):
  577. try:
  578. return self._neff
  579. except AttributeError:
  580. self._neff = 1/np_vecdot(self.weights, self.weights)
  581. return self._neff
  582. def _get_output_dtype(covariance, points):
  583. """
  584. Calculates the output dtype and the "spec" (=C type name).
  585. This was necessary in order to deal with the fused types in the Cython
  586. routine `gaussian_kernel_estimate`. See gh-10824 for details.
  587. """
  588. output_dtype = np.common_type(covariance, points)
  589. itemsize = np.dtype(output_dtype).itemsize
  590. if itemsize == 4:
  591. spec = 'float'
  592. elif itemsize == 8:
  593. spec = 'double'
  594. elif itemsize in (12, 16):
  595. spec = 'long double'
  596. else:
  597. raise ValueError(
  598. f"{output_dtype} has unexpected item size: {itemsize}"
  599. )
  600. return output_dtype, spec