test_kdeoth.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677
  1. from scipy import stats, linalg, integrate
  2. import numpy as np
  3. from numpy.testing import (assert_almost_equal, assert_, assert_equal,
  4. assert_array_almost_equal,
  5. assert_array_almost_equal_nulp, assert_allclose)
  6. import pytest
  7. from pytest import raises as assert_raises
  8. def test_kde_1d():
  9. #some basic tests comparing to normal distribution
  10. rng = np.random.default_rng(8765678)
  11. n_basesample = 500
  12. xn = rng.normal(0, 1, n_basesample)
  13. xnmean = xn.mean()
  14. xnstd = xn.std(ddof=1)
  15. # get kde for original sample
  16. gkde = stats.gaussian_kde(xn)
  17. # evaluate the density function for the kde for some points
  18. xx = np.asarray([0.1, 0.5, 0.9])
  19. loc, scale = gkde.dataset, np.sqrt(gkde.covariance)
  20. assert_allclose(
  21. gkde(xx),
  22. stats.norm.pdf(xx[:, None], loc=loc, scale=scale).sum(axis=-1) / gkde.n,
  23. rtol=5e-14
  24. )
  25. xs = np.linspace(-7, 7, 501)
  26. kdepdf = gkde.evaluate(xs)
  27. normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
  28. intervall = xs[1] - xs[0]
  29. assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
  30. prob1 = gkde.integrate_box_1d(xnmean, np.inf)
  31. prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
  32. assert_almost_equal(prob1, 0.5, decimal=1)
  33. assert_almost_equal(prob2, 0.5, decimal=1)
  34. assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
  35. assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)
  36. assert_almost_equal(gkde.integrate_kde(gkde),
  37. (kdepdf**2).sum()*intervall, decimal=2)
  38. assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
  39. (kdepdf*normpdf).sum()*intervall, decimal=2)
  40. def test_kde_1d_weighted():
  41. #some basic tests comparing to normal distribution
  42. rng = np.random.default_rng(8765678)
  43. n_basesample = 500
  44. xn = rng.normal(0, 1, n_basesample)
  45. wn = rng.random(n_basesample)
  46. xnmean = np.average(xn, weights=wn)
  47. xnstd = np.sqrt(np.average((xn-xnmean)**2, weights=wn))
  48. # get kde for original sample
  49. gkde = stats.gaussian_kde(xn, weights=wn)
  50. # evaluate the density function for the kde for some points
  51. # evaluate the density function for the kde for some points
  52. xx = np.asarray([0.1, 0.5, 0.9])
  53. loc, scale = gkde.dataset, np.sqrt(gkde.covariance)
  54. pdf = stats.norm.pdf
  55. assert_allclose(
  56. gkde(xx),
  57. np.sum(pdf(xx[:, None], loc=loc, scale=scale) * gkde.weights, axis=-1),
  58. rtol=5e-14
  59. )
  60. xs = np.linspace(-7, 7, 501)
  61. kdepdf = gkde.evaluate(xs)
  62. normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
  63. intervall = xs[1] - xs[0]
  64. assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
  65. prob1 = gkde.integrate_box_1d(xnmean, np.inf)
  66. prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
  67. assert_almost_equal(prob1, 0.5, decimal=1)
  68. assert_almost_equal(prob2, 0.5, decimal=1)
  69. assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
  70. assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)
  71. assert_almost_equal(gkde.integrate_kde(gkde),
  72. (kdepdf**2).sum()*intervall, decimal=2)
  73. assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
  74. (kdepdf*normpdf).sum()*intervall, decimal=2)
  75. @pytest.mark.parametrize("n_basesample",
  76. [
  77. 20,
  78. pytest.param(500, marks=[pytest.mark.xslow])
  79. ]
  80. )
  81. def test_kde_2d(n_basesample):
  82. #some basic tests comparing to normal distribution
  83. rng = np.random.default_rng(8765678)
  84. mean = np.array([1.0, 3.0])
  85. covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
  86. # Need transpose (shape (2, 500)) for kde
  87. xn = rng.multivariate_normal(mean, covariance, size=n_basesample).T
  88. # get kde for original sample
  89. gkde = stats.gaussian_kde(xn)
  90. # evaluate vs multivariate normal, using the KDE definition
  91. xx = np.asarray([[1, 2], [3, 4], [5, 6]])
  92. arg = xx[:, None, :] - gkde.dataset.T
  93. pdf = stats.multivariate_normal.pdf
  94. assert_allclose(
  95. gkde(xx.T),
  96. pdf(arg, cov=gkde.covariance).sum(axis=-1) / gkde.n,
  97. rtol=5e-14
  98. )
  99. # ... and cdf
  100. cdf = stats.multivariate_normal.cdf
  101. lo, hi = [-1, -2], [0, 0]
  102. lo_, hi_ = lo - gkde.dataset.T, hi - gkde.dataset.T
  103. assert_allclose(
  104. gkde.integrate_box(lo, hi, rng=rng),
  105. cdf(hi_, lower_limit=lo_, cov=gkde.covariance, rng=rng).sum(axis=-1) / gkde.n,
  106. rtol=5e-7
  107. )
  108. # evaluate the density function for the kde for some points
  109. x, y = np.mgrid[-7:7:500j, -7:7:500j]
  110. grid_coords = np.vstack([x.ravel(), y.ravel()])
  111. kdepdf = gkde.evaluate(grid_coords)
  112. kdepdf = kdepdf.reshape(500, 500)
  113. normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]),
  114. mean=mean, cov=covariance)
  115. intervall = y.ravel()[1] - y.ravel()[0]
  116. assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)
  117. small = -1e100
  118. large = 1e100
  119. prob1 = gkde.integrate_box([small, mean[1]], [large, large], rng=rng)
  120. prob2 = gkde.integrate_box([small, small], [large, mean[1]], rng=rng)
  121. assert_almost_equal(prob1, 0.5, decimal=1)
  122. assert_almost_equal(prob2, 0.5, decimal=1)
  123. assert_almost_equal(gkde.integrate_kde(gkde),
  124. (kdepdf**2).sum()*(intervall**2), decimal=2)
  125. assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
  126. (kdepdf*normpdf).sum()*(intervall**2), decimal=2)
  127. @pytest.mark.parametrize("n_basesample",
  128. [
  129. 20,
  130. pytest.param(500, marks=[pytest.mark.xslow])
  131. ]
  132. )
  133. def test_kde_2d_weighted(n_basesample):
  134. #some basic tests comparing to normal distribution
  135. rng = np.random.RandomState(8765678)
  136. mean = np.array([1.0, 3.0])
  137. covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
  138. # Need transpose (shape (2, 500)) for kde
  139. xn = rng.multivariate_normal(mean, covariance, size=n_basesample).T
  140. wn = rng.rand(n_basesample)
  141. # get kde for original sample
  142. gkde = stats.gaussian_kde(xn, weights=wn)
  143. # evaluate vs multivariate normal, using the kde definition
  144. xx = np.asarray([[1, 2], [3, 4], [5, 6]])
  145. arg = xx[:, None, :] - gkde.dataset.T
  146. pdf = stats.multivariate_normal.pdf
  147. assert_allclose(
  148. gkde(xx.T),
  149. np.sum(pdf(arg, cov=gkde.covariance) * gkde.weights, axis=-1),
  150. rtol=5e-14
  151. )
  152. # ... and cdf
  153. cdf = stats.multivariate_normal.cdf
  154. lo, hi = [-1, -2], [0, 0]
  155. lo_, hi_ = lo - gkde.dataset.T, hi - gkde.dataset.T
  156. assert_allclose(
  157. gkde.integrate_box(lo, hi, rng=rng),
  158. np.sum(cdf(hi_, lower_limit=lo_, cov=gkde.covariance, rng=rng) *
  159. gkde.weights, axis=-1),
  160. rtol=5e-6
  161. )
  162. # evaluate the density function for the kde for some points
  163. x, y = np.mgrid[-7:7:500j, -7:7:500j]
  164. grid_coords = np.vstack([x.ravel(), y.ravel()])
  165. kdepdf = gkde.evaluate(grid_coords)
  166. kdepdf = kdepdf.reshape(500, 500)
  167. normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]),
  168. mean=mean, cov=covariance)
  169. intervall = y.ravel()[1] - y.ravel()[0]
  170. assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)
  171. small = -1e100
  172. large = 1e100
  173. prob1 = gkde.integrate_box([small, mean[1]], [large, large], rng=rng)
  174. prob2 = gkde.integrate_box([small, small], [large, mean[1]], rng=rng)
  175. assert_almost_equal(prob1, 0.5, decimal=1)
  176. assert_almost_equal(prob2, 0.5, decimal=1)
  177. assert_almost_equal(gkde.integrate_kde(gkde),
  178. (kdepdf**2).sum()*(intervall**2), decimal=2)
  179. assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
  180. (kdepdf*normpdf).sum()*(intervall**2), decimal=2)
  181. def test_kde_bandwidth_method():
  182. def scotts_factor(kde_obj):
  183. """Same as default, just check that it works."""
  184. return np.power(kde_obj.n, -1./(kde_obj.d+4))
  185. rng = np.random.default_rng(8765678)
  186. n_basesample = 50
  187. xn = rng.normal(0, 1, n_basesample)
  188. # Default
  189. gkde = stats.gaussian_kde(xn)
  190. # Supply a callable
  191. gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
  192. # Supply a scalar
  193. gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)
  194. xs = np.linspace(-7,7,51)
  195. kdepdf = gkde.evaluate(xs)
  196. kdepdf2 = gkde2.evaluate(xs)
  197. assert_almost_equal(kdepdf, kdepdf2)
  198. kdepdf3 = gkde3.evaluate(xs)
  199. assert_almost_equal(kdepdf, kdepdf3)
  200. assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
  201. def test_kde_bandwidth_method_weighted():
  202. def scotts_factor(kde_obj):
  203. """Same as default, just check that it works."""
  204. return np.power(kde_obj.neff, -1./(kde_obj.d+4))
  205. rng = np.random.default_rng(8765678)
  206. n_basesample = 50
  207. xn = rng.normal(0, 1, n_basesample)
  208. # Default
  209. gkde = stats.gaussian_kde(xn)
  210. # Supply a callable
  211. gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
  212. # Supply a scalar
  213. gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)
  214. xs = np.linspace(-7,7,51)
  215. kdepdf = gkde.evaluate(xs)
  216. kdepdf2 = gkde2.evaluate(xs)
  217. assert_almost_equal(kdepdf, kdepdf2)
  218. kdepdf3 = gkde3.evaluate(xs)
  219. assert_almost_equal(kdepdf, kdepdf3)
  220. assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
  221. # Subclasses that should stay working (extracted from various sources).
  222. # Unfortunately the earlier design of gaussian_kde made it necessary for users
  223. # to create these kinds of subclasses, or call _compute_covariance() directly.
  224. class _kde_subclass1(stats.gaussian_kde):
  225. def __init__(self, dataset):
  226. self.dataset = np.atleast_2d(dataset)
  227. self.d, self.n = self.dataset.shape
  228. self.covariance_factor = self.scotts_factor
  229. self._compute_covariance()
  230. class _kde_subclass2(stats.gaussian_kde):
  231. def __init__(self, dataset):
  232. self.covariance_factor = self.scotts_factor
  233. super().__init__(dataset)
  234. class _kde_subclass4(stats.gaussian_kde):
  235. def covariance_factor(self):
  236. return 0.5 * self.silverman_factor()
  237. def test_gaussian_kde_subclassing():
  238. x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
  239. xs = np.linspace(-10, 10, num=50)
  240. # gaussian_kde itself
  241. kde = stats.gaussian_kde(x1)
  242. ys = kde(xs)
  243. # subclass 1
  244. kde1 = _kde_subclass1(x1)
  245. y1 = kde1(xs)
  246. assert_array_almost_equal_nulp(ys, y1, nulp=10)
  247. # subclass 2
  248. kde2 = _kde_subclass2(x1)
  249. y2 = kde2(xs)
  250. assert_array_almost_equal_nulp(ys, y2, nulp=10)
  251. # subclass 3 was removed because we have no obligation to maintain support
  252. # for user invocation of private methods
  253. # subclass 4
  254. kde4 = _kde_subclass4(x1)
  255. y4 = kde4(x1)
  256. y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]
  257. assert_array_almost_equal(y_expected, y4, decimal=6)
  258. # Not a subclass, but check for use of _compute_covariance()
  259. kde5 = kde
  260. kde5.covariance_factor = lambda: kde.factor
  261. kde5._compute_covariance()
  262. y5 = kde5(xs)
  263. assert_array_almost_equal_nulp(ys, y5, nulp=10)
  264. def test_gaussian_kde_covariance_caching():
  265. x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
  266. xs = np.linspace(-10, 10, num=5)
  267. # These expected values are from scipy 0.10, before some changes to
  268. # gaussian_kde. They were not compared with any external reference.
  269. y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]
  270. # Set the bandwidth, then reset it to the default.
  271. kde = stats.gaussian_kde(x1)
  272. kde.set_bandwidth(bw_method=0.5)
  273. kde.set_bandwidth(bw_method='scott')
  274. y2 = kde(xs)
  275. assert_array_almost_equal(y_expected, y2, decimal=7)
  276. def test_gaussian_kde_monkeypatch():
  277. """Ugly, but people may rely on this. See scipy pull request 123,
  278. specifically the linked ML thread "Width of the Gaussian in stats.kde".
  279. If it is necessary to break this later on, that is to be discussed on ML.
  280. """
  281. x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
  282. xs = np.linspace(-10, 10, num=50)
  283. # The old monkeypatched version to get at Silverman's Rule.
  284. kde = stats.gaussian_kde(x1)
  285. kde.covariance_factor = kde.silverman_factor
  286. kde._compute_covariance()
  287. y1 = kde(xs)
  288. # The new saner version.
  289. kde2 = stats.gaussian_kde(x1, bw_method='silverman')
  290. y2 = kde2(xs)
  291. assert_array_almost_equal_nulp(y1, y2, nulp=10)
  292. def test_kde_integer_input():
  293. """Regression test for #1181."""
  294. x1 = np.arange(5)
  295. kde = stats.gaussian_kde(x1)
  296. y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
  297. assert_array_almost_equal(kde(x1), y_expected, decimal=6)
  298. _ftypes = ['float32', 'float64', 'float96', 'float128', 'int32', 'int64']
  299. @pytest.mark.parametrize("bw_type", _ftypes + ["scott", "silverman"])
  300. @pytest.mark.parametrize("dtype", _ftypes)
  301. def test_kde_output_dtype(dtype, bw_type):
  302. # Check whether the datatypes are available
  303. dtype = getattr(np, dtype, None)
  304. if bw_type in ["scott", "silverman"]:
  305. bw = bw_type
  306. else:
  307. bw_type = getattr(np, bw_type, None)
  308. bw = bw_type(3) if bw_type else None
  309. if any(dt is None for dt in [dtype, bw]):
  310. pytest.skip()
  311. weights = np.arange(5, dtype=dtype)
  312. dataset = np.arange(5, dtype=dtype)
  313. k = stats.gaussian_kde(dataset, bw_method=bw, weights=weights)
  314. points = np.arange(5, dtype=dtype)
  315. result = k(points)
  316. # weights are always cast to float64
  317. assert result.dtype == np.result_type(dataset, points, np.float64(weights),
  318. k.factor)
  319. def test_pdf_logpdf_validation():
  320. rng = np.random.default_rng(64202298293133848336925499069837723291)
  321. xn = rng.standard_normal((2, 10))
  322. gkde = stats.gaussian_kde(xn)
  323. xs = rng.standard_normal((3, 10))
  324. msg = "points have dimension 3, dataset has dimension 2"
  325. with pytest.raises(ValueError, match=msg):
  326. gkde.logpdf(xs)
  327. def test_pdf_logpdf():
  328. rng = np.random.default_rng(1)
  329. n_basesample = 50
  330. xn = rng.normal(0, 1, n_basesample)
  331. # Default
  332. gkde = stats.gaussian_kde(xn)
  333. xs = np.linspace(-15, 12, 25)
  334. pdf = gkde.evaluate(xs)
  335. pdf2 = gkde.pdf(xs)
  336. assert_almost_equal(pdf, pdf2, decimal=12)
  337. logpdf = np.log(pdf)
  338. logpdf2 = gkde.logpdf(xs)
  339. assert_almost_equal(logpdf, logpdf2, decimal=12)
  340. # There are more points than data
  341. gkde = stats.gaussian_kde(xs)
  342. pdf = np.log(gkde.evaluate(xn))
  343. pdf2 = gkde.logpdf(xn)
  344. assert_almost_equal(pdf, pdf2, decimal=12)
  345. def test_pdf_logpdf_weighted():
  346. rng = np.random.default_rng(1)
  347. n_basesample = 50
  348. xn = rng.normal(0, 1, n_basesample)
  349. wn = rng.random(n_basesample)
  350. # Default
  351. gkde = stats.gaussian_kde(xn, weights=wn)
  352. xs = np.linspace(-15, 12, 25)
  353. pdf = gkde.evaluate(xs)
  354. pdf2 = gkde.pdf(xs)
  355. assert_almost_equal(pdf, pdf2, decimal=12)
  356. logpdf = np.log(pdf)
  357. logpdf2 = gkde.logpdf(xs)
  358. assert_almost_equal(logpdf, logpdf2, decimal=12)
  359. # There are more points than data
  360. rng = np.random.default_rng(4531935345)
  361. gkde = stats.gaussian_kde(xs, weights=rng.random(len(xs)))
  362. pdf = np.log(gkde.evaluate(xn))
  363. pdf2 = gkde.logpdf(xn)
  364. assert_almost_equal(pdf, pdf2, decimal=12)
  365. def test_marginal_1_axis():
  366. rng = np.random.default_rng(6111799263660870475)
  367. n_data = 50
  368. n_dim = 10
  369. dataset = rng.normal(size=(n_dim, n_data))
  370. points = rng.normal(size=(n_dim, 3))
  371. dimensions = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) # dimensions to keep
  372. kde = stats.gaussian_kde(dataset)
  373. marginal = kde.marginal(dimensions)
  374. pdf = marginal.pdf(points[dimensions])
  375. def marginal_pdf_single(point):
  376. def f(x):
  377. x = np.concatenate(([x], point[dimensions]))
  378. return kde.pdf(x)[0]
  379. return integrate.quad(f, -np.inf, np.inf)[0]
  380. def marginal_pdf(points):
  381. return np.apply_along_axis(marginal_pdf_single, axis=0, arr=points)
  382. ref = marginal_pdf(points)
  383. assert_allclose(pdf, ref, rtol=1e-6)
  384. @pytest.mark.xslow
  385. def test_marginal_2_axis():
  386. rng = np.random.default_rng(6111799263660870475)
  387. n_data = 30
  388. n_dim = 4
  389. dataset = rng.normal(size=(n_dim, n_data))
  390. points = rng.normal(size=(n_dim, 3))
  391. dimensions = np.array([1, 3]) # dimensions to keep
  392. kde = stats.gaussian_kde(dataset)
  393. marginal = kde.marginal(dimensions)
  394. pdf = marginal.pdf(points[dimensions])
  395. def marginal_pdf(points):
  396. def marginal_pdf_single(point):
  397. def f(y, x):
  398. w, z = point[dimensions]
  399. x = np.array([x, w, y, z])
  400. return kde.pdf(x)[0]
  401. return integrate.dblquad(f, -np.inf, np.inf, -np.inf, np.inf)[0]
  402. return np.apply_along_axis(marginal_pdf_single, axis=0, arr=points)
  403. ref = marginal_pdf(points)
  404. assert_allclose(pdf, ref, rtol=1e-6)
  405. def test_marginal_iv():
  406. # test input validation
  407. rng = np.random.default_rng(6111799263660870475)
  408. n_data = 30
  409. n_dim = 4
  410. dataset = rng.normal(size=(n_dim, n_data))
  411. points = rng.normal(size=(n_dim, 3))
  412. kde = stats.gaussian_kde(dataset)
  413. # check that positive and negative indices are equivalent
  414. dimensions1 = [-1, 1]
  415. marginal1 = kde.marginal(dimensions1)
  416. pdf1 = marginal1.pdf(points[dimensions1])
  417. dimensions2 = [3, -3]
  418. marginal2 = kde.marginal(dimensions2)
  419. pdf2 = marginal2.pdf(points[dimensions2])
  420. assert_equal(pdf1, pdf2)
  421. # IV for non-integer dimensions
  422. message = "Elements of `dimensions` must be integers..."
  423. with pytest.raises(ValueError, match=message):
  424. kde.marginal([1, 2.5])
  425. # IV for uniqueness
  426. message = "All elements of `dimensions` must be unique."
  427. with pytest.raises(ValueError, match=message):
  428. kde.marginal([1, 2, 2])
  429. # IV for non-integer dimensions
  430. message = (r"Dimensions \[-5 6\] are invalid for a distribution in 4...")
  431. with pytest.raises(ValueError, match=message):
  432. kde.marginal([1, -5, 6])
  433. @pytest.mark.xslow
  434. def test_logpdf_overflow():
  435. # regression test for gh-12988; testing against linalg instability for
  436. # very high dimensionality kde
  437. rng = np.random.default_rng(1)
  438. n_dimensions = 2500
  439. n_samples = 5000
  440. xn = np.array([rng.normal(0, 1, n_samples) + (n) for n in range(
  441. 0, n_dimensions)])
  442. # Default
  443. gkde = stats.gaussian_kde(xn)
  444. logpdf = gkde.logpdf(np.arange(0, n_dimensions))
  445. np.testing.assert_equal(np.isneginf(logpdf[0]), False)
  446. np.testing.assert_equal(np.isnan(logpdf[0]), False)
  447. def test_weights_intact():
  448. # regression test for gh-9709: weights are not modified
  449. rng = np.random.default_rng(12345)
  450. vals = rng.lognormal(size=100)
  451. weights = rng.choice([1.0, 10.0, 100], size=vals.size)
  452. orig_weights = weights.copy()
  453. stats.gaussian_kde(np.log10(vals), weights=weights)
  454. assert_allclose(weights, orig_weights, atol=1e-14, rtol=1e-14)
  455. def test_weights_integer():
  456. # integer weights are OK, cf gh-9709 (comment)
  457. values = [0.2, 13.5, 21.0, 75.0, 99.0]
  458. weights = [1, 2, 4, 8, 16] # a list of integers
  459. pdf_i = stats.gaussian_kde(values, weights=weights)
  460. pdf_f = stats.gaussian_kde(values, weights=np.float64(weights))
  461. xn = [0.3, 11, 88]
  462. assert_allclose(pdf_i.evaluate(xn),
  463. pdf_f.evaluate(xn), atol=1e-14, rtol=1e-14)
  464. def test_seed():
  465. # Test the seed option of the resample method
  466. def test_seed_sub(gkde_trail):
  467. n_sample = 200
  468. # The results should be different without using seed
  469. samp1 = gkde_trail.resample(n_sample)
  470. samp2 = gkde_trail.resample(n_sample)
  471. assert_raises(
  472. AssertionError, assert_allclose, samp1, samp2, atol=1e-13
  473. )
  474. # Use integer seed
  475. seed = 831
  476. samp1 = gkde_trail.resample(n_sample, seed=seed)
  477. samp2 = gkde_trail.resample(n_sample, seed=seed)
  478. assert_allclose(samp1, samp2, atol=1e-13)
  479. # Use RandomState
  480. rstate1 = np.random.RandomState(seed=138)
  481. samp1 = gkde_trail.resample(n_sample, seed=rstate1)
  482. rstate2 = np.random.RandomState(seed=138)
  483. samp2 = gkde_trail.resample(n_sample, seed=rstate2)
  484. assert_allclose(samp1, samp2, atol=1e-13)
  485. # check that np.random.Generator can be used (numpy >= 1.17)
  486. if hasattr(np.random, 'default_rng'):
  487. # obtain a np.random.Generator object
  488. rng = np.random.default_rng(1234)
  489. gkde_trail.resample(n_sample, seed=rng)
  490. rng = np.random.default_rng(8765678)
  491. n_basesample = 500
  492. wn = rng.random(n_basesample)
  493. # Test 1D case
  494. xn_1d = rng.normal(0, 1, n_basesample)
  495. gkde_1d = stats.gaussian_kde(xn_1d)
  496. test_seed_sub(gkde_1d)
  497. gkde_1d_weighted = stats.gaussian_kde(xn_1d, weights=wn)
  498. test_seed_sub(gkde_1d_weighted)
  499. # Test 2D case
  500. mean = np.array([1.0, 3.0])
  501. covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
  502. xn_2d = rng.multivariate_normal(mean, covariance, size=n_basesample).T
  503. gkde_2d = stats.gaussian_kde(xn_2d)
  504. test_seed_sub(gkde_2d)
  505. gkde_2d_weighted = stats.gaussian_kde(xn_2d, weights=wn)
  506. test_seed_sub(gkde_2d_weighted)
  507. def test_singular_data_covariance_gh10205():
  508. # When the data lie in a lower-dimensional subspace and this causes
  509. # and exception, check that the error message is informative.
  510. rng = np.random.default_rng(2321583144339784787)
  511. mu = np.array([1, 10, 20])
  512. sigma = np.array([[4, 10, 0], [10, 25, 0], [0, 0, 100]])
  513. data = rng.multivariate_normal(mu, sigma, 1000)
  514. try: # doesn't raise any error on some platforms, and that's OK
  515. stats.gaussian_kde(data.T)
  516. except linalg.LinAlgError:
  517. msg = "The data appears to lie in a lower-dimensional subspace..."
  518. with assert_raises(linalg.LinAlgError, match=msg):
  519. stats.gaussian_kde(data.T)
  520. def test_fewer_points_than_dimensions_gh17436():
  521. # When the number of points is fewer than the number of dimensions, the
  522. # the covariance matrix would be singular, and the exception tested in
  523. # test_singular_data_covariance_gh10205 would occur. However, sometimes
  524. # this occurs when the user passes in the transpose of what `gaussian_kde`
  525. # expects. This can result in a huge covariance matrix, so bail early.
  526. rng = np.random.default_rng(2046127537594925772)
  527. rvs = rng.multivariate_normal(np.zeros(3), np.eye(3), size=5)
  528. message = "Number of dimensions is greater than number of samples..."
  529. with pytest.raises(ValueError, match=message):
  530. stats.gaussian_kde(rvs)