test_binned_statistic.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. import numpy as np
  2. from numpy.testing import assert_allclose
  3. import pytest
  4. from pytest import raises as assert_raises
  5. from scipy.stats import (binned_statistic, binned_statistic_2d,
  6. binned_statistic_dd)
  7. from scipy._lib._util import check_random_state
  8. from .common_tests import check_named_results
  9. class TestBinnedStatistic:
  10. @classmethod
  11. def setup_class(cls):
  12. rng = check_random_state(9865)
  13. cls.x = rng.uniform(size=100)
  14. cls.y = rng.uniform(size=100)
  15. cls.v = rng.uniform(size=100)
  16. cls.X = rng.uniform(size=(100, 3))
  17. cls.w = rng.uniform(size=100)
  18. cls.u = rng.uniform(size=100) + 1e6
  19. def test_1d_count(self):
  20. x = self.x
  21. v = self.v
  22. count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
  23. count2, edges2 = np.histogram(x, bins=10)
  24. assert_allclose(count1, count2)
  25. assert_allclose(edges1, edges2)
  26. def test_gh5927(self):
  27. # smoke test for gh5927 - binned_statistic was using `is` for string
  28. # comparison
  29. x = self.x
  30. v = self.v
  31. statistics = ['mean', 'median', 'count', 'sum']
  32. for statistic in statistics:
  33. binned_statistic(x, v, statistic, bins=10)
  34. def test_big_number_std(self):
  35. # tests for numerical stability of std calculation
  36. # see issue gh-10126 for more
  37. x = self.x
  38. u = self.u
  39. stat1, edges1, bc = binned_statistic(x, u, 'std', bins=10)
  40. stat2, edges2, bc = binned_statistic(x, u, np.std, bins=10)
  41. assert_allclose(stat1, stat2)
  42. def test_empty_bins_std(self):
  43. # tests that std returns gives nan for empty bins
  44. x = self.x
  45. u = self.u
  46. print(binned_statistic(x, u, 'count', bins=1000))
  47. stat1, edges1, bc = binned_statistic(x, u, 'std', bins=1000)
  48. stat2, edges2, bc = binned_statistic(x, u, np.std, bins=1000)
  49. assert_allclose(stat1, stat2)
  50. def test_non_finite_inputs_and_int_bins(self):
  51. # if either `values` or `sample` contain np.inf or np.nan throw
  52. # see issue gh-9010 for more
  53. x = self.x
  54. u = self.u.copy() # take copy before modification
  55. u[0] = np.inf
  56. assert_raises(ValueError, binned_statistic, u, x, 'std', bins=10)
  57. # need to test for non-python specific ints, e.g. np.int8, np.int64
  58. assert_raises(ValueError, binned_statistic, u, x, 'std',
  59. bins=np.int64(10))
  60. u[0] = np.nan
  61. assert_raises(ValueError, binned_statistic, u, x, 'count', bins=10)
  62. def test_1d_result_attributes(self):
  63. x = self.x
  64. v = self.v
  65. res = binned_statistic(x, v, 'count', bins=10)
  66. attributes = ('statistic', 'bin_edges', 'binnumber')
  67. check_named_results(res, attributes)
  68. def test_1d_sum(self):
  69. x = self.x
  70. v = self.v
  71. sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
  72. sum2, edges2 = np.histogram(x, bins=10, weights=v)
  73. assert_allclose(sum1, sum2)
  74. assert_allclose(edges1, edges2)
  75. def test_1d_mean(self):
  76. x = self.x
  77. v = self.v
  78. stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
  79. stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
  80. assert_allclose(stat1, stat2)
  81. assert_allclose(edges1, edges2)
  82. def test_1d_std(self):
  83. x = self.x
  84. v = self.v
  85. stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
  86. stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
  87. assert_allclose(stat1, stat2)
  88. assert_allclose(edges1, edges2)
  89. def test_1d_min(self):
  90. x = self.x
  91. v = self.v
  92. stat1, edges1, bc = binned_statistic(x, v, 'min', bins=10)
  93. stat2, edges2, bc = binned_statistic(x, v, np.min, bins=10)
  94. assert_allclose(stat1, stat2)
  95. assert_allclose(edges1, edges2)
  96. def test_1d_max(self):
  97. x = self.x
  98. v = self.v
  99. stat1, edges1, bc = binned_statistic(x, v, 'max', bins=10)
  100. stat2, edges2, bc = binned_statistic(x, v, np.max, bins=10)
  101. assert_allclose(stat1, stat2)
  102. assert_allclose(edges1, edges2)
  103. def test_1d_median(self):
  104. x = self.x
  105. v = self.v
  106. stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
  107. stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
  108. assert_allclose(stat1, stat2)
  109. assert_allclose(edges1, edges2)
  110. def test_1d_bincode(self):
  111. x = self.x[:20]
  112. v = self.v[:20]
  113. count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
  114. bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
  115. 1, 2, 1])
  116. bcount = [(bc == i).sum() for i in np.unique(bc)]
  117. assert_allclose(bc, bc2)
  118. assert_allclose(bcount, count1)
  119. def test_1d_range_keyword(self):
  120. # Regression test for gh-3063, range can be (min, max) or [(min, max)]
  121. rng = np.random.default_rng(6823616729)
  122. x = np.arange(30)
  123. data = rng.random(30)
  124. mean, bins, _ = binned_statistic(x[:15], data[:15])
  125. mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
  126. mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
  127. assert_allclose(mean, mean_range)
  128. assert_allclose(bins, bins_range)
  129. assert_allclose(mean, mean_range2)
  130. assert_allclose(bins, bins_range2)
  131. def test_1d_multi_values(self):
  132. x = self.x
  133. v = self.v
  134. w = self.w
  135. stat1v, edges1v, bc1v = binned_statistic(x, v, 'mean', bins=10)
  136. stat1w, edges1w, bc1w = binned_statistic(x, w, 'mean', bins=10)
  137. stat2, edges2, bc2 = binned_statistic(x, [v, w], 'mean', bins=10)
  138. assert_allclose(stat2[0], stat1v)
  139. assert_allclose(stat2[1], stat1w)
  140. assert_allclose(edges1v, edges2)
  141. assert_allclose(bc1v, bc2)
  142. def test_2d_count(self):
  143. x = self.x
  144. y = self.y
  145. v = self.v
  146. count1, binx1, biny1, bc = binned_statistic_2d(
  147. x, y, v, 'count', bins=5)
  148. count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
  149. assert_allclose(count1, count2)
  150. assert_allclose(binx1, binx2)
  151. assert_allclose(biny1, biny2)
  152. def test_2d_result_attributes(self):
  153. x = self.x
  154. y = self.y
  155. v = self.v
  156. res = binned_statistic_2d(x, y, v, 'count', bins=5)
  157. attributes = ('statistic', 'x_edge', 'y_edge', 'binnumber')
  158. check_named_results(res, attributes)
  159. def test_2d_sum(self):
  160. x = self.x
  161. y = self.y
  162. v = self.v
  163. sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
  164. sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
  165. assert_allclose(sum1, sum2)
  166. assert_allclose(binx1, binx2)
  167. assert_allclose(biny1, biny2)
  168. def test_2d_mean(self):
  169. x = self.x
  170. y = self.y
  171. v = self.v
  172. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
  173. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
  174. assert_allclose(stat1, stat2)
  175. assert_allclose(binx1, binx2)
  176. assert_allclose(biny1, biny2)
  177. def test_2d_mean_unicode(self):
  178. x = self.x
  179. y = self.y
  180. v = self.v
  181. stat1, binx1, biny1, bc = binned_statistic_2d(
  182. x, y, v, 'mean', bins=5)
  183. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
  184. assert_allclose(stat1, stat2)
  185. assert_allclose(binx1, binx2)
  186. assert_allclose(biny1, biny2)
  187. def test_2d_std(self):
  188. x = self.x
  189. y = self.y
  190. v = self.v
  191. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
  192. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
  193. assert_allclose(stat1, stat2)
  194. assert_allclose(binx1, binx2)
  195. assert_allclose(biny1, biny2)
  196. def test_2d_min(self):
  197. x = self.x
  198. y = self.y
  199. v = self.v
  200. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'min', bins=5)
  201. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.min, bins=5)
  202. assert_allclose(stat1, stat2)
  203. assert_allclose(binx1, binx2)
  204. assert_allclose(biny1, biny2)
  205. def test_2d_max(self):
  206. x = self.x
  207. y = self.y
  208. v = self.v
  209. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'max', bins=5)
  210. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.max, bins=5)
  211. assert_allclose(stat1, stat2)
  212. assert_allclose(binx1, binx2)
  213. assert_allclose(biny1, biny2)
  214. def test_2d_median(self):
  215. x = self.x
  216. y = self.y
  217. v = self.v
  218. stat1, binx1, biny1, bc = binned_statistic_2d(
  219. x, y, v, 'median', bins=5)
  220. stat2, binx2, biny2, bc = binned_statistic_2d(
  221. x, y, v, np.median, bins=5)
  222. assert_allclose(stat1, stat2)
  223. assert_allclose(binx1, binx2)
  224. assert_allclose(biny1, biny2)
  225. def test_2d_bincode(self):
  226. x = self.x[:20]
  227. y = self.y[:20]
  228. v = self.v[:20]
  229. count1, binx1, biny1, bc = binned_statistic_2d(
  230. x, y, v, 'count', bins=3)
  231. bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
  232. 6, 11, 16, 6, 6, 11, 8])
  233. bcount = [(bc == i).sum() for i in np.unique(bc)]
  234. assert_allclose(bc, bc2)
  235. count1adj = count1[count1.nonzero()]
  236. assert_allclose(bcount, count1adj)
  237. def test_2d_multi_values(self):
  238. x = self.x
  239. y = self.y
  240. v = self.v
  241. w = self.w
  242. stat1v, binx1v, biny1v, bc1v = binned_statistic_2d(
  243. x, y, v, 'mean', bins=8)
  244. stat1w, binx1w, biny1w, bc1w = binned_statistic_2d(
  245. x, y, w, 'mean', bins=8)
  246. stat2, binx2, biny2, bc2 = binned_statistic_2d(
  247. x, y, [v, w], 'mean', bins=8)
  248. assert_allclose(stat2[0], stat1v)
  249. assert_allclose(stat2[1], stat1w)
  250. assert_allclose(binx1v, binx2)
  251. assert_allclose(biny1w, biny2)
  252. assert_allclose(bc1v, bc2)
  253. def test_2d_binnumbers_unraveled(self):
  254. x = self.x
  255. y = self.y
  256. v = self.v
  257. stat, edgesx, bcx = binned_statistic(x, v, 'mean', bins=20)
  258. stat, edgesy, bcy = binned_statistic(y, v, 'mean', bins=10)
  259. stat2, edgesx2, edgesy2, bc2 = binned_statistic_2d(
  260. x, y, v, 'mean', bins=(20, 10), expand_binnumbers=True)
  261. bcx3 = np.searchsorted(edgesx, x, side='right')
  262. bcy3 = np.searchsorted(edgesy, y, side='right')
  263. # `numpy.searchsorted` is non-inclusive on right-edge, compensate
  264. bcx3[x == x.max()] -= 1
  265. bcy3[y == y.max()] -= 1
  266. assert_allclose(bcx, bc2[0])
  267. assert_allclose(bcy, bc2[1])
  268. assert_allclose(bcx3, bc2[0])
  269. assert_allclose(bcy3, bc2[1])
  270. def test_dd_count(self):
  271. X = self.X
  272. v = self.v
  273. count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
  274. count2, edges2 = np.histogramdd(X, bins=3)
  275. assert_allclose(count1, count2)
  276. assert_allclose(edges1, edges2)
  277. def test_dd_result_attributes(self):
  278. X = self.X
  279. v = self.v
  280. res = binned_statistic_dd(X, v, 'count', bins=3)
  281. attributes = ('statistic', 'bin_edges', 'binnumber')
  282. check_named_results(res, attributes)
  283. def test_dd_sum(self):
  284. X = self.X
  285. v = self.v
  286. sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
  287. sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
  288. sum3, edges3, bc = binned_statistic_dd(X, v, np.sum, bins=3)
  289. assert_allclose(sum1, sum2)
  290. assert_allclose(edges1, edges2)
  291. assert_allclose(sum1, sum3)
  292. assert_allclose(edges1, edges3)
  293. def test_dd_mean(self):
  294. X = self.X
  295. v = self.v
  296. stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
  297. stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
  298. assert_allclose(stat1, stat2)
  299. assert_allclose(edges1, edges2)
  300. def test_dd_std(self):
  301. X = self.X
  302. v = self.v
  303. stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
  304. stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
  305. assert_allclose(stat1, stat2)
  306. assert_allclose(edges1, edges2)
  307. def test_dd_min(self):
  308. X = self.X
  309. v = self.v
  310. stat1, edges1, bc = binned_statistic_dd(X, v, 'min', bins=3)
  311. stat2, edges2, bc = binned_statistic_dd(X, v, np.min, bins=3)
  312. assert_allclose(stat1, stat2)
  313. assert_allclose(edges1, edges2)
  314. def test_dd_max(self):
  315. X = self.X
  316. v = self.v
  317. stat1, edges1, bc = binned_statistic_dd(X, v, 'max', bins=3)
  318. stat2, edges2, bc = binned_statistic_dd(X, v, np.max, bins=3)
  319. assert_allclose(stat1, stat2)
  320. assert_allclose(edges1, edges2)
  321. def test_dd_median(self):
  322. X = self.X
  323. v = self.v
  324. stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
  325. stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
  326. assert_allclose(stat1, stat2)
  327. assert_allclose(edges1, edges2)
  328. def test_dd_bincode(self):
  329. X = self.X[:20]
  330. v = self.v[:20]
  331. count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
  332. bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
  333. 32, 36, 91, 43, 87, 81, 81])
  334. bcount = [(bc == i).sum() for i in np.unique(bc)]
  335. assert_allclose(bc, bc2)
  336. count1adj = count1[count1.nonzero()]
  337. assert_allclose(bcount, count1adj)
  338. def test_dd_multi_values(self):
  339. X = self.X
  340. v = self.v
  341. w = self.w
  342. for stat in ["count", "sum", "mean", "std", "min", "max", "median",
  343. np.std]:
  344. stat1v, edges1v, bc1v = binned_statistic_dd(X, v, stat, bins=8)
  345. stat1w, edges1w, bc1w = binned_statistic_dd(X, w, stat, bins=8)
  346. stat2, edges2, bc2 = binned_statistic_dd(X, [v, w], stat, bins=8)
  347. assert_allclose(stat2[0], stat1v)
  348. assert_allclose(stat2[1], stat1w)
  349. assert_allclose(edges1v, edges2)
  350. assert_allclose(edges1w, edges2)
  351. assert_allclose(bc1v, bc2)
  352. def test_dd_binnumbers_unraveled(self):
  353. X = self.X
  354. v = self.v
  355. stat, edgesx, bcx = binned_statistic(X[:, 0], v, 'mean', bins=15)
  356. stat, edgesy, bcy = binned_statistic(X[:, 1], v, 'mean', bins=20)
  357. stat, edgesz, bcz = binned_statistic(X[:, 2], v, 'mean', bins=10)
  358. stat2, edges2, bc2 = binned_statistic_dd(
  359. X, v, 'mean', bins=(15, 20, 10), expand_binnumbers=True)
  360. assert_allclose(bcx, bc2[0])
  361. assert_allclose(bcy, bc2[1])
  362. assert_allclose(bcz, bc2[2])
  363. def test_dd_binned_statistic_result(self):
  364. # NOTE: tests the reuse of bin_edges from previous call
  365. rng = np.random.default_rng(8111360615)
  366. x = rng.random((10000, 3))
  367. v = rng.random(10000)
  368. bins = np.linspace(0, 1, 10)
  369. bins = (bins, bins, bins)
  370. result = binned_statistic_dd(x, v, 'mean', bins=bins)
  371. stat = result.statistic
  372. result = binned_statistic_dd(x, v, 'mean',
  373. binned_statistic_result=result)
  374. stat2 = result.statistic
  375. assert_allclose(stat, stat2)
  376. def test_dd_zero_dedges(self):
  377. rng = np.random.default_rng(1132724173)
  378. x = rng.random((10000, 3))
  379. v = rng.random(10000)
  380. bins = np.linspace(0, 1, 10)
  381. bins = np.append(bins, 1)
  382. bins = (bins, bins, bins)
  383. with assert_raises(ValueError, match='difference is numerically 0'):
  384. binned_statistic_dd(x, v, 'mean', bins=bins)
  385. def test_dd_range_errors(self):
  386. # Test that descriptive exceptions are raised as appropriate for bad
  387. # values of the `range` argument. (See gh-12996)
  388. with assert_raises(ValueError,
  389. match='In range, start must be <= stop'):
  390. binned_statistic_dd([self.y], self.v,
  391. range=[[1, 0]])
  392. with assert_raises(
  393. ValueError,
  394. match='In dimension 1 of range, start must be <= stop'):
  395. binned_statistic_dd([self.x, self.y], self.v,
  396. range=[[1, 0], [0, 1]])
  397. with assert_raises(
  398. ValueError,
  399. match='In dimension 2 of range, start must be <= stop'):
  400. binned_statistic_dd([self.x, self.y], self.v,
  401. range=[[0, 1], [1, 0]])
  402. with assert_raises(
  403. ValueError,
  404. match='range given for 1 dimensions; 2 required'):
  405. binned_statistic_dd([self.x, self.y], self.v,
  406. range=[[0, 1]])
  407. def test_binned_statistic_float32(self):
  408. X = np.array([0, 0.42358226], dtype=np.float32)
  409. stat, _, _ = binned_statistic(X, None, 'count', bins=5)
  410. assert_allclose(stat, np.array([1, 0, 0, 0, 1], dtype=np.float64))
  411. def test_gh14332(self):
  412. # Test the wrong output when the `sample` is close to bin edge
  413. x = []
  414. size = 20
  415. for i in range(size):
  416. x += [1-0.1**i]
  417. bins = np.linspace(0,1,11)
  418. sum1, edges1, bc = binned_statistic_dd(x, np.ones(len(x)),
  419. bins=[bins], statistic='sum')
  420. sum2, edges2 = np.histogram(x, bins=bins)
  421. assert_allclose(sum1, sum2)
  422. assert_allclose(edges1[0], edges2)
  423. @pytest.mark.parametrize("dtype", [np.float64, np.complex128])
  424. @pytest.mark.parametrize("statistic", [np.mean, np.median, np.sum, np.std,
  425. np.min, np.max, 'count',
  426. lambda x: (x**2).sum(),
  427. lambda x: (x**2).sum() * 1j])
  428. def test_dd_all(self, dtype, statistic):
  429. def ref_statistic(x):
  430. return len(x) if statistic == 'count' else statistic(x)
  431. rng = np.random.default_rng(3704743126639371)
  432. n = 10
  433. x = rng.random(size=n)
  434. i = x >= 0.5
  435. v = rng.random(size=n)
  436. if dtype is np.complex128:
  437. v = v + rng.random(size=n)*1j
  438. stat, _, _ = binned_statistic_dd(x, v, statistic, bins=2)
  439. ref = np.array([ref_statistic(v[~i]), ref_statistic(v[i])])
  440. assert_allclose(stat, ref)
  441. assert stat.dtype == np.result_type(ref.dtype, np.float64)