test_contingency.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. import numpy as np
  2. from numpy.testing import (assert_equal, assert_array_equal,
  3. assert_array_almost_equal, assert_approx_equal,
  4. assert_allclose)
  5. import pytest
  6. from pytest import raises as assert_raises
  7. from scipy import stats
  8. from scipy.special import xlogy
  9. from scipy.stats.contingency import (margins, expected_freq,
  10. chi2_contingency, association)
  11. def test_margins():
  12. a = np.array([1])
  13. m = margins(a)
  14. assert_equal(len(m), 1)
  15. m0 = m[0]
  16. assert_array_equal(m0, np.array([1]))
  17. a = np.array([[1]])
  18. m0, m1 = margins(a)
  19. expected0 = np.array([[1]])
  20. expected1 = np.array([[1]])
  21. assert_array_equal(m0, expected0)
  22. assert_array_equal(m1, expected1)
  23. a = np.arange(12).reshape(2, 6)
  24. m0, m1 = margins(a)
  25. expected0 = np.array([[15], [51]])
  26. expected1 = np.array([[6, 8, 10, 12, 14, 16]])
  27. assert_array_equal(m0, expected0)
  28. assert_array_equal(m1, expected1)
  29. a = np.arange(24).reshape(2, 3, 4)
  30. m0, m1, m2 = margins(a)
  31. expected0 = np.array([[[66]], [[210]]])
  32. expected1 = np.array([[[60], [92], [124]]])
  33. expected2 = np.array([[[60, 66, 72, 78]]])
  34. assert_array_equal(m0, expected0)
  35. assert_array_equal(m1, expected1)
  36. assert_array_equal(m2, expected2)
  37. def test_expected_freq():
  38. assert_array_equal(expected_freq([1]), np.array([1.0]))
  39. observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
  40. e = expected_freq(observed)
  41. assert_array_equal(e, np.ones_like(observed))
  42. observed = np.array([[10, 10, 20], [20, 20, 20]])
  43. e = expected_freq(observed)
  44. correct = np.array([[12., 12., 16.], [18., 18., 24.]])
  45. assert_array_almost_equal(e, correct)
  46. class TestChi2Contingency:
  47. def test_chi2_contingency_trivial(self):
  48. # Some very simple tests for chi2_contingency.
  49. # A trivial case
  50. obs = np.array([[1, 2], [1, 2]])
  51. chi2, p, dof, expected = chi2_contingency(obs, correction=False)
  52. assert_equal(chi2, 0.0)
  53. assert_equal(p, 1.0)
  54. assert_equal(dof, 1)
  55. assert_array_equal(obs, expected)
  56. # A *really* trivial case: 1-D data.
  57. obs = np.array([1, 2, 3])
  58. chi2, p, dof, expected = chi2_contingency(obs, correction=False)
  59. assert_equal(chi2, 0.0)
  60. assert_equal(p, 1.0)
  61. assert_equal(dof, 0)
  62. assert_array_equal(obs, expected)
  63. def test_chi2_contingency_R(self):
  64. # Some test cases that were computed independently, using R.
  65. # Rcode = \
  66. # """
  67. # # Data vector.
  68. # data <- c(
  69. # 12, 34, 23, 4, 47, 11,
  70. # 35, 31, 11, 34, 10, 18,
  71. # 12, 32, 9, 18, 13, 19,
  72. # 12, 12, 14, 9, 33, 25
  73. # )
  74. #
  75. # # Create factor tags:r=rows, c=columns, t=tiers
  76. # r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
  77. # c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
  78. # t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
  79. #
  80. # # 3-way Chi squared test of independence
  81. # s = summary(xtabs(data~r+c+t))
  82. # print(s)
  83. # """
  84. # Routput = \
  85. # """
  86. # Call: xtabs(formula = data ~ r + c + t)
  87. # Number of cases in table: 478
  88. # Number of factors: 3
  89. # Test for independence of all factors:
  90. # Chisq = 102.17, df = 17, p-value = 3.514e-14
  91. # """
  92. obs = np.array(
  93. [[[12, 34, 23],
  94. [35, 31, 11],
  95. [12, 32, 9],
  96. [12, 12, 14]],
  97. [[4, 47, 11],
  98. [34, 10, 18],
  99. [18, 13, 19],
  100. [9, 33, 25]]])
  101. chi2, p, dof, expected = chi2_contingency(obs)
  102. assert_approx_equal(chi2, 102.17, significant=5)
  103. assert_approx_equal(p, 3.514e-14, significant=4)
  104. assert_equal(dof, 17)
  105. # Rcode = \
  106. # """
  107. # # Data vector.
  108. # data <- c(
  109. # #
  110. # 12, 17,
  111. # 11, 16,
  112. # #
  113. # 11, 12,
  114. # 15, 16,
  115. # #
  116. # 23, 15,
  117. # 30, 22,
  118. # #
  119. # 14, 17,
  120. # 15, 16
  121. # )
  122. #
  123. # # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
  124. # r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
  125. # c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
  126. # d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
  127. # t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
  128. #
  129. # # 4-way Chi squared test of independence
  130. # s = summary(xtabs(data~r+c+d+t))
  131. # print(s)
  132. # """
  133. # Routput = \
  134. # """
  135. # Call: xtabs(formula = data ~ r + c + d + t)
  136. # Number of cases in table: 262
  137. # Number of factors: 4
  138. # Test for independence of all factors:
  139. # Chisq = 8.758, df = 11, p-value = 0.6442
  140. # """
  141. obs = np.array(
  142. [[[[12, 17],
  143. [11, 16]],
  144. [[11, 12],
  145. [15, 16]]],
  146. [[[23, 15],
  147. [30, 22]],
  148. [[14, 17],
  149. [15, 16]]]])
  150. chi2, p, dof, expected = chi2_contingency(obs)
  151. assert_approx_equal(chi2, 8.758, significant=4)
  152. assert_approx_equal(p, 0.6442, significant=4)
  153. assert_equal(dof, 11)
  154. def test_chi2_contingency_g(self):
  155. c = np.array([[15, 60], [15, 90]])
  156. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
  157. correction=False)
  158. assert_allclose(g, 2*xlogy(c, c/e).sum())
  159. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
  160. correction=True)
  161. c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
  162. assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
  163. c = np.array([[10, 12, 10], [12, 10, 10]])
  164. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
  165. assert_allclose(g, 2*xlogy(c, c/e).sum())
  166. def test_chi2_contingency_bad_args(self):
  167. # Test that "bad" inputs raise a ValueError.
  168. # Negative value in the array of observed frequencies.
  169. obs = np.array([[-1, 10], [1, 2]])
  170. assert_raises(ValueError, chi2_contingency, obs)
  171. # The zeros in this will result in zeros in the array
  172. # of expected frequencies.
  173. obs = np.array([[0, 1], [0, 1]])
  174. assert_raises(ValueError, chi2_contingency, obs)
  175. # A degenerate case: `observed` has size 0.
  176. obs = np.empty((0, 8))
  177. assert_raises(ValueError, chi2_contingency, obs)
  178. def test_chi2_contingency_yates_gh13875(self):
  179. # Magnitude of Yates' continuity correction should not exceed difference
  180. # between expected and observed value of the statistic; see gh-13875
  181. observed = np.array([[1573, 3], [4, 0]])
  182. p = chi2_contingency(observed)[1]
  183. assert_allclose(p, 1, rtol=1e-12)
  184. @pytest.mark.parametrize("correction", [False, True])
  185. def test_result(self, correction):
  186. obs = np.array([[1, 2], [1, 2]])
  187. res = chi2_contingency(obs, correction=correction)
  188. assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)
  189. @pytest.mark.slow
  190. def test_exact_permutation(self):
  191. table = np.arange(4).reshape(2, 2)
  192. ref_statistic = chi2_contingency(table, correction=False).statistic
  193. ref_pvalue = stats.fisher_exact(table).pvalue
  194. method = stats.PermutationMethod(n_resamples=50000)
  195. res = chi2_contingency(table, correction=False, method=method)
  196. assert_equal(res.statistic, ref_statistic)
  197. assert_allclose(res.pvalue, ref_pvalue, rtol=1e-15)
  198. @pytest.mark.slow
  199. @pytest.mark.parametrize('method', (stats.PermutationMethod,
  200. stats.MonteCarloMethod))
  201. def test_resampling_randomized(self, method):
  202. rng = np.random.default_rng(2592340925)
  203. # need to have big sum for asymptotic approximation to be good
  204. rows = [300, 1000, 800]
  205. cols = [200, 400, 800, 700]
  206. table = stats.random_table(rows, cols, seed=rng).rvs()
  207. res = chi2_contingency(table, correction=False, method=method(rng=rng))
  208. ref = chi2_contingency(table, correction=False)
  209. assert_equal(res.statistic, ref.statistic)
  210. assert_allclose(res.pvalue, ref.pvalue, atol=5e-3)
  211. assert_equal(res.dof, np.nan)
  212. assert_equal(res.expected_freq, ref.expected_freq)
  213. def test_resampling_invalid_args(self):
  214. table = np.arange(8).reshape(2, 2, 2)
  215. method = stats.PermutationMethod()
  216. message = "Use of `method` is only compatible with two-way tables."
  217. with pytest.raises(ValueError, match=message):
  218. chi2_contingency(table, correction=False, method=method)
  219. table = np.arange(4).reshape(2, 2)
  220. method = stats.PermutationMethod()
  221. message = "`correction=True` is not compatible with..."
  222. with pytest.raises(ValueError, match=message):
  223. chi2_contingency(table, method=method)
  224. method = stats.MonteCarloMethod()
  225. message = "`lambda_=2` is not compatible with..."
  226. with pytest.raises(ValueError, match=message):
  227. chi2_contingency(table, correction=False, lambda_=2, method=method)
  228. method = 'herring'
  229. message = "`method='herring'` not recognized; if provided, `method`..."
  230. with pytest.raises(ValueError, match=message):
  231. chi2_contingency(table, correction=False, method=method)
  232. method = stats.MonteCarloMethod(rvs=stats.norm.rvs)
  233. message = "If the `method` argument of `chi2_contingency` is..."
  234. with pytest.raises(ValueError, match=message):
  235. chi2_contingency(table, correction=False, method=method)
  236. def test_bad_association_args():
  237. # Invalid Test Statistic
  238. assert_raises(ValueError, association, [[1, 2], [3, 4]], "X")
  239. # Invalid array shape
  240. assert_raises(ValueError, association, [[[1, 2]], [[3, 4]]], "cramer")
  241. # chi2_contingency exception
  242. assert_raises(ValueError, association, [[-1, 10], [1, 2]], 'cramer')
  243. # Invalid Array Item Data Type
  244. assert_raises(ValueError, association,
  245. np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer')
  246. @pytest.mark.parametrize('stat, expected',
  247. [('cramer', 0.09222412010290792),
  248. ('tschuprow', 0.0775509319944633),
  249. ('pearson', 0.12932925727138758)])
  250. def test_assoc(stat, expected):
  251. # 2d Array
  252. obs1 = np.array([[12, 13, 14, 15, 16],
  253. [17, 16, 18, 19, 11],
  254. [9, 15, 14, 12, 11]])
  255. a = association(observed=obs1, method=stat)
  256. assert_allclose(a, expected)