test_multicomp.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. import copy
  2. import numpy as np
  3. import pytest
  4. from numpy.testing import assert_allclose
  5. from scipy import stats
  6. from scipy.stats._multicomp import _pvalue_dunnett, DunnettResult
  7. class TestDunnett:
  8. # For the following tests, p-values were computed using Matlab, e.g.
  9. # sample = [18. 15. 18. 16. 17. 15. 14. 14. 14. 15. 15....
  10. # 14. 15. 14. 22. 18. 21. 21. 10. 10. 11. 9....
  11. # 25. 26. 17.5 16. 15.5 14.5 22. 22. 24. 22.5 29....
  12. # 24.5 20. 18. 18.5 17.5 26.5 13. 16.5 13. 13. 13....
  13. # 28. 27. 34. 31. 29. 27. 24. 23. 38. 36. 25....
  14. # 38. 26. 22. 36. 27. 27. 32. 28. 31....
  15. # 24. 27. 33. 32. 28. 19. 37. 31. 36. 36....
  16. # 34. 38. 32. 38. 32....
  17. # 26. 24. 26. 25. 29. 29.5 16.5 36. 44....
  18. # 25. 27. 19....
  19. # 25. 20....
  20. # 28.];
  21. # j = [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
  22. # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
  23. # 0 0 0 0...
  24. # 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...
  25. # 2 2 2 2 2 2 2 2 2...
  26. # 3 3 3...
  27. # 4 4...
  28. # 5];
  29. # [~, ~, stats] = anova1(sample, j, "off");
  30. # [results, ~, ~, gnames] = multcompare(stats, ...
  31. # "CriticalValueType", "dunnett", ...
  32. # "Approximate", false);
  33. # tbl = array2table(results, "VariableNames", ...
  34. # ["Group", "Control Group", "Lower Limit", ...
  35. # "Difference", "Upper Limit", "P-value"]);
  36. # tbl.("Group") = gnames(tbl.("Group"));
  37. # tbl.("Control Group") = gnames(tbl.("Control Group"))
  38. # Matlab doesn't report the statistic, so the statistics were
  39. # computed using R multcomp `glht`, e.g.:
  40. # library(multcomp)
  41. # options(digits=16)
  42. # control < - c(18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0,
  43. # 15.0, 15.0, 14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0,
  44. # 10.0, 10.0, 11.0, 9.0, 25.0, 26.0, 17.5, 16.0, 15.5,
  45. # 14.5, 22.0, 22.0, 24.0, 22.5, 29.0, 24.5, 20.0, 18.0,
  46. # 18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0, 28.0,
  47. # 27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0,
  48. # 25.0, 38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0,
  49. # 31.0)
  50. # t < - c(24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0,
  51. # 34.0, 38.0, 32.0, 38.0, 32.0)
  52. # w < - c(26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0)
  53. # x < - c(25.0, 27.0, 19.0)
  54. # y < - c(25.0, 20.0)
  55. # z < - c(28.0)
  56. #
  57. # groups = factor(rep(c("control", "t", "w", "x", "y", "z"),
  58. # times=c(length(control), length(t), length(w),
  59. # length(x), length(y), length(z))))
  60. # df < - data.frame(response=c(control, t, w, x, y, z),
  61. # group=groups)
  62. # model < - aov(response
  63. # ~group, data = df)
  64. # test < - glht(model=model,
  65. # linfct=mcp(group="Dunnett"),
  66. # alternative="g")
  67. # summary(test)
  68. # confint(test)
  69. # p-values agreed with those produced by Matlab to at least atol=1e-3
  70. # From Matlab's documentation on multcompare
  71. samples_1 = [
  72. [
  73. 24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0,
  74. 34.0, 38.0, 32.0, 38.0, 32.0
  75. ],
  76. [26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0],
  77. [25.0, 27.0, 19.0],
  78. [25.0, 20.0],
  79. [28.0]
  80. ]
  81. control_1 = [
  82. 18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0, 15.0, 15.0,
  83. 14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0, 10.0, 10.0, 11.0, 9.0,
  84. 25.0, 26.0, 17.5, 16.0, 15.5, 14.5, 22.0, 22.0, 24.0, 22.5, 29.0,
  85. 24.5, 20.0, 18.0, 18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0,
  86. 28.0, 27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0, 25.0,
  87. 38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0, 31.0
  88. ]
  89. pvalue_1 = [4.727e-06, 0.022346, 0.97912, 0.99953, 0.86579] # Matlab
  90. # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
  91. p_1_twosided = [1e-4, 0.02237, 0.97913, 0.99953, 0.86583]
  92. p_1_greater = [1e-4, 0.011217, 0.768500, 0.896991, 0.577211]
  93. p_1_less = [1, 1, 0.99660, 0.98398, .99953]
  94. statistic_1 = [5.27356, 2.91270, 0.60831, 0.27002, 0.96637]
  95. ci_1_twosided = [[5.3633917835622, 0.7296142201217, -8.3879817106607,
  96. -11.9090753452911, -11.7655021543469],
  97. [15.9709832164378, 13.8936496687672, 13.4556900439941,
  98. 14.6434503452911, 25.4998771543469]]
  99. ci_1_greater = [5.9036402398526, 1.4000632918725, -7.2754756323636,
  100. -10.5567456382391, -9.8675629499576]
  101. ci_1_less = [15.4306165948619, 13.2230539537359, 12.3429406339544,
  102. 13.2908248513211, 23.6015228251660]
  103. pvalues_1 = dict(twosided=p_1_twosided, less=p_1_less, greater=p_1_greater)
  104. cis_1 = dict(twosided=ci_1_twosided, less=ci_1_less, greater=ci_1_greater)
  105. case_1 = dict(samples=samples_1, control=control_1, statistic=statistic_1,
  106. pvalues=pvalues_1, cis=cis_1)
  107. # From Dunnett1955 comparing with R's DescTools: DunnettTest
  108. samples_2 = [[9.76, 8.80, 7.68, 9.36], [12.80, 9.68, 12.16, 9.20, 10.55]]
  109. control_2 = [7.40, 8.50, 7.20, 8.24, 9.84, 8.32]
  110. pvalue_2 = [0.6201, 0.0058]
  111. # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
  112. p_2_twosided = [0.6201020, 0.0058254]
  113. p_2_greater = [0.3249776, 0.0029139]
  114. p_2_less = [0.91676, 0.99984]
  115. statistic_2 = [0.85703, 3.69375]
  116. ci_2_twosided = [[-1.2564116462124, 0.8396273539789],
  117. [2.5564116462124, 4.4163726460211]]
  118. ci_2_greater = [-0.9588591188156, 1.1187563667543]
  119. ci_2_less = [2.2588591188156, 4.1372436332457]
  120. pvalues_2 = dict(twosided=p_2_twosided, less=p_2_less, greater=p_2_greater)
  121. cis_2 = dict(twosided=ci_2_twosided, less=ci_2_less, greater=ci_2_greater)
  122. case_2 = dict(samples=samples_2, control=control_2, statistic=statistic_2,
  123. pvalues=pvalues_2, cis=cis_2)
  124. samples_3 = [[55, 64, 64], [55, 49, 52], [50, 44, 41]]
  125. control_3 = [55, 47, 48]
  126. pvalue_3 = [0.0364, 0.8966, 0.4091]
  127. # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
  128. p_3_twosided = [0.036407, 0.896539, 0.409295]
  129. p_3_greater = [0.018277, 0.521109, 0.981892]
  130. p_3_less = [0.99944, 0.90054, 0.20974]
  131. statistic_3 = [3.09073, 0.56195, -1.40488]
  132. ci_3_twosided = [[0.7529028025053, -8.2470971974947, -15.2470971974947],
  133. [21.2470971974947, 12.2470971974947, 5.2470971974947]]
  134. ci_3_greater = [2.4023682323149, -6.5976317676851, -13.5976317676851]
  135. ci_3_less = [19.5984402363662, 10.5984402363662, 3.5984402363662]
  136. pvalues_3 = dict(twosided=p_3_twosided, less=p_3_less, greater=p_3_greater)
  137. cis_3 = dict(twosided=ci_3_twosided, less=ci_3_less, greater=ci_3_greater)
  138. case_3 = dict(samples=samples_3, control=control_3, statistic=statistic_3,
  139. pvalues=pvalues_3, cis=cis_3)
  140. # From Thomson and Short,
  141. # Mucociliary function in health, chronic obstructive airway disease,
  142. # and asbestosis, Journal of Applied Physiology, 1969. Table 1
  143. # Comparing with R's DescTools: DunnettTest
  144. samples_4 = [[3.8, 2.7, 4.0, 2.4], [2.8, 3.4, 3.7, 2.2, 2.0]]
  145. control_4 = [2.9, 3.0, 2.5, 2.6, 3.2]
  146. pvalue_4 = [0.5832, 0.9982]
  147. # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
  148. p_4_twosided = [0.58317, 0.99819]
  149. p_4_greater = [0.30225, 0.69115]
  150. p_4_less = [0.91929, 0.65212]
  151. statistic_4 = [0.90875, -0.05007]
  152. ci_4_twosided = [[-0.6898153448579, -1.0333456251632],
  153. [1.4598153448579, 0.9933456251632]]
  154. ci_4_greater = [-0.5186459268412, -0.8719655502147 ]
  155. ci_4_less = [1.2886459268412, 0.8319655502147]
  156. pvalues_4 = dict(twosided=p_4_twosided, less=p_4_less, greater=p_4_greater)
  157. cis_4 = dict(twosided=ci_4_twosided, less=ci_4_less, greater=ci_4_greater)
  158. case_4 = dict(samples=samples_4, control=control_4, statistic=statistic_4,
  159. pvalues=pvalues_4, cis=cis_4)
  160. @pytest.mark.parametrize(
  161. 'rho, n_groups, df, statistic, pvalue, alternative',
  162. [
  163. # From Dunnett1955
  164. # Tables 1a and 1b pages 1117-1118
  165. (0.5, 1, 10, 1.81, 0.05, "greater"), # different than two-sided
  166. (0.5, 3, 10, 2.34, 0.05, "greater"),
  167. (0.5, 2, 30, 1.99, 0.05, "greater"),
  168. (0.5, 5, 30, 2.33, 0.05, "greater"),
  169. (0.5, 4, 12, 3.32, 0.01, "greater"),
  170. (0.5, 7, 12, 3.56, 0.01, "greater"),
  171. (0.5, 2, 60, 2.64, 0.01, "greater"),
  172. (0.5, 4, 60, 2.87, 0.01, "greater"),
  173. (0.5, 4, 60, [2.87, 2.21], [0.01, 0.05], "greater"),
  174. # Tables 2a and 2b pages 1119-1120
  175. (0.5, 1, 10, 2.23, 0.05, "two-sided"), # two-sided
  176. (0.5, 3, 10, 2.81, 0.05, "two-sided"),
  177. (0.5, 2, 30, 2.32, 0.05, "two-sided"),
  178. (0.5, 3, 20, 2.57, 0.05, "two-sided"),
  179. (0.5, 4, 12, 3.76, 0.01, "two-sided"),
  180. (0.5, 7, 12, 4.08, 0.01, "two-sided"),
  181. (0.5, 2, 60, 2.90, 0.01, "two-sided"),
  182. (0.5, 4, 60, 3.14, 0.01, "two-sided"),
  183. (0.5, 4, 60, [3.14, 2.55], [0.01, 0.05], "two-sided"),
  184. ],
  185. )
  186. def test_critical_values(
  187. self, rho, n_groups, df, statistic, pvalue, alternative
  188. ):
  189. rng = np.random.default_rng(165250594791731684851746311027739134893)
  190. rho = np.full((n_groups, n_groups), rho)
  191. np.fill_diagonal(rho, 1)
  192. statistic = np.array(statistic)
  193. res = _pvalue_dunnett(
  194. rho=rho, df=df, statistic=statistic,
  195. alternative=alternative,
  196. rng=rng
  197. )
  198. assert_allclose(res, pvalue, atol=5e-3)
  199. @pytest.mark.parametrize(
  200. 'samples, control, pvalue, statistic',
  201. [
  202. (samples_1, control_1, pvalue_1, statistic_1),
  203. (samples_2, control_2, pvalue_2, statistic_2),
  204. (samples_3, control_3, pvalue_3, statistic_3),
  205. (samples_4, control_4, pvalue_4, statistic_4),
  206. ]
  207. )
  208. def test_basic(self, samples, control, pvalue, statistic):
  209. rng = np.random.default_rng(11681140010308601919115036826969764808)
  210. res = stats.dunnett(*samples, control=control, rng=rng)
  211. assert isinstance(res, DunnettResult)
  212. assert_allclose(res.statistic, statistic, rtol=5e-5)
  213. assert_allclose(res.pvalue, pvalue, rtol=1e-2, atol=1e-4)
  214. @pytest.mark.parametrize(
  215. 'alternative',
  216. ['two-sided', 'less', 'greater']
  217. )
  218. def test_ttest_ind(self, alternative):
  219. # check that `dunnett` agrees with `ttest_ind`
  220. # when there are only two groups
  221. rng = np.random.default_rng(114184017807316971636137493526995620351)
  222. for _ in range(10):
  223. sample = rng.integers(-100, 100, size=(10,))
  224. control = rng.integers(-100, 100, size=(10,))
  225. # preserve use of old random_state during SPEC 7 transition
  226. res = stats.dunnett(
  227. sample, control=control,
  228. alternative=alternative, random_state=rng
  229. )
  230. ref = stats.ttest_ind(
  231. sample, control,
  232. alternative=alternative
  233. )
  234. assert_allclose(res.statistic, ref.statistic, rtol=1e-3, atol=1e-5)
  235. assert_allclose(res.pvalue, ref.pvalue, rtol=1e-3, atol=1e-5)
  236. @pytest.mark.parametrize(
  237. 'alternative, pvalue',
  238. [
  239. ('less', [0, 1]),
  240. ('greater', [1, 0]),
  241. ('two-sided', [0, 0]),
  242. ]
  243. )
  244. def test_alternatives(self, alternative, pvalue):
  245. rng = np.random.default_rng(114184017807316971636137493526995620351)
  246. # width of 20 and min diff between samples/control is 60
  247. # and maximal diff would be 100
  248. sample_less = rng.integers(0, 20, size=(10,))
  249. control = rng.integers(80, 100, size=(10,))
  250. sample_greater = rng.integers(160, 180, size=(10,))
  251. res = stats.dunnett(
  252. sample_less, sample_greater, control=control,
  253. alternative=alternative, rng=rng
  254. )
  255. assert_allclose(res.pvalue, pvalue, atol=1e-7)
  256. ci = res.confidence_interval()
  257. # two-sided is comparable for high/low
  258. if alternative == 'less':
  259. assert np.isneginf(ci.low).all()
  260. assert -100 < ci.high[0] < -60
  261. assert 60 < ci.high[1] < 100
  262. elif alternative == 'greater':
  263. assert -100 < ci.low[0] < -60
  264. assert 60 < ci.low[1] < 100
  265. assert np.isposinf(ci.high).all()
  266. elif alternative == 'two-sided':
  267. assert -100 < ci.low[0] < -60
  268. assert 60 < ci.low[1] < 100
  269. assert -100 < ci.high[0] < -60
  270. assert 60 < ci.high[1] < 100
  271. @pytest.mark.parametrize("case", [case_1, case_2, case_3, case_4])
  272. @pytest.mark.parametrize("alternative", ['less', 'greater', 'two-sided'])
  273. def test_against_R_multicomp_glht(self, case, alternative):
  274. rng = np.random.default_rng(189117774084579816190295271136455278291)
  275. samples = case['samples']
  276. control = case['control']
  277. alternatives = {'less': 'less', 'greater': 'greater',
  278. 'two-sided': 'twosided'}
  279. p_ref = case['pvalues'][alternative.replace('-', '')]
  280. res = stats.dunnett(*samples, control=control, alternative=alternative,
  281. rng=rng)
  282. # atol can't be tighter because R reports some pvalues as "< 1e-4"
  283. assert_allclose(res.pvalue, p_ref, rtol=5e-3, atol=1e-4)
  284. ci_ref = case['cis'][alternatives[alternative]]
  285. if alternative == "greater":
  286. ci_ref = [ci_ref, np.inf]
  287. elif alternative == "less":
  288. ci_ref = [-np.inf, ci_ref]
  289. assert res._ci is None
  290. assert res._ci_cl is None
  291. ci = res.confidence_interval(confidence_level=0.95)
  292. assert_allclose(ci.low, ci_ref[0], rtol=5e-3, atol=1e-5)
  293. assert_allclose(ci.high, ci_ref[1], rtol=5e-3, atol=1e-5)
  294. # re-run to use the cached value "is" to check id as same object
  295. assert res._ci is ci
  296. assert res._ci_cl == 0.95
  297. ci_ = res.confidence_interval(confidence_level=0.95)
  298. assert ci_ is ci
  299. @pytest.mark.parametrize('alternative', ["two-sided", "less", "greater"])
  300. def test_str(self, alternative):
  301. rng = np.random.default_rng(189117774084579816190295271136455278291)
  302. res = stats.dunnett(
  303. *self.samples_3, control=self.control_3, alternative=alternative,
  304. rng=rng
  305. )
  306. # check some str output
  307. res_str = str(res)
  308. assert '(Sample 2 - Control)' in res_str
  309. assert '95.0%' in res_str
  310. if alternative == 'less':
  311. assert '-inf' in res_str
  312. assert '19.' in res_str
  313. elif alternative == 'greater':
  314. assert 'inf' in res_str
  315. assert '-13.' in res_str
  316. else:
  317. assert 'inf' not in res_str
  318. assert '21.' in res_str
  319. def test_warnings(self):
  320. rng = np.random.default_rng(189117774084579816190295271136455278291)
  321. res = stats.dunnett(
  322. *self.samples_3, control=self.control_3, rng=rng
  323. )
  324. msg = r"Computation of the confidence interval did not converge"
  325. with pytest.warns(UserWarning, match=msg):
  326. res._allowance(tol=1e-5)
  327. def test_raises(self):
  328. samples, control = self.samples_3, self.control_3
  329. # alternative
  330. with pytest.raises(ValueError, match="alternative must be"):
  331. stats.dunnett(*samples, control=control, alternative='bob')
  332. # 2D for a sample
  333. samples_ = copy.deepcopy(samples)
  334. samples_[0] = [samples_[0]]
  335. with pytest.raises(ValueError, match="must be 1D arrays"):
  336. stats.dunnett(*samples_, control=control)
  337. # 2D for control
  338. control_ = copy.deepcopy(control)
  339. control_ = [control_]
  340. with pytest.raises(ValueError, match="must be 1D arrays"):
  341. stats.dunnett(*samples, control=control_)
  342. # No obs in a sample
  343. samples_ = copy.deepcopy(samples)
  344. samples_[1] = []
  345. with pytest.raises(ValueError, match="at least 1 observation"):
  346. stats.dunnett(*samples_, control=control)
  347. # No obs in control
  348. control_ = []
  349. with pytest.raises(ValueError, match="at least 1 observation"):
  350. stats.dunnett(*samples, control=control_)
  351. res = stats.dunnett(*samples, control=control)
  352. with pytest.raises(ValueError, match="Confidence level must"):
  353. res.confidence_interval(confidence_level=3)
  354. @pytest.mark.filterwarnings("ignore:Computation of the confidence")
  355. @pytest.mark.parametrize('n_samples', [1, 2, 3])
  356. def test_shapes(self, n_samples):
  357. rng = np.random.default_rng(689448934110805334)
  358. samples = rng.normal(size=(n_samples, 10))
  359. control = rng.normal(size=10)
  360. res = stats.dunnett(*samples, control=control, rng=rng)
  361. assert res.statistic.shape == (n_samples,)
  362. assert res.pvalue.shape == (n_samples,)
  363. ci = res.confidence_interval()
  364. assert ci.low.shape == (n_samples,)
  365. assert ci.high.shape == (n_samples,)