test_fit.py 48 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092
  1. import os
  2. import warnings
  3. import numpy as np
  4. from numpy.testing import assert_allclose, assert_equal
  5. import pytest
  6. from scipy import stats
  7. from scipy.optimize import differential_evolution
  8. from .test_continuous_basic import distcont
  9. from scipy.stats._distn_infrastructure import FitError
  10. from scipy.stats._distr_params import distdiscrete
  11. from scipy.stats import goodness_of_fit
  12. # this is not a proper statistical test for convergence, but only
  13. # verifies that the estimate and true values don't differ by too much
  14. fit_sizes = [1000, 5000, 10000] # sample sizes to try
  15. thresh_percent = 0.25 # percent of true parameters for fail cut-off
  16. thresh_min = 0.75 # minimum difference estimate - true to fail test
  17. mle_failing_fits = [
  18. 'dpareto_lognorm',
  19. 'gausshyper',
  20. 'genexpon',
  21. 'gengamma',
  22. 'irwinhall',
  23. 'kappa4',
  24. 'ksone',
  25. 'kstwo',
  26. 'ncf',
  27. 'ncx2',
  28. 'truncexpon',
  29. 'tukeylambda',
  30. 'vonmises',
  31. 'levy_stable',
  32. 'truncpareto',
  33. 'truncweibull_min',
  34. 'studentized_range',
  35. ]
  36. # these pass but are XSLOW (>1s)
  37. mle_Xslow_fits = ['betaprime', 'crystalball', 'exponweib', 'f', 'geninvgauss',
  38. 'jf_skew_t', 'nct', 'recipinvgauss', 'rel_breitwigner',
  39. 'vonmises_line']
  40. # The MLE fit method of these distributions doesn't perform well when all
  41. # parameters are fit, so test them with the location fixed at 0.
  42. mle_use_floc0 = [
  43. 'burr',
  44. 'chi',
  45. 'chi2',
  46. 'mielke',
  47. 'pearson3',
  48. 'genhalflogistic',
  49. 'rdist',
  50. 'pareto',
  51. 'powerlaw', # distfn.nnlf(est2, rvs) > distfn.nnlf(est1, rvs) otherwise
  52. 'powerlognorm',
  53. 'wrapcauchy',
  54. 'rel_breitwigner',
  55. ]
  56. mm_failing_fits = ['alpha', 'betaprime', 'burr', 'burr12', 'cauchy', 'chi',
  57. 'chi2', 'crystalball', 'dgamma', 'dpareto_lognorm', 'dweibull',
  58. 'f', 'fatiguelife', 'fisk', 'foldcauchy', 'genextreme',
  59. 'gengamma', 'genhyperbolic', 'gennorm', 'genpareto',
  60. 'halfcauchy', 'invgamma', 'invweibull', 'irwinhall', 'jf_skew_t',
  61. 'johnsonsu', 'kappa3', 'ksone', 'kstwo', 'landau', 'levy', 'levy_l',
  62. 'levy_stable', 'loglaplace', 'lomax', 'mielke', 'nakagami',
  63. 'ncf', 'nct', 'ncx2', 'pareto', 'powerlognorm', 'powernorm',
  64. 'rel_breitwigner', 'skewcauchy', 't', 'triang',
  65. 'truncpareto', 'truncweibull_min', 'tukeylambda',
  66. 'studentized_range']
  67. # not sure if these fail, but they caused my patience to fail
  68. mm_XXslow_fits = ['argus', 'exponpow', 'exponweib', 'gausshyper', 'genexpon',
  69. 'genhalflogistic', 'halfgennorm', 'gompertz', 'johnsonsb',
  70. 'kappa4', 'kstwobign', 'recipinvgauss',
  71. 'truncexpon', 'vonmises', 'vonmises_line']
  72. # these pass but are XSLOW (>1s)
  73. mm_Xslow_fits = ['wrapcauchy']
  74. failing_fits = {"MM": mm_failing_fits + mm_XXslow_fits, "MLE": mle_failing_fits}
  75. xslow_fits = {"MM": mm_Xslow_fits, "MLE": mle_Xslow_fits}
  76. fail_interval_censored = {"truncpareto"}
  77. # Don't run the fit test on these:
  78. skip_fit = [
  79. 'erlang', # Subclass of gamma, generates a warning.
  80. 'genhyperbolic', 'norminvgauss', # too slow
  81. ]
  82. def cases_test_cont_fit():
  83. # this tests the closeness of the estimated parameters to the true
  84. # parameters with fit method of continuous distributions
  85. # Note: is slow, some distributions don't converge with sample
  86. # size <= 10000
  87. for distname, arg in distcont:
  88. if distname not in skip_fit:
  89. yield distname, arg
  90. @pytest.mark.slow
  91. @pytest.mark.parametrize('distname,arg', cases_test_cont_fit())
  92. @pytest.mark.parametrize('method', ["MLE", "MM"])
  93. def test_cont_fit(distname, arg, method):
  94. run_xfail = int(os.getenv('SCIPY_XFAIL', default=False))
  95. run_xslow = int(os.getenv('SCIPY_XSLOW', default=False))
  96. if distname in failing_fits[method] and not run_xfail:
  97. # The generic `fit` method can't be expected to work perfectly for all
  98. # distributions, data, and guesses. Some failures are expected.
  99. msg = "Failure expected; set environment variable SCIPY_XFAIL=1 to run."
  100. pytest.xfail(msg)
  101. if distname in xslow_fits[method] and not run_xslow:
  102. msg = "Very slow; set environment variable SCIPY_XSLOW=1 to run."
  103. pytest.skip(msg)
  104. distfn = getattr(stats, distname)
  105. truearg = np.hstack([arg, [0.0, 1.0]])
  106. diffthreshold = np.max(np.vstack([truearg*thresh_percent,
  107. np.full(distfn.numargs+2, thresh_min)]),
  108. 0)
  109. for fit_size in fit_sizes:
  110. # Note that if a fit succeeds, the other fit_sizes are skipped
  111. rng = np.random.default_rng(1234)
  112. with np.errstate(all='ignore'):
  113. rvs = distfn.rvs(size=fit_size, *arg, random_state=rng)
  114. if method == 'MLE' and distfn.name in mle_use_floc0:
  115. kwds = {'floc': 0}
  116. else:
  117. kwds = {}
  118. # start with default values
  119. est = distfn.fit(rvs, method=method, **kwds)
  120. if method == 'MLE':
  121. # Trivial test of the use of CensoredData. The fit() method
  122. # will check that data contains no actual censored data, and
  123. # do a regular uncensored fit.
  124. data1 = stats.CensoredData(rvs)
  125. est1 = distfn.fit(data1, **kwds)
  126. msg = ('Different results fitting uncensored data wrapped as'
  127. f' CensoredData: {distfn.name}: est={est} est1={est1}')
  128. assert_allclose(est1, est, rtol=1e-10, err_msg=msg)
  129. if method == 'MLE' and distname not in fail_interval_censored:
  130. # Convert the first `nic` values in rvs to interval-censored
  131. # values. The interval is small, so est2 should be close to
  132. # est.
  133. nic = 15
  134. interval = np.column_stack((rvs, rvs))
  135. interval[:nic, 0] *= 0.99
  136. interval[:nic, 1] *= 1.01
  137. interval.sort(axis=1)
  138. data2 = stats.CensoredData(interval=interval)
  139. est2 = distfn.fit(data2, **kwds)
  140. msg = ('Different results fitting interval-censored'
  141. f' data: {distfn.name}: est={est} est2={est2}')
  142. assert_allclose(est2, est, rtol=0.05, err_msg=msg)
  143. diff = est - truearg
  144. # threshold for location
  145. diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,
  146. thresh_min])
  147. if np.any(np.isnan(est)):
  148. raise AssertionError('nan returned in fit')
  149. else:
  150. if np.all(np.abs(diff) <= diffthreshold):
  151. break
  152. else:
  153. txt = f'parameter: {str(truearg)}\n'
  154. txt += f'estimated: {str(est)}\n'
  155. txt += f'diff : {str(diff)}\n'
  156. raise AssertionError(f'fit not very good in {distfn.name}\n' + txt)
  157. def _check_loc_scale_mle_fit(name, data, desired, atol=None):
  158. d = getattr(stats, name)
  159. actual = d.fit(data)[-2:]
  160. assert_allclose(actual, desired, atol=atol,
  161. err_msg=f'poor mle fit of (loc, scale) in {name}')
  162. def test_non_default_loc_scale_mle_fit():
  163. data = np.array([1.01, 1.78, 1.78, 1.78, 1.88, 1.88, 1.88, 2.00])
  164. _check_loc_scale_mle_fit('uniform', data, [1.01, 0.99], 1e-3)
  165. _check_loc_scale_mle_fit('expon', data, [1.01, 0.73875], 1e-3)
  166. def test_expon_fit():
  167. """gh-6167"""
  168. data = [0, 0, 0, 0, 2, 2, 2, 2]
  169. phat = stats.expon.fit(data, floc=0)
  170. assert_allclose(phat, [0, 1.0], atol=1e-3)
  171. def test_fit_error():
  172. data = np.concatenate([np.zeros(29), np.ones(21)])
  173. message = "Optimization converged to parameters that are..."
  174. with pytest.raises(FitError, match=message), \
  175. pytest.warns(RuntimeWarning):
  176. stats.beta.fit(data)
  177. @pytest.mark.parametrize("dist, params",
  178. [(stats.norm, (0.5, 2.5)), # type: ignore[attr-defined]
  179. (stats.binom, (10, 0.3, 2))]) # type: ignore[attr-defined]
  180. def test_nnlf_and_related_methods(dist, params):
  181. rng = np.random.default_rng(983459824)
  182. if hasattr(dist, 'pdf'):
  183. logpxf = dist.logpdf
  184. else:
  185. logpxf = dist.logpmf
  186. x = dist.rvs(*params, size=100, random_state=rng)
  187. ref = -logpxf(x, *params).sum()
  188. res1 = dist.nnlf(params, x)
  189. res2 = dist._penalized_nnlf(params, x)
  190. assert_allclose(res1, ref)
  191. assert_allclose(res2, ref)
  192. def cases_test_fit_mle():
  193. # These fail default test or hang
  194. skip_basic_fit = {'argus', 'irwinhall', 'foldnorm', 'truncpareto',
  195. 'truncweibull_min', 'ksone', 'levy_stable',
  196. 'studentized_range', 'kstwo',
  197. 'beta', 'nakagami', 'truncnorm', # don't meet tolerance
  198. 'poisson_binom'} # vector-valued shape parameter
  199. # Please keep this list in alphabetical order...
  200. slow_basic_fit = {'alpha', 'arcsine', 'betaprime', 'binom', 'bradford', 'burr12',
  201. 'chi', 'crystalball', 'dweibull', 'erlang', 'exponnorm',
  202. 'exponpow', 'f', 'fatiguelife', 'fisk', 'foldcauchy', 'gamma',
  203. 'genexpon', 'genextreme', 'gennorm', 'genpareto',
  204. 'gompertz', 'halfgennorm', 'invgamma', 'invgauss', 'invweibull',
  205. 'jf_skew_t', 'johnsonsb', 'johnsonsu', 'kappa3',
  206. 'kstwobign', 'loglaplace', 'lognorm', 'lomax', 'mielke',
  207. 'nbinom', 'norminvgauss',
  208. 'pareto', 'pearson3', 'powerlaw', 'powernorm',
  209. 'randint', 'rdist', 'recipinvgauss', 'rice', 'skewnorm',
  210. 't', 'uniform', 'weibull_max', 'weibull_min', 'wrapcauchy',
  211. 'zipfian'}
  212. # Please keep this list in alphabetical order...
  213. xslow_basic_fit = {'betabinom', 'betanbinom', 'burr', 'dpareto_lognorm',
  214. 'exponweib', 'gausshyper', 'gengamma', 'genhalflogistic',
  215. 'genhyperbolic', 'geninvgauss',
  216. 'hypergeom', 'kappa4', 'loguniform',
  217. 'ncf', 'nchypergeom_fisher', 'nchypergeom_wallenius',
  218. 'nct', 'ncx2', 'nhypergeom',
  219. 'powerlognorm', 'reciprocal', 'rel_breitwigner',
  220. 'skellam', 'trapezoid', 'triang',
  221. 'tukeylambda', 'vonmises'}
  222. for dist in dict(distdiscrete + distcont):
  223. if dist in skip_basic_fit or not isinstance(dist, str):
  224. reason = "tested separately"
  225. yield pytest.param(dist, marks=pytest.mark.skip(reason=reason))
  226. elif dist in slow_basic_fit:
  227. reason = "too slow (>= 0.25s)"
  228. yield pytest.param(dist, marks=pytest.mark.slow(reason=reason))
  229. elif dist in xslow_basic_fit:
  230. reason = "too slow (>= 1.0s)"
  231. yield pytest.param(dist, marks=pytest.mark.xslow(reason=reason))
  232. else:
  233. yield dist
  234. def cases_test_fit_mse():
  235. # the first four are so slow that I'm not sure whether they would pass
  236. skip_basic_fit = {'levy_stable', 'studentized_range', 'ksone', 'skewnorm',
  237. 'irwinhall', # hangs
  238. 'norminvgauss', # super slow (~1 hr) but passes
  239. 'kstwo', # very slow (~25 min) but passes
  240. 'geninvgauss', # quite slow (~4 minutes) but passes
  241. 'gausshyper', 'genhyperbolic', # integration warnings
  242. 'tukeylambda', # close, but doesn't meet tolerance
  243. 'vonmises', # can have negative CDF; doesn't play nice
  244. 'arcsine', 'argus', 'powerlaw', 'rdist', # don't meet tolerance
  245. 'poisson_binom', # vector-valued shape parameter
  246. }
  247. # Please keep this list in alphabetical order...
  248. slow_basic_fit = {'alpha', 'anglit', 'betabinom', 'bradford',
  249. 'chi', 'chi2', 'crystalball', 'dweibull',
  250. 'erlang', 'exponnorm', 'exponpow', 'exponweib',
  251. 'fatiguelife', 'fisk', 'foldcauchy', 'foldnorm',
  252. 'gamma', 'genexpon', 'genextreme', 'genhalflogistic',
  253. 'genlogistic', 'genpareto', 'gompertz',
  254. 'hypergeom', 'invweibull',
  255. 'johnsonsu', 'kappa3', 'kstwobign',
  256. 'laplace_asymmetric', 'loggamma', 'loglaplace',
  257. 'lognorm', 'lomax',
  258. 'maxwell', 'nhypergeom',
  259. 'pareto', 'powernorm', 'randint', 'recipinvgauss',
  260. 'semicircular',
  261. 't', 'triang', 'truncexpon', 'truncpareto',
  262. 'uniform',
  263. 'wald', 'weibull_max', 'weibull_min', 'wrapcauchy',
  264. 'zipfian'}
  265. # Please keep this list in alphabetical order...
  266. xslow_basic_fit = {'argus', 'beta', 'betaprime', 'burr', 'burr12',
  267. 'dgamma', 'dpareto_lognorm', 'f', 'gengamma', 'gennorm',
  268. 'halfgennorm', 'invgamma', 'invgauss', 'jf_skew_t',
  269. 'johnsonsb', 'kappa4', 'loguniform', 'mielke',
  270. 'nakagami', 'ncf', 'nchypergeom_fisher',
  271. 'nchypergeom_wallenius', 'nct', 'ncx2',
  272. 'pearson3', 'powerlognorm',
  273. 'reciprocal', 'rel_breitwigner', 'rice',
  274. 'trapezoid', 'truncnorm', 'truncweibull_min',
  275. 'vonmises_line'}
  276. warns_basic_fit = {'skellam'} # can remove mark after gh-14901 is resolved
  277. for dist in dict(distdiscrete + distcont):
  278. if dist in skip_basic_fit or not isinstance(dist, str):
  279. reason = "Fails. Oh well."
  280. yield pytest.param(dist, marks=pytest.mark.skip(reason=reason))
  281. elif dist in slow_basic_fit:
  282. reason = "too slow (>= 0.25s)"
  283. yield pytest.param(dist, marks=pytest.mark.slow(reason=reason))
  284. elif dist in xslow_basic_fit:
  285. reason = "too slow (>= 1.0s)"
  286. yield pytest.param(dist, marks=pytest.mark.xslow(reason=reason))
  287. elif dist in warns_basic_fit:
  288. mark = pytest.mark.filterwarnings('ignore::RuntimeWarning')
  289. yield pytest.param(dist, marks=mark)
  290. else:
  291. yield dist
  292. def cases_test_fitstart():
  293. for distname, shapes in dict(distcont).items():
  294. if (not isinstance(distname, str) or
  295. distname in {'studentized_range', 'recipinvgauss'}): # slow
  296. continue
  297. yield distname, shapes
  298. @pytest.mark.parametrize('distname, shapes', cases_test_fitstart())
  299. def test_fitstart(distname, shapes):
  300. dist = getattr(stats, distname)
  301. rng = np.random.default_rng(216342614)
  302. data = rng.random(10)
  303. with np.errstate(invalid='ignore', divide='ignore'): # irrelevant to test
  304. guess = dist._fitstart(data)
  305. assert dist._argcheck(*guess[:-2])
  306. def assert_nlff_less_or_close(dist, data, params1, params0, rtol=1e-7, atol=0,
  307. nlff_name='nnlf'):
  308. nlff = getattr(dist, nlff_name)
  309. nlff1 = nlff(params1, data)
  310. nlff0 = nlff(params0, data)
  311. if not (nlff1 < nlff0):
  312. np.testing.assert_allclose(nlff1, nlff0, rtol=rtol, atol=atol)
  313. class TestFit:
  314. dist = stats.binom # type: ignore[attr-defined]
  315. seed = 654634816187
  316. rng = np.random.default_rng(seed)
  317. data = stats.binom.rvs(5, 0.5, size=100, random_state=rng) # type: ignore[attr-defined] # noqa: E501
  318. shape_bounds_a = [(1, 10), (0, 1)]
  319. shape_bounds_d = {'n': (1, 10), 'p': (0, 1)}
  320. atol = 5e-2
  321. rtol = 1e-2
  322. tols = {'atol': atol, 'rtol': rtol}
  323. def opt(self, *args, rng=1, **kwds):
  324. return differential_evolution(*args, rng=rng, **kwds)
  325. def test_dist_iv(self):
  326. message = "`dist` must be an instance of..."
  327. with pytest.raises(ValueError, match=message):
  328. stats.fit(10, self.data, self.shape_bounds_a)
  329. def test_data_iv(self):
  330. message = "`data` must be exactly one-dimensional."
  331. with pytest.raises(ValueError, match=message):
  332. stats.fit(self.dist, [[1, 2, 3]], self.shape_bounds_a)
  333. message = "All elements of `data` must be finite numbers."
  334. with pytest.raises(ValueError, match=message):
  335. stats.fit(self.dist, [1, 2, 3, np.nan], self.shape_bounds_a)
  336. with pytest.raises(ValueError, match=message):
  337. stats.fit(self.dist, [1, 2, 3, np.inf], self.shape_bounds_a)
  338. with pytest.raises(ValueError, match=message):
  339. stats.fit(self.dist, ['1', '2', '3'], self.shape_bounds_a)
  340. def test_bounds_iv(self):
  341. message = "Bounds provided for the following unrecognized..."
  342. shape_bounds = {'n': (1, 10), 'p': (0, 1), '1': (0, 10)}
  343. with pytest.warns(RuntimeWarning, match=message):
  344. stats.fit(self.dist, self.data, shape_bounds)
  345. message = "Each element of a `bounds` sequence must be a tuple..."
  346. shape_bounds = [(1, 10, 3), (0, 1)]
  347. with pytest.raises(ValueError, match=message):
  348. stats.fit(self.dist, self.data, shape_bounds)
  349. message = "Each element of `bounds` must be a tuple specifying..."
  350. shape_bounds = [(1, 10, 3), (0, 1, 0.5)]
  351. with pytest.raises(ValueError, match=message):
  352. stats.fit(self.dist, self.data, shape_bounds)
  353. shape_bounds = [1, 0]
  354. with pytest.raises(ValueError, match=message):
  355. stats.fit(self.dist, self.data, shape_bounds)
  356. message = "A `bounds` sequence must contain at least 2 elements..."
  357. shape_bounds = [(1, 10)]
  358. with pytest.raises(ValueError, match=message):
  359. stats.fit(self.dist, self.data, shape_bounds)
  360. message = "A `bounds` sequence may not contain more than 3 elements..."
  361. bounds = [(1, 10), (1, 10), (1, 10), (1, 10)]
  362. with pytest.raises(ValueError, match=message):
  363. stats.fit(self.dist, self.data, bounds)
  364. message = "There are no values for `p` on the interval..."
  365. shape_bounds = {'n': (1, 10), 'p': (1, 0)}
  366. with pytest.raises(ValueError, match=message):
  367. stats.fit(self.dist, self.data, shape_bounds)
  368. message = "There are no values for `n` on the interval..."
  369. shape_bounds = [(10, 1), (0, 1)]
  370. with pytest.raises(ValueError, match=message):
  371. stats.fit(self.dist, self.data, shape_bounds)
  372. message = "There are no integer values for `n` on the interval..."
  373. shape_bounds = [(1.4, 1.6), (0, 1)]
  374. with pytest.raises(ValueError, match=message):
  375. stats.fit(self.dist, self.data, shape_bounds)
  376. message = "The intersection of user-provided bounds for `n`"
  377. with pytest.raises(ValueError, match=message):
  378. stats.fit(self.dist, self.data)
  379. shape_bounds = [(-np.inf, np.inf), (0, 1)]
  380. with pytest.raises(ValueError, match=message):
  381. stats.fit(self.dist, self.data, shape_bounds)
  382. def test_guess_iv(self):
  383. message = "Guesses provided for the following unrecognized..."
  384. guess = {'n': 1, 'p': 0.5, '1': 255}
  385. with pytest.warns(RuntimeWarning, match=message):
  386. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  387. message = "Each element of `guess` must be a scalar..."
  388. guess = {'n': 1, 'p': 'hi'}
  389. with pytest.raises(ValueError, match=message):
  390. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  391. guess = [1, 'f']
  392. with pytest.raises(ValueError, match=message):
  393. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  394. guess = [[1, 2]]
  395. with pytest.raises(ValueError, match=message):
  396. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  397. message = "A `guess` sequence must contain at least 2..."
  398. guess = [1]
  399. with pytest.raises(ValueError, match=message):
  400. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  401. message = "A `guess` sequence may not contain more than 3..."
  402. guess = [1, 2, 3, 4]
  403. with pytest.raises(ValueError, match=message):
  404. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  405. message = "Guess for parameter `n` rounded.*|Guess for parameter `p` clipped.*"
  406. guess = {'n': 4.5, 'p': -0.5}
  407. with pytest.warns(RuntimeWarning, match=message):
  408. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  409. message = "Guess for parameter `loc` rounded..."
  410. guess = [5, 0.5, 0.5]
  411. with pytest.warns(RuntimeWarning, match=message):
  412. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  413. message = "Guess for parameter `p` clipped..."
  414. guess = {'n': 5, 'p': -0.5}
  415. with pytest.warns(RuntimeWarning, match=message):
  416. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  417. message = "Guess for parameter `loc` clipped..."
  418. guess = [5, 0.5, 1]
  419. with pytest.warns(RuntimeWarning, match=message):
  420. stats.fit(self.dist, self.data, self.shape_bounds_d, guess=guess)
  421. def basic_fit_test(self, dist_name, method, rng=1):
  422. N = 5000
  423. dist_data = dict(distcont + distdiscrete)
  424. rng = np.random.default_rng(self.seed)
  425. dist = getattr(stats, dist_name)
  426. shapes = np.array(dist_data[dist_name])
  427. bounds = np.empty((len(shapes) + 2, 2), dtype=np.float64)
  428. bounds[:-2, 0] = shapes/10.**np.sign(shapes)
  429. bounds[:-2, 1] = shapes*10.**np.sign(shapes)
  430. bounds[-2] = (0, 10)
  431. bounds[-1] = (1e-16, 10)
  432. loc = rng.uniform(*bounds[-2])
  433. scale = rng.uniform(*bounds[-1])
  434. ref = list(dist_data[dist_name]) + [loc, scale]
  435. if getattr(dist, 'pmf', False):
  436. ref = ref[:-1]
  437. ref[-1] = np.floor(loc)
  438. data = dist.rvs(*ref, size=N, random_state=rng)
  439. bounds = bounds[:-1]
  440. if getattr(dist, 'pdf', False):
  441. data = dist.rvs(*ref, size=N, random_state=rng)
  442. with warnings.catch_warnings():
  443. warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning)
  444. res = stats.fit(dist, data, bounds, method=method,
  445. optimizer=self.opt)
  446. nlff_names = {'mle': 'nnlf', 'mse': '_penalized_nlpsf'}
  447. nlff_name = nlff_names[method]
  448. assert_nlff_less_or_close(dist, data, res.params, ref, **self.tols,
  449. nlff_name=nlff_name)
  450. @pytest.mark.parametrize("dist_name", cases_test_fit_mle())
  451. def test_basic_fit_mle(self, dist_name):
  452. self.basic_fit_test(dist_name, "mle", rng=5)
  453. @pytest.mark.parametrize("dist_name", cases_test_fit_mse())
  454. def test_basic_fit_mse(self, dist_name):
  455. self.basic_fit_test(dist_name, "mse", rng=2)
  456. @pytest.mark.slow
  457. def test_arcsine(self):
  458. # Can't guarantee that all distributions will fit all data with
  459. # arbitrary bounds. This distribution just happens to fail above.
  460. # Try something slightly different.
  461. N = 1000
  462. rng = np.random.default_rng(self.seed)
  463. dist = stats.arcsine
  464. shapes = (1., 2.)
  465. data = dist.rvs(*shapes, size=N, random_state=rng)
  466. shape_bounds = {'loc': (0.1, 10), 'scale': (0.1, 10)}
  467. res = stats.fit(dist, data, shape_bounds, method='mse', optimizer=self.opt)
  468. assert_nlff_less_or_close(dist, data, res.params, shapes,
  469. nlff_name='_penalized_nlpsf', **self.tols)
  470. @pytest.mark.parametrize("method", ('mle', 'mse'))
  471. def test_argus(self, method):
  472. # Can't guarantee that all distributions will fit all data with
  473. # arbitrary bounds. This distribution just happens to fail above.
  474. # Try something slightly different.
  475. N = 1000
  476. rng = np.random.default_rng(self.seed)
  477. dist = stats.argus
  478. shapes = (1., 2., 3.)
  479. data = dist.rvs(*shapes, size=N, random_state=rng)
  480. shape_bounds = {'chi': (0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
  481. res = stats.fit(dist, data, shape_bounds, optimizer=self.opt, method=method)
  482. nlff_name = {'mle': 'nnlf', 'mse': '_penalized_nlpsf'}[method]
  483. assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols,
  484. nlff_name=nlff_name)
  485. @pytest.mark.xslow
  486. def test_beta(self):
  487. # Can't guarantee that all distributions will fit all data with
  488. # arbitrary bounds. This distribution just happens to fail above.
  489. # Try something slightly different.
  490. N = 1000
  491. rng = np.random.default_rng(self.seed)
  492. dist = stats.beta
  493. shapes = (2.3098496451481823, 0.62687954300963677, 1., 2.)
  494. data = dist.rvs(*shapes, size=N, random_state=rng)
  495. shape_bounds = {'a': (0.1, 10), 'b':(0.1, 10),
  496. 'loc': (0.1, 10), 'scale': (0.1, 10)}
  497. res = stats.fit(dist, data, shape_bounds, method='mle', optimizer=self.opt)
  498. assert_nlff_less_or_close(dist, data, res.params, shapes,
  499. nlff_name='nnlf', **self.tols)
  500. def test_foldnorm(self):
  501. # Can't guarantee that all distributions will fit all data with
  502. # arbitrary bounds. This distribution just happens to fail above.
  503. # Try something slightly different.
  504. N = 1000
  505. rng = np.random.default_rng(self.seed)
  506. dist = stats.foldnorm
  507. shapes = (1.952125337355587, 2., 3.)
  508. data = dist.rvs(*shapes, size=N, random_state=rng)
  509. shape_bounds = {'c': (0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
  510. res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)
  511. assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols)
  512. def test_nakagami(self):
  513. # Can't guarantee that all distributions will fit all data with
  514. # arbitrary bounds. This distribution just happens to fail above.
  515. # Try something slightly different.
  516. N = 1000
  517. rng = np.random.default_rng(self.seed)
  518. dist = stats.nakagami
  519. shapes = (4.9673794866666237, 1., 2.)
  520. data = dist.rvs(*shapes, size=N, random_state=rng)
  521. shape_bounds = {'nu':(0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
  522. res = stats.fit(dist, data, shape_bounds, method='mle', optimizer=self.opt)
  523. assert_nlff_less_or_close(dist, data, res.params, shapes,
  524. nlff_name='nnlf', **self.tols)
  525. @pytest.mark.slow
  526. def test_powerlaw(self):
  527. # Can't guarantee that all distributions will fit all data with
  528. # arbitrary bounds. This distribution just happens to fail above.
  529. # Try something slightly different.
  530. N = 1000
  531. rng = np.random.default_rng(self.seed)
  532. dist = stats.powerlaw
  533. shapes = (1.6591133289905851, 1., 2.)
  534. data = dist.rvs(*shapes, size=N, random_state=rng)
  535. shape_bounds = {'a': (0.1, 10), 'loc': (0.1, 10), 'scale': (0.1, 10)}
  536. res = stats.fit(dist, data, shape_bounds, method='mse', optimizer=self.opt)
  537. assert_nlff_less_or_close(dist, data, res.params, shapes,
  538. nlff_name='_penalized_nlpsf', **self.tols)
  539. def test_truncpareto(self):
  540. # Can't guarantee that all distributions will fit all data with
  541. # arbitrary bounds. This distribution just happens to fail above.
  542. # Try something slightly different.
  543. N = 1000
  544. rng = np.random.default_rng(self.seed)
  545. dist = stats.truncpareto
  546. shapes = (1.8, 5.3, 2.3, 4.1)
  547. data = dist.rvs(*shapes, size=N, random_state=rng)
  548. shape_bounds = [(0.1, 10)]*4
  549. res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)
  550. assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols)
  551. @pytest.mark.slow
  552. def test_truncweibull_min(self):
  553. # Can't guarantee that all distributions will fit all data with
  554. # arbitrary bounds. This distribution just happens to fail above.
  555. # Try something slightly different.
  556. N = 1000
  557. rng = np.random.default_rng(self.seed)
  558. dist = stats.truncweibull_min
  559. shapes = (2.5, 0.25, 1.75, 2., 3.)
  560. data = dist.rvs(*shapes, size=N, random_state=rng)
  561. shape_bounds = [(0.1, 10)]*5
  562. res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)
  563. assert_nlff_less_or_close(dist, data, res.params, shapes, **self.tols)
  564. def test_missing_shape_bounds(self):
  565. # some distributions have a small domain w.r.t. a parameter, e.g.
  566. # $p \in [0, 1]$ for binomial distribution
  567. # User does not need to provide these because the intersection of the
  568. # user's bounds (none) and the distribution's domain is finite
  569. N = 1000
  570. rng = np.random.default_rng(self.seed)
  571. dist = stats.binom
  572. n, p, loc = 10, 0.65, 0
  573. data = dist.rvs(n, p, loc=loc, size=N, random_state=rng)
  574. shape_bounds = {'n': np.array([0, 20])} # check arrays are OK, too
  575. res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)
  576. assert_allclose(res.params, (n, p, loc), **self.tols)
  577. dist = stats.bernoulli
  578. p, loc = 0.314159, 0
  579. data = dist.rvs(p, loc=loc, size=N, random_state=rng)
  580. res = stats.fit(dist, data, optimizer=self.opt)
  581. assert_allclose(res.params, (p, loc), **self.tols)
  582. def test_fit_only_loc_scale(self):
  583. # fit only loc
  584. N = 5000
  585. rng = np.random.default_rng(self.seed)
  586. dist = stats.norm
  587. loc, scale = 1.5, 1
  588. data = dist.rvs(loc=loc, size=N, random_state=rng)
  589. loc_bounds = (0, 5)
  590. bounds = {'loc': loc_bounds}
  591. res = stats.fit(dist, data, bounds, optimizer=self.opt)
  592. assert_allclose(res.params, (loc, scale), **self.tols)
  593. # fit only scale
  594. loc, scale = 0, 2.5
  595. data = dist.rvs(scale=scale, size=N, random_state=rng)
  596. scale_bounds = (0.01, 5)
  597. bounds = {'scale': scale_bounds}
  598. res = stats.fit(dist, data, bounds, optimizer=self.opt)
  599. assert_allclose(res.params, (loc, scale), **self.tols)
  600. # fit only loc and scale
  601. dist = stats.norm
  602. loc, scale = 1.5, 2.5
  603. data = dist.rvs(loc=loc, scale=scale, size=N, random_state=rng)
  604. bounds = {'loc': loc_bounds, 'scale': scale_bounds}
  605. res = stats.fit(dist, data, bounds, optimizer=self.opt)
  606. assert_allclose(res.params, (loc, scale), **self.tols)
  607. def test_everything_fixed(self):
  608. N = 5000
  609. rng = np.random.default_rng(self.seed)
  610. dist = stats.norm
  611. loc, scale = 1.5, 2.5
  612. data = dist.rvs(loc=loc, scale=scale, size=N, random_state=rng)
  613. # loc, scale fixed to 0, 1 by default
  614. res = stats.fit(dist, data)
  615. assert_allclose(res.params, (0, 1), **self.tols)
  616. # loc, scale explicitly fixed
  617. bounds = {'loc': (loc, loc), 'scale': (scale, scale)}
  618. res = stats.fit(dist, data, bounds)
  619. assert_allclose(res.params, (loc, scale), **self.tols)
  620. # `n` gets fixed during polishing
  621. dist = stats.binom
  622. n, p, loc = 10, 0.65, 0
  623. data = dist.rvs(n, p, loc=loc, size=N, random_state=rng)
  624. shape_bounds = {'n': (0, 20), 'p': (0.65, 0.65)}
  625. res = stats.fit(dist, data, shape_bounds, optimizer=self.opt)
  626. assert_allclose(res.params, (n, p, loc), **self.tols)
  627. def test_failure(self):
  628. N = 5000
  629. rng = np.random.default_rng(self.seed)
  630. dist = stats.nbinom
  631. shapes = (5, 0.5)
  632. data = dist.rvs(*shapes, size=N, random_state=rng)
  633. assert data.min() == 0
  634. # With lower bounds on location at 0.5, likelihood is zero
  635. bounds = [(0, 30), (0, 1), (0.5, 10)]
  636. res = stats.fit(dist, data, bounds)
  637. message = "Optimization converged to parameter values that are"
  638. assert res.message.startswith(message)
  639. assert res.success is False
  640. @pytest.mark.xslow
  641. def test_guess(self):
  642. # Test that guess helps DE find the desired solution
  643. N = 2000
  644. # With some seeds, `fit` doesn't need a guess
  645. rng = np.random.default_rng(196390444561)
  646. dist = stats.nhypergeom
  647. params = (20, 7, 12, 0)
  648. bounds = [(2, 200), (0.7, 70), (1.2, 120), (0, 10)]
  649. data = dist.rvs(*params, size=N, random_state=rng)
  650. res = stats.fit(dist, data, bounds, optimizer=self.opt)
  651. assert not np.allclose(res.params, params, **self.tols)
  652. res = stats.fit(dist, data, bounds, guess=params, optimizer=self.opt)
  653. assert_allclose(res.params, params, **self.tols)
  654. def test_mse_accuracy_1(self):
  655. # Test maximum spacing estimation against example from Wikipedia
  656. # https://en.wikipedia.org/wiki/Maximum_spacing_estimation#Examples
  657. data = [2, 4]
  658. dist = stats.expon
  659. bounds = {'loc': (0, 0), 'scale': (1e-8, 10)}
  660. res_mle = stats.fit(dist, data, bounds=bounds, method='mle')
  661. assert_allclose(res_mle.params.scale, 3, atol=1e-3)
  662. res_mse = stats.fit(dist, data, bounds=bounds, method='mse')
  663. assert_allclose(res_mse.params.scale, 3.915, atol=1e-3)
  664. def test_mse_accuracy_2(self):
  665. # Test maximum spacing estimation against example from Wikipedia
  666. # https://en.wikipedia.org/wiki/Maximum_spacing_estimation#Examples
  667. rng = np.random.default_rng(9843212616816518964)
  668. dist = stats.uniform
  669. n = 10
  670. data = dist(3, 6).rvs(size=n, random_state=rng)
  671. bounds = {'loc': (0, 10), 'scale': (1e-8, 10)}
  672. res = stats.fit(dist, data, bounds=bounds, method='mse')
  673. # (loc=3.608118420015416, scale=5.509323262055043)
  674. x = np.sort(data)
  675. a = (n*x[0] - x[-1])/(n - 1)
  676. b = (n*x[-1] - x[0])/(n - 1)
  677. ref = a, b-a # (3.6081133632151503, 5.509328130317254)
  678. assert_allclose(res.params, ref, rtol=1e-4)
  679. # Data from Matlab: https://www.mathworks.com/help/stats/lillietest.html
  680. examgrades = [65, 61, 81, 88, 69, 89, 55, 84, 86, 84, 71, 81, 84, 81, 78, 67,
  681. 96, 66, 73, 75, 59, 71, 69, 63, 79, 76, 63, 85, 87, 88, 80, 71,
  682. 65, 84, 71, 75, 81, 79, 64, 65, 84, 77, 70, 75, 84, 75, 73, 92,
  683. 90, 79, 80, 71, 73, 71, 58, 79, 73, 64, 77, 82, 81, 59, 54, 82,
  684. 57, 79, 79, 73, 74, 82, 63, 64, 73, 69, 87, 68, 81, 73, 83, 73,
  685. 80, 73, 73, 71, 66, 78, 64, 74, 68, 67, 75, 75, 80, 85, 74, 76,
  686. 80, 77, 93, 70, 86, 80, 81, 83, 68, 60, 85, 64, 74, 82, 81, 77,
  687. 66, 85, 75, 81, 69, 60, 83, 72]
  688. class TestGoodnessOfFit:
  689. def test_gof_iv(self):
  690. dist = stats.norm
  691. x = [1, 2, 3]
  692. message = r"`dist` must be a \(non-frozen\) instance of..."
  693. with pytest.raises(TypeError, match=message):
  694. goodness_of_fit(stats.norm(), x)
  695. message = "`data` must be a one-dimensional array of numbers."
  696. with pytest.raises(ValueError, match=message):
  697. goodness_of_fit(dist, [[1, 2, 3]])
  698. message = "`statistic` must be one of..."
  699. with pytest.raises(ValueError, match=message):
  700. goodness_of_fit(dist, x, statistic='mm')
  701. message = "`n_mc_samples` must be an integer."
  702. with pytest.raises(TypeError, match=message):
  703. goodness_of_fit(dist, x, n_mc_samples=1000.5)
  704. message = "SeedSequence expects int or sequence"
  705. with pytest.raises(TypeError, match=message):
  706. goodness_of_fit(dist, x, rng='herring')
  707. def test_against_ks(self):
  708. rng = np.random.default_rng(8517426291317196949)
  709. x = examgrades
  710. known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
  711. res = goodness_of_fit(stats.norm, x, known_params=known_params,
  712. statistic='ks', rng=rng)
  713. ref = stats.kstest(x, stats.norm(**known_params).cdf, method='exact')
  714. assert_allclose(res.statistic, ref.statistic) # ~0.0848
  715. assert_allclose(res.pvalue, ref.pvalue, atol=5e-3) # ~0.335
  716. def test_against_lilliefors(self):
  717. rng = np.random.default_rng(2291803665717442724)
  718. x = examgrades
  719. # preserve use of old random_state during SPEC 7 transition
  720. res = goodness_of_fit(stats.norm, x, statistic='ks', random_state=rng)
  721. known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
  722. ref = stats.kstest(x, stats.norm(**known_params).cdf, method='exact')
  723. assert_allclose(res.statistic, ref.statistic) # ~0.0848
  724. assert_allclose(res.pvalue, 0.0348, atol=5e-3)
  725. def test_against_cvm(self):
  726. rng = np.random.default_rng(8674330857509546614)
  727. x = examgrades
  728. known_params = {'loc': np.mean(x), 'scale': np.std(x, ddof=1)}
  729. res = goodness_of_fit(stats.norm, x, known_params=known_params,
  730. statistic='cvm', rng=rng)
  731. ref = stats.cramervonmises(x, stats.norm(**known_params).cdf)
  732. assert_allclose(res.statistic, ref.statistic) # ~0.090
  733. assert_allclose(res.pvalue, ref.pvalue, atol=5e-3) # ~0.636
  734. def test_against_anderson_case_0(self):
  735. # "Case 0" is where loc and scale are known [1]
  736. rng = np.random.default_rng(7384539336846690410)
  737. x = np.arange(1, 101)
  738. # loc that produced critical value of statistic found w/ root_scalar
  739. known_params = {'loc': 45.01575354024957, 'scale': 30}
  740. res = goodness_of_fit(stats.norm, x, known_params=known_params,
  741. statistic='ad', rng=rng)
  742. assert_allclose(res.statistic, 2.492) # See [1] Table 1A 1.0
  743. assert_allclose(res.pvalue, 0.05, atol=5e-3)
  744. def test_against_anderson_case_1(self):
  745. # "Case 1" is where scale is known and loc is fit [1]
  746. rng = np.random.default_rng(5040212485680146248)
  747. x = np.arange(1, 101)
  748. # scale that produced critical value of statistic found w/ root_scalar
  749. known_params = {'scale': 29.957112639101933}
  750. res = goodness_of_fit(stats.norm, x, known_params=known_params,
  751. statistic='ad', rng=rng)
  752. assert_allclose(res.statistic, 0.908) # See [1] Table 1B 1.1
  753. assert_allclose(res.pvalue, 0.1, atol=5e-3)
  754. def test_against_anderson_case_2(self):
  755. # "Case 2" is where loc is known and scale is fit [1]
  756. rng = np.random.default_rng(726693985720914083)
  757. x = np.arange(1, 101)
  758. # loc that produced critical value of statistic found w/ root_scalar
  759. known_params = {'loc': 44.5680212261933}
  760. res = goodness_of_fit(stats.norm, x, known_params=known_params,
  761. statistic='ad', rng=rng)
  762. assert_allclose(res.statistic, 2.904) # See [1] Table 1B 1.2
  763. assert_allclose(res.pvalue, 0.025, atol=5e-3)
  764. def test_against_anderson_case_3(self):
  765. # "Case 3" is where both loc and scale are fit [1]
  766. rng = np.random.default_rng(6763691329830218206)
  767. # c that produced critical value of statistic found w/ root_scalar
  768. x = stats.skewnorm.rvs(1.4477847789132101, loc=1, scale=2, size=100,
  769. random_state=rng)
  770. res = goodness_of_fit(stats.norm, x, statistic='ad', rng=rng)
  771. assert_allclose(res.statistic, 0.559) # See [1] Table 1B 1.2
  772. assert_allclose(res.pvalue, 0.15, atol=5e-3)
  773. @pytest.mark.xslow
  774. def test_against_anderson_gumbel_r(self):
  775. rng = np.random.default_rng(7302761058217743)
  776. # c that produced critical value of statistic found w/ root_scalar
  777. x = stats.genextreme(0.051896837188595134, loc=0.5,
  778. scale=1.5).rvs(size=1000, random_state=rng)
  779. res = goodness_of_fit(stats.gumbel_r, x, statistic='ad', rng=rng)
  780. ref = stats.anderson(x, dist='gumbel_r', method='interpolate')
  781. assert_allclose(res.statistic, ref.statistic)
  782. assert_allclose(res.pvalue, ref.pvalue, atol=5e-3)
  783. def test_against_filliben_norm(self):
  784. # Test against `stats.fit` ref. [7] Section 8 "Example"
  785. rng = np.random.default_rng(8024266430745011915)
  786. y = [6, 1, -4, 8, -2, 5, 0]
  787. known_params = {'loc': 0, 'scale': 1}
  788. res = stats.goodness_of_fit(stats.norm, y, known_params=known_params,
  789. statistic="filliben", rng=rng)
  790. # Slight discrepancy presumably due to roundoff in Filliben's
  791. # calculation. Using exact order statistic medians instead of
  792. # Filliben's approximation doesn't account for it.
  793. assert_allclose(res.statistic, 0.98538, atol=1e-4)
  794. assert 0.75 < res.pvalue < 0.9
  795. # Using R's ppcc library:
  796. # library(ppcc)
  797. # options(digits=16)
  798. # x < - c(6, 1, -4, 8, -2, 5, 0)
  799. # set.seed(100)
  800. # ppccTest(x, "qnorm", ppos="Filliben")
  801. # Discrepancy with
  802. assert_allclose(res.statistic, 0.98540957187084, rtol=2e-5)
  803. assert_allclose(res.pvalue, 0.8875, rtol=2e-3)
  804. def test_filliben_property(self):
  805. # Filliben's statistic should be independent of data location and scale
  806. rng = np.random.default_rng(8535677809395478813)
  807. x = rng.normal(loc=10, scale=0.5, size=100)
  808. res = stats.goodness_of_fit(stats.norm, x,
  809. statistic="filliben", rng=rng)
  810. known_params = {'loc': 0, 'scale': 1}
  811. ref = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
  812. statistic="filliben", rng=rng)
  813. assert_allclose(res.statistic, ref.statistic, rtol=1e-15)
  814. @pytest.mark.parametrize('case', [(25, [.928, .937, .950, .958, .966]),
  815. (50, [.959, .965, .972, .977, .981]),
  816. (95, [.977, .979, .983, .986, .989])])
  817. def test_against_filliben_norm_table(self, case):
  818. # Test against `stats.fit` ref. [7] Table 1
  819. rng = np.random.default_rng(504569995557928957)
  820. n, ref = case
  821. x = rng.random(n)
  822. known_params = {'loc': 0, 'scale': 1}
  823. res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
  824. statistic="filliben", rng=rng)
  825. percentiles = np.array([0.005, 0.01, 0.025, 0.05, 0.1])
  826. res = stats.scoreatpercentile(res.null_distribution, percentiles*100)
  827. assert_allclose(res, ref, atol=2e-3)
  828. @pytest.mark.xslow
  829. @pytest.mark.parametrize('case', [(5, 0.95772790260469, 0.4755),
  830. (6, 0.95398832257958, 0.3848),
  831. (7, 0.9432692889277, 0.2328)])
  832. def test_against_ppcc(self, case):
  833. # Test against R ppcc, e.g.
  834. # library(ppcc)
  835. # options(digits=16)
  836. # x < - c(0.52325412, 1.06907699, -0.36084066, 0.15305959, 0.99093194)
  837. # set.seed(100)
  838. # ppccTest(x, "qrayleigh", ppos="Filliben")
  839. n, ref_statistic, ref_pvalue = case
  840. rng = np.random.default_rng(7777775561439803116)
  841. x = rng.normal(size=n)
  842. res = stats.goodness_of_fit(stats.rayleigh, x, statistic="filliben",
  843. rng=rng)
  844. assert_allclose(res.statistic, ref_statistic, rtol=1e-4)
  845. assert_allclose(res.pvalue, ref_pvalue, atol=1.5e-2)
  846. def test_params_effects(self):
  847. # Ensure that `guessed_params`, `fit_params`, and `known_params` have
  848. # the intended effects.
  849. rng = np.random.default_rng(9121950977643805391)
  850. x = stats.skewnorm.rvs(-5.044559778383153, loc=1, scale=2, size=50,
  851. random_state=rng)
  852. # Show that `guessed_params` don't fit to the guess,
  853. # but `fit_params` and `known_params` respect the provided fit
  854. guessed_params = {'c': 13.4}
  855. fit_params = {'scale': 13.73}
  856. known_params = {'loc': -13.85}
  857. rng = np.random.default_rng(9121950977643805391)
  858. res1 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
  859. guessed_params=guessed_params,
  860. fit_params=fit_params,
  861. known_params=known_params, rng=rng)
  862. assert not np.allclose(res1.fit_result.params.c, 13.4)
  863. assert_equal(res1.fit_result.params.scale, 13.73)
  864. assert_equal(res1.fit_result.params.loc, -13.85)
  865. # Show that changing the guess changes the parameter that gets fit,
  866. # and it changes the null distribution
  867. guessed_params = {'c': 2}
  868. rng = np.random.default_rng(9121950977643805391)
  869. res2 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
  870. guessed_params=guessed_params,
  871. fit_params=fit_params,
  872. known_params=known_params, rng=rng)
  873. assert not np.allclose(res2.fit_result.params.c,
  874. res1.fit_result.params.c, rtol=1e-8)
  875. assert not np.allclose(res2.null_distribution,
  876. res1.null_distribution, rtol=1e-8)
  877. assert_equal(res2.fit_result.params.scale, 13.73)
  878. assert_equal(res2.fit_result.params.loc, -13.85)
  879. # If we set all parameters as fit_params and known_params,
  880. # they're all fixed to those values, but the null distribution
  881. # varies.
  882. fit_params = {'c': 13.4, 'scale': 13.73}
  883. rng = np.random.default_rng(9121950977643805391)
  884. res3 = goodness_of_fit(stats.weibull_min, x, n_mc_samples=2,
  885. guessed_params=guessed_params,
  886. fit_params=fit_params,
  887. known_params=known_params, rng=rng)
  888. assert_equal(res3.fit_result.params.c, 13.4)
  889. assert_equal(res3.fit_result.params.scale, 13.73)
  890. assert_equal(res3.fit_result.params.loc, -13.85)
  891. assert not np.allclose(res3.null_distribution, res1.null_distribution)
  892. def test_custom_statistic(self):
  893. # Test support for custom statistic function.
  894. # References:
  895. # [1] Pyke, R. (1965). "Spacings". Journal of the Royal Statistical
  896. # Society: Series B (Methodological), 27(3): 395-436.
  897. # [2] Burrows, P. M. (1979). "Selected Percentage Points of
  898. # Greenwood's Statistics". Journal of the Royal Statistical
  899. # Society. Series A (General), 142(2): 256-258.
  900. # Use the Greenwood statistic for illustration; see [1, p.402].
  901. def greenwood(dist, data, *, axis):
  902. x = np.sort(data, axis=axis)
  903. y = dist.cdf(x)
  904. d = np.diff(y, axis=axis, prepend=0, append=1)
  905. return np.sum(d ** 2, axis=axis)
  906. # Run the Monte Carlo test with sample size = 5 on a fully specified
  907. # null distribution, and compare the simulated quantiles to the exact
  908. # ones given in [2, Table 1, column (n = 5)].
  909. rng = np.random.default_rng(9121950977643805391)
  910. data = stats.expon.rvs(size=5, random_state=rng)
  911. result = goodness_of_fit(stats.expon, data,
  912. known_params={'loc': 0, 'scale': 1},
  913. statistic=greenwood, rng=rng)
  914. p = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
  915. exact_quantiles = [
  916. .183863, .199403, .210088, .226040, .239947, .253677, .268422,
  917. .285293, .306002, .334447, .382972, .432049, .547468]
  918. simulated_quantiles = np.quantile(result.null_distribution, p)
  919. assert_allclose(simulated_quantiles, exact_quantiles, atol=0.005)
  920. class TestFitResult:
  921. def test_plot_iv(self):
  922. rng = np.random.default_rng(1769658657308472721)
  923. data = stats.norm.rvs(0, 1, size=100, random_state=rng)
  924. def optimizer(*args, **kwargs):
  925. return differential_evolution(*args, **kwargs, rng=rng)
  926. bounds = [(0, 30), (0, 1)]
  927. res = stats.fit(stats.norm, data, bounds, optimizer=optimizer)
  928. try:
  929. import matplotlib # noqa: F401
  930. message = r"`plot_type` must be one of \{'..."
  931. with pytest.raises(ValueError, match=message):
  932. res.plot(plot_type='llama')
  933. except (ModuleNotFoundError, ImportError):
  934. message = r"matplotlib must be installed to use method `plot`."
  935. with pytest.raises(ModuleNotFoundError, match=message):
  936. res.plot(plot_type='llama')