| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956 |
- """Quasi-Monte Carlo engines and helpers."""
- import copy
- import math
- import numbers
- import os
- import warnings
- from abc import ABC, abstractmethod
- from functools import partial
- from typing import (
- ClassVar,
- Literal,
- overload,
- TYPE_CHECKING,
- )
- from collections.abc import Callable
- import numpy as np
- from scipy._lib._util import DecimalNumber, GeneratorType, IntNumber, SeedType
- if TYPE_CHECKING:
- import numpy.typing as npt
- import scipy.stats as stats
- from scipy._lib._util import rng_integers, _rng_spawn, _transition_to_rng
- from scipy.sparse.csgraph import minimum_spanning_tree
- from scipy.spatial import distance, Voronoi
- from scipy.special import gammainc
- from ._sobol import (
- _initialize_v, _cscramble, _fill_p_cumulative, _draw, _fast_forward,
- _categorize, _MAXDIM
- )
- from ._qmc_cy import (
- _cy_wrapper_centered_discrepancy,
- _cy_wrapper_wrap_around_discrepancy,
- _cy_wrapper_mixture_discrepancy,
- _cy_wrapper_l2_star_discrepancy,
- _cy_wrapper_update_discrepancy,
- _cy_van_der_corput_scrambled,
- _cy_van_der_corput,
- )
- __all__ = ['scale', 'discrepancy', 'geometric_discrepancy', 'update_discrepancy',
- 'QMCEngine', 'Sobol', 'Halton', 'LatinHypercube', 'PoissonDisk',
- 'MultinomialQMC', 'MultivariateNormalQMC']
- @overload
- def check_random_state(seed: IntNumber | None = ...) -> np.random.Generator:
- ...
- @overload
- def check_random_state(seed: GeneratorType) -> GeneratorType:
- ...
- # Based on scipy._lib._util.check_random_state
- # This is going to be removed at the end of the SPEC 7 transition,
- # so I'll just leave the argument name `seed` alone
- def check_random_state(seed=None):
- """Turn `seed` into a `numpy.random.Generator` instance.
- Parameters
- ----------
- seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
- If `seed` is an int or None, a new `numpy.random.Generator` is
- created using ``np.random.default_rng(seed)``.
- If `seed` is already a ``Generator`` or ``RandomState`` instance, then
- the provided instance is used.
- Returns
- -------
- seed : {`numpy.random.Generator`, `numpy.random.RandomState`}
- Random number generator.
- """
- if seed is None or isinstance(seed, numbers.Integral | np.integer):
- return np.random.default_rng(seed)
- elif isinstance(seed, np.random.RandomState | np.random.Generator):
- return seed
- else:
- raise ValueError(f'{seed!r} cannot be used to seed a'
- ' numpy.random.Generator instance')
- def scale(
- sample: "npt.ArrayLike",
- l_bounds: "npt.ArrayLike",
- u_bounds: "npt.ArrayLike",
- *,
- reverse: bool = False
- ) -> np.ndarray:
- r"""Sample scaling from unit hypercube to different bounds.
- To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`,
- with :math:`a` the lower bounds and :math:`b` the upper bounds.
- The following transformation is used:
- .. math::
- (b - a) \cdot \text{sample} + a
- Parameters
- ----------
- sample : array_like (n, d)
- Sample to scale.
- l_bounds, u_bounds : array_like (d,)
- Lower and upper bounds (resp. :math:`a`, :math:`b`) of transformed
- data. If `reverse` is True, range of the original data to transform
- to the unit hypercube.
- reverse : bool, optional
- Reverse the transformation from different bounds to the unit hypercube.
- Default is False.
- Returns
- -------
- sample : array_like (n, d)
- Scaled sample.
- Examples
- --------
- Transform 3 samples in the unit hypercube to bounds:
- >>> from scipy.stats import qmc
- >>> l_bounds = [-2, 0]
- >>> u_bounds = [6, 5]
- >>> sample = [[0.5 , 0.75],
- ... [0.5 , 0.5],
- ... [0.75, 0.25]]
- >>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds)
- >>> sample_scaled
- array([[2. , 3.75],
- [2. , 2.5 ],
- [4. , 1.25]])
- And convert back to the unit hypercube:
- >>> sample_ = qmc.scale(sample_scaled, l_bounds, u_bounds, reverse=True)
- >>> sample_
- array([[0.5 , 0.75],
- [0.5 , 0.5 ],
- [0.75, 0.25]])
- """
- sample = np.asarray(sample)
- # Checking bounds and sample
- if not sample.ndim == 2:
- raise ValueError('Sample is not a 2D array')
- lower, upper = _validate_bounds(
- l_bounds=l_bounds, u_bounds=u_bounds, d=sample.shape[1]
- )
- if not reverse:
- # Checking that sample is within the hypercube
- if (sample.max() > 1.) or (sample.min() < 0.):
- raise ValueError('Sample is not in unit hypercube')
- return sample * (upper - lower) + lower
- else:
- # Checking that sample is within the bounds
- if not (np.all(sample >= lower) and np.all(sample <= upper)):
- raise ValueError('Sample is out of bounds')
- return (sample - lower) / (upper - lower)
- def _ensure_in_unit_hypercube(sample: "npt.ArrayLike") -> np.ndarray:
- """Ensure that sample is a 2D array and is within a unit hypercube
- Parameters
- ----------
- sample : array_like (n, d)
- A 2D array of points.
- Returns
- -------
- np.ndarray
- The array interpretation of the input sample
- Raises
- ------
- ValueError
- If the input is not a 2D array or contains points outside of
- a unit hypercube.
- """
- sample = np.asarray(sample, dtype=np.float64, order="C")
- if not sample.ndim == 2:
- raise ValueError("Sample is not a 2D array")
- if (sample.max() > 1.) or (sample.min() < 0.):
- raise ValueError("Sample is not in unit hypercube")
- return sample
- def discrepancy(
- sample: "npt.ArrayLike",
- *,
- iterative: bool = False,
- method: Literal["CD", "WD", "MD", "L2-star"] = "CD",
- workers: IntNumber = 1) -> float:
- """Discrepancy of a given sample.
- Parameters
- ----------
- sample : array_like (n, d)
- The sample to compute the discrepancy from.
- iterative : bool, optional
- Must be False if not using it for updating the discrepancy.
- Default is False. Refer to the notes for more details.
- method : str, optional
- Type of discrepancy, can be ``CD``, ``WD``, ``MD`` or ``L2-star``.
- Refer to the notes for more details. Default is ``CD``.
- workers : int, optional
- Number of workers to use for parallel processing. If -1 is given all
- CPU threads are used. Default is 1.
- Returns
- -------
- discrepancy : float
- Discrepancy.
- See Also
- --------
- geometric_discrepancy
- Notes
- -----
- The discrepancy is a uniformity criterion used to assess the space filling
- of a number of samples in a hypercube. A discrepancy quantifies the
- distance between the continuous uniform distribution on a hypercube and the
- discrete uniform distribution on :math:`n` distinct sample points.
- The lower the value is, the better the coverage of the parameter space is.
- For a collection of subsets of the hypercube, the discrepancy is the
- difference between the fraction of sample points in one of those
- subsets and the volume of that subset. There are different definitions of
- discrepancy corresponding to different collections of subsets. Some
- versions take a root mean square difference over subsets instead of
- a maximum.
- A measure of uniformity is reasonable if it satisfies the following
- criteria [1]_:
- 1. It is invariant under permuting factors and/or runs.
- 2. It is invariant under rotation of the coordinates.
- 3. It can measure not only uniformity of the sample over the hypercube,
- but also the projection uniformity of the sample over non-empty
- subset of lower dimension hypercubes.
- 4. There is some reasonable geometric meaning.
- 5. It is easy to compute.
- 6. It satisfies the Koksma-Hlawka-like inequality.
- 7. It is consistent with other criteria in experimental design.
- Four methods are available:
- * ``CD``: Centered Discrepancy - subspace involves a corner of the
- hypercube
- * ``WD``: Wrap-around Discrepancy - subspace can wrap around bounds
- * ``MD``: Mixture Discrepancy - mix between CD/WD covering more criteria
- * ``L2-star``: L2-star discrepancy - like CD BUT variant to rotation
- Methods ``CD``, ``WD``, and ``MD`` implement the right hand side of equations
- 9, 10, and 18 of [2]_, respectively; the square root is not taken. On the
- other hand, ``L2-star`` computes the quantity given by equation 10 of
- [3]_ as implemented by subsequent equations; the square root is taken.
- Lastly, using ``iterative=True``, it is possible to compute the
- discrepancy as if we had :math:`n+1` samples. This is useful if we want
- to add a point to a sampling and check the candidate which would give the
- lowest discrepancy. Then you could just update the discrepancy with
- each candidate using `update_discrepancy`. This method is faster than
- computing the discrepancy for a large number of candidates.
- References
- ----------
- .. [1] Fang et al. "Design and modeling for computer experiments".
- Computer Science and Data Analysis Series, 2006.
- .. [2] Zhou Y.-D. et al. "Mixture discrepancy for quasi-random point sets."
- Journal of Complexity, 29 (3-4) , pp. 283-301, 2013.
- .. [3] T. T. Warnock. "Computational investigations of low discrepancy
- point sets." Applications of Number Theory to Numerical
- Analysis, Academic Press, pp. 319-343, 1972.
- Examples
- --------
- Calculate the quality of the sample using the discrepancy:
- >>> import numpy as np
- >>> from scipy.stats import qmc
- >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])
- >>> l_bounds = [0.5, 0.5]
- >>> u_bounds = [6.5, 6.5]
- >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)
- >>> space
- array([[0.08333333, 0.41666667],
- [0.25 , 0.91666667],
- [0.41666667, 0.25 ],
- [0.58333333, 0.75 ],
- [0.75 , 0.08333333],
- [0.91666667, 0.58333333]])
- >>> qmc.discrepancy(space)
- 0.008142039609053464
- We can also compute iteratively the ``CD`` discrepancy by using
- ``iterative=True``.
- >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)
- >>> disc_init
- 0.04769081147119336
- >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)
- 0.008142039609053513
- """
- sample = _ensure_in_unit_hypercube(sample)
- workers = _validate_workers(workers)
- methods = {
- "CD": _cy_wrapper_centered_discrepancy,
- "WD": _cy_wrapper_wrap_around_discrepancy,
- "MD": _cy_wrapper_mixture_discrepancy,
- "L2-star": _cy_wrapper_l2_star_discrepancy,
- }
- if method in methods:
- return methods[method](sample, iterative, workers=workers)
- else:
- raise ValueError(f"{method!r} is not a valid method. It must be one of"
- f" {set(methods)!r}")
- def geometric_discrepancy(
- sample: "npt.ArrayLike",
- method: Literal["mindist", "mst"] = "mindist",
- metric: str = "euclidean") -> float:
- """Discrepancy of a given sample based on its geometric properties.
- Parameters
- ----------
- sample : array_like (n, d)
- The sample to compute the discrepancy from.
- method : {"mindist", "mst"}, optional
- The method to use. One of ``mindist`` for minimum distance (default)
- or ``mst`` for minimum spanning tree.
- metric : str or callable, optional
- The distance metric to use. See the documentation
- for `scipy.spatial.distance.pdist` for the available metrics and
- the default.
- Returns
- -------
- discrepancy : float
- Discrepancy (higher values correspond to greater sample uniformity).
- See Also
- --------
- discrepancy
- Notes
- -----
- The discrepancy can serve as a simple measure of quality of a random sample.
- This measure is based on the geometric properties of the distribution of points
- in the sample, such as the minimum distance between any pair of points, or
- the mean edge length in a minimum spanning tree.
- The higher the value is, the better the coverage of the parameter space is.
- Note that this is different from `scipy.stats.qmc.discrepancy`, where lower
- values correspond to higher quality of the sample.
- Also note that when comparing different sampling strategies using this function,
- the sample size must be kept constant.
- It is possible to calculate two metrics from the minimum spanning tree:
- the mean edge length and the standard deviation of edges lengths. Using
- both metrics offers a better picture of uniformity than either metric alone,
- with higher mean and lower standard deviation being preferable (see [1]_
- for a brief discussion). This function currently only calculates the mean
- edge length.
- References
- ----------
- .. [1] Franco J. et al. "Minimum Spanning Tree: A new approach to assess the quality
- of the design of computer experiments." Chemometrics and Intelligent Laboratory
- Systems, 97 (2), pp. 164-169, 2009.
- Examples
- --------
- Calculate the quality of the sample using the minimum euclidean distance
- (the defaults):
- >>> import numpy as np
- >>> from scipy.stats import qmc
- >>> rng = np.random.default_rng(191468432622931918890291693003068437394)
- >>> sample = qmc.LatinHypercube(d=2, rng=rng).random(50)
- >>> qmc.geometric_discrepancy(sample)
- 0.03708161435687876
- Calculate the quality using the mean edge length in the minimum
- spanning tree:
- >>> qmc.geometric_discrepancy(sample, method='mst')
- 0.1105149978798376
- Display the minimum spanning tree and the points with
- the smallest distance:
- >>> import matplotlib.pyplot as plt
- >>> from matplotlib.lines import Line2D
- >>> from scipy.sparse.csgraph import minimum_spanning_tree
- >>> from scipy.spatial.distance import pdist, squareform
- >>> dist = pdist(sample)
- >>> mst = minimum_spanning_tree(squareform(dist))
- >>> edges = np.where(mst.toarray() > 0)
- >>> edges = np.asarray(edges).T
- >>> min_dist = np.min(dist)
- >>> min_idx = np.argwhere(squareform(dist) == min_dist)[0]
- >>> fig, ax = plt.subplots(figsize=(10, 5))
- >>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$',
- ... xlim=[0, 1], ylim=[0, 1])
- >>> for edge in edges:
- ... ax.plot(sample[edge, 0], sample[edge, 1], c='k')
- >>> ax.scatter(sample[:, 0], sample[:, 1])
- >>> ax.add_patch(plt.Circle(sample[min_idx[0]], min_dist, color='red', fill=False))
- >>> markers = [
- ... Line2D([0], [0], marker='o', lw=0, label='Sample points'),
- ... Line2D([0], [0], color='k', label='Minimum spanning tree'),
- ... Line2D([0], [0], marker='o', lw=0, markerfacecolor='w', markeredgecolor='r',
- ... label='Minimum point-to-point distance'),
- ... ]
- >>> ax.legend(handles=markers, loc='center left', bbox_to_anchor=(1, 0.5));
- >>> plt.show()
- """
- sample = _ensure_in_unit_hypercube(sample)
- if sample.shape[0] < 2:
- raise ValueError("Sample must contain at least two points")
- distances = distance.pdist(sample, metric=metric) # type: ignore[call-overload]
- if np.any(distances == 0.0):
- warnings.warn("Sample contains duplicate points.", stacklevel=2)
- if method == "mindist":
- return np.min(distances[distances.nonzero()])
- elif method == "mst":
- fully_connected_graph = distance.squareform(distances)
- mst = minimum_spanning_tree(fully_connected_graph)
- distances = mst[mst.nonzero()]
- # TODO consider returning both the mean and the standard deviation
- # see [1] for a discussion
- return np.mean(distances)
- else:
- raise ValueError(f"{method!r} is not a valid method. "
- f"It must be one of {{'mindist', 'mst'}}")
- def update_discrepancy(
- x_new: "npt.ArrayLike",
- sample: "npt.ArrayLike",
- initial_disc: DecimalNumber) -> float:
- """Update the centered discrepancy with a new sample.
- Parameters
- ----------
- x_new : array_like (1, d)
- The new sample to add in `sample`.
- sample : array_like (n, d)
- The initial sample.
- initial_disc : float
- Centered discrepancy of the `sample`.
- Returns
- -------
- discrepancy : float
- Centered discrepancy of the sample composed of `x_new` and `sample`.
- Examples
- --------
- We can also compute iteratively the discrepancy by using
- ``iterative=True``.
- >>> import numpy as np
- >>> from scipy.stats import qmc
- >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])
- >>> l_bounds = [0.5, 0.5]
- >>> u_bounds = [6.5, 6.5]
- >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)
- >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)
- >>> disc_init
- 0.04769081147119336
- >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)
- 0.008142039609053513
- """
- sample = np.asarray(sample, dtype=np.float64, order="C")
- x_new = np.asarray(x_new, dtype=np.float64, order="C")
- # Checking that sample is within the hypercube and 2D
- if not sample.ndim == 2:
- raise ValueError('Sample is not a 2D array')
- if (sample.max() > 1.) or (sample.min() < 0.):
- raise ValueError('Sample is not in unit hypercube')
- # Checking that x_new is within the hypercube and 1D
- if not x_new.ndim == 1:
- raise ValueError('x_new is not a 1D array')
- if not (np.all(x_new >= 0) and np.all(x_new <= 1)):
- raise ValueError('x_new is not in unit hypercube')
- if x_new.shape[0] != sample.shape[1]:
- raise ValueError("x_new and sample must be broadcastable")
- return _cy_wrapper_update_discrepancy(x_new, sample, initial_disc)
- def _perturb_discrepancy(sample: np.ndarray, i1: int, i2: int, k: int,
- disc: float):
- """Centered discrepancy after an elementary perturbation of a LHS.
- An elementary perturbation consists of an exchange of coordinates between
- two points: ``sample[i1, k] <-> sample[i2, k]``. By construction,
- this operation conserves the LHS properties.
- Parameters
- ----------
- sample : array_like (n, d)
- The sample (before permutation) to compute the discrepancy from.
- i1 : int
- The first line of the elementary permutation.
- i2 : int
- The second line of the elementary permutation.
- k : int
- The column of the elementary permutation.
- disc : float
- Centered discrepancy of the design before permutation.
- Returns
- -------
- discrepancy : float
- Centered discrepancy of the design after permutation.
- References
- ----------
- .. [1] Jin et al. "An efficient algorithm for constructing optimal design
- of computer experiments", Journal of Statistical Planning and
- Inference, 2005.
- """
- n = sample.shape[0]
- z_ij = sample - 0.5
- # Eq (19)
- c_i1j = (1. / n ** 2.
- * np.prod(0.5 * (2. + abs(z_ij[i1, :])
- + abs(z_ij) - abs(z_ij[i1, :] - z_ij)), axis=1))
- c_i2j = (1. / n ** 2.
- * np.prod(0.5 * (2. + abs(z_ij[i2, :])
- + abs(z_ij) - abs(z_ij[i2, :] - z_ij)), axis=1))
- # Eq (20)
- c_i1i1 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i1, :]))
- - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i1, :])
- - 0.5 * z_ij[i1, :] ** 2))
- c_i2i2 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i2, :]))
- - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i2, :])
- - 0.5 * z_ij[i2, :] ** 2))
- # Eq (22), typo in the article in the denominator i2 -> i1
- num = (2 + abs(z_ij[i2, k]) + abs(z_ij[:, k])
- - abs(z_ij[i2, k] - z_ij[:, k]))
- denum = (2 + abs(z_ij[i1, k]) + abs(z_ij[:, k])
- - abs(z_ij[i1, k] - z_ij[:, k]))
- gamma = num / denum
- # Eq (23)
- c_p_i1j = gamma * c_i1j
- # Eq (24)
- c_p_i2j = c_i2j / gamma
- alpha = (1 + abs(z_ij[i2, k])) / (1 + abs(z_ij[i1, k]))
- beta = (2 - abs(z_ij[i2, k])) / (2 - abs(z_ij[i1, k]))
- g_i1 = np.prod(1. + abs(z_ij[i1, :]))
- g_i2 = np.prod(1. + abs(z_ij[i2, :]))
- h_i1 = np.prod(1. + 0.5 * abs(z_ij[i1, :]) - 0.5 * (z_ij[i1, :] ** 2))
- h_i2 = np.prod(1. + 0.5 * abs(z_ij[i2, :]) - 0.5 * (z_ij[i2, :] ** 2))
- # Eq (25), typo in the article g is missing
- c_p_i1i1 = ((g_i1 * alpha) / (n ** 2) - 2. * alpha * beta * h_i1 / n)
- # Eq (26), typo in the article n ** 2
- c_p_i2i2 = ((g_i2 / ((n ** 2) * alpha)) - (2. * h_i2 / (n * alpha * beta)))
- # Eq (26)
- sum_ = c_p_i1j - c_i1j + c_p_i2j - c_i2j
- mask = np.ones(n, dtype=bool)
- mask[[i1, i2]] = False
- sum_ = sum(sum_[mask])
- disc_ep = (disc + c_p_i1i1 - c_i1i1 + c_p_i2i2 - c_i2i2 + 2 * sum_)
- return disc_ep
- def primes_from_2_to(n: int) -> np.ndarray:
- """Prime numbers from 2 to *n*.
- Parameters
- ----------
- n : int
- Sup bound with ``n >= 6``.
- Returns
- -------
- primes : list(int)
- Primes in ``2 <= p < n``.
- Notes
- -----
- Taken from [1]_ by P.T. Roy, written consent given on 23.04.2021
- by the original author, Bruno Astrolino, for free use in SciPy under
- the 3-clause BSD.
- References
- ----------
- .. [1] `StackOverflow <https://stackoverflow.com/questions/2068372>`_.
- """
- sieve = np.ones(n // 3 + (n % 6 == 2), dtype=bool)
- for i in range(1, int(n ** 0.5) // 3 + 1):
- k = 3 * i + 1 | 1
- sieve[k * k // 3::2 * k] = False
- sieve[k * (k - 2 * (i & 1) + 4) // 3::2 * k] = False
- return np.r_[2, 3, ((3 * np.nonzero(sieve)[0][1:] + 1) | 1)]
- def n_primes(n: IntNumber) -> list[int]:
- """List of the n-first prime numbers.
- Parameters
- ----------
- n : int
- Number of prime numbers wanted.
- Returns
- -------
- primes : list(int)
- List of primes.
- """
- primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
- 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
- 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193,
- 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269,
- 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
- 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431,
- 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,
- 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599,
- 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673,
- 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761,
- 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857,
- 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947,
- 953, 967, 971, 977, 983, 991, 997][:n]
- if len(primes) < n:
- big_number = 2000
- while 'Not enough primes':
- primes = primes_from_2_to(big_number)[:n] # type: ignore
- if len(primes) == n:
- break
- big_number += 1000
- return primes
- def _van_der_corput_permutations(
- base: IntNumber, *, rng: SeedType = None
- ) -> np.ndarray:
- """Permutations for scrambling a Van der Corput sequence.
- Parameters
- ----------
- base : int
- Base of the sequence.
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. During the transition, the behavior documented above is not
- accurate; see `check_random_state` for actual behavior. After the
- transition, this admonition can be removed.
- Returns
- -------
- permutations : array_like
- Permutation indices.
- Notes
- -----
- In Algorithm 1 of Owen 2017, a permutation of `np.arange(base)` is
- created for each positive integer `k` such that ``1 - base**-k < 1``
- using floating-point arithmetic. For double precision floats, the
- condition ``1 - base**-k < 1`` can also be written as ``base**-k >
- 2**-54``, which makes it more apparent how many permutations we need
- to create.
- """
- rng = check_random_state(rng)
- count = math.ceil(54 / math.log2(base)) - 1
- permutations = np.repeat(np.arange(base)[None], count, axis=0)
- for perm in permutations:
- rng.shuffle(perm)
- return permutations
- def van_der_corput(
- n: IntNumber,
- base: IntNumber = 2,
- *,
- start_index: IntNumber = 0,
- scramble: bool = False,
- permutations: "npt.ArrayLike | None" = None,
- rng: SeedType = None,
- workers: IntNumber = 1) -> np.ndarray:
- """Van der Corput sequence.
- Pseudo-random number generator based on a b-adic expansion.
- Scrambling uses permutations of the remainders (see [1]_). Multiple
- permutations are applied to construct a point. The sequence of
- permutations has to be the same for all points of the sequence.
- Parameters
- ----------
- n : int
- Number of element of the sequence.
- base : int, optional
- Base of the sequence. Default is 2.
- start_index : int, optional
- Index to start the sequence from. Default is 0.
- scramble : bool, optional
- If True, use Owen scrambling. Otherwise no scrambling is done.
- Default is True.
- permutations : array_like, optional
- Permutations used for scrambling.
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- workers : int, optional
- Number of workers to use for parallel processing. If -1 is
- given all CPU threads are used. Default is 1.
- Returns
- -------
- sequence : list (n,)
- Sequence of Van der Corput.
- References
- ----------
- .. [1] A. B. Owen. "A randomized Halton algorithm in R",
- :arxiv:`1706.02808`, 2017.
- """
- if base < 2:
- raise ValueError("'base' must be at least 2")
- if scramble:
- if permutations is None:
- permutations = _van_der_corput_permutations(
- base=base, rng=rng
- )
- else:
- permutations = np.asarray(permutations)
- permutations = permutations.astype(np.int64)
- return _cy_van_der_corput_scrambled(n, base, start_index,
- permutations, workers)
- else:
- return _cy_van_der_corput(n, base, start_index, workers)
- class QMCEngine(ABC):
- """A generic Quasi-Monte Carlo sampler class meant for subclassing.
- QMCEngine is a base class to construct a specific Quasi-Monte Carlo
- sampler. It cannot be used directly as a sampler.
- Parameters
- ----------
- d : int
- Dimension of the parameter space.
- optimization : {None, "random-cd", "lloyd"}, optional
- Whether to use an optimization scheme to improve the quality after
- sampling. Note that this is a post-processing step that does not
- guarantee that all properties of the sample will be conserved.
- Default is None.
- * ``random-cd``: random permutations of coordinates to lower the
- centered discrepancy. The best sample based on the centered
- discrepancy is constantly updated. Centered discrepancy-based
- sampling shows better space-filling robustness toward 2D and 3D
- subprojections compared to using other discrepancy measures.
- * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
- The process converges to equally spaced samples.
- .. versionadded:: 1.10.0
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. For an interim period, both keywords will continue to work, although
- only one may be specified at a time. After the interim period, function
- calls using the `seed` keyword will emit warnings. Following a
- deprecation period, the `seed` keyword will be removed.
- Notes
- -----
- By convention samples are distributed over the half-open interval
- ``[0, 1)``. Instances of the class can access the attributes: ``d`` for
- the dimension; and ``rng`` for the random number generator.
- **Subclassing**
- When subclassing `QMCEngine` to create a new sampler, ``__init__`` and
- ``random`` must be redefined.
- * ``__init__(d, rng=None)``: at least fix the dimension. If the sampler
- does not take advantage of a ``rng`` (deterministic methods like
- Halton), this parameter can be omitted.
- * ``_random(n, *, workers=1)``: draw ``n`` from the engine. ``workers``
- is used for parallelism. See `Halton` for example.
- Optionally, two other methods can be overwritten by subclasses:
- * ``reset``: Reset the engine to its original state.
- * ``fast_forward``: If the sequence is deterministic (like Halton
- sequence), then ``fast_forward(n)`` is skipping the ``n`` first draw.
- Examples
- --------
- To create a random sampler based on ``np.random.random``, we would do the
- following:
- >>> from scipy.stats import qmc
- >>> class RandomEngine(qmc.QMCEngine):
- ... def __init__(self, d, rng=None):
- ... super().__init__(d=d, rng=rng)
- ...
- ...
- ... def _random(self, n=1, *, workers=1):
- ... return self.rng.random((n, self.d))
- ...
- ...
- ... def reset(self):
- ... super().__init__(d=self.d, rng=self.rng_seed)
- ... return self
- ...
- ...
- ... def fast_forward(self, n):
- ... self.random(n)
- ... return self
- After subclassing `QMCEngine` to define the sampling strategy we want to
- use, we can create an instance to sample from.
- >>> engine = RandomEngine(2)
- >>> engine.random(5)
- array([[0.22733602, 0.31675834], # random
- [0.79736546, 0.67625467],
- [0.39110955, 0.33281393],
- [0.59830875, 0.18673419],
- [0.67275604, 0.94180287]])
- We can also reset the state of the generator and resample again.
- >>> _ = engine.reset()
- >>> engine.random(5)
- array([[0.22733602, 0.31675834], # random
- [0.79736546, 0.67625467],
- [0.39110955, 0.33281393],
- [0.59830875, 0.18673419],
- [0.67275604, 0.94180287]])
- """
- @abstractmethod
- @_transition_to_rng('seed', replace_doc=False)
- def __init__(
- self,
- d: IntNumber,
- *,
- optimization: Literal["random-cd", "lloyd"] | None = None,
- rng: SeedType = None
- ) -> None:
- self._initialize(d, optimization=optimization, rng=rng)
- # During SPEC 7 transition:
- # `__init__` has to be wrapped with @_transition_to_rng decorator
- # because it is public. Subclasses previously called `__init__`
- # directly, but this was problematic because arguments passed to
- # subclass `__init__` as `seed` would get passed to superclass
- # `__init__` as `rng`, rejecting `RandomState` arguments.
- def _initialize(
- self,
- d: IntNumber,
- *,
- optimization: Literal["random-cd", "lloyd"] | None = None,
- rng: SeedType = None
- ) -> None:
- if not np.issubdtype(type(d), np.integer) or d < 0:
- raise ValueError('d must be a non-negative integer value')
- self.d = d
- if isinstance(rng, np.random.Generator):
- # Spawn a Generator that we can own and reset.
- self.rng = _rng_spawn(rng, 1)[0]
- else:
- # Create our instance of Generator, does not need spawning
- # Also catch RandomState which cannot be spawned
- self.rng = check_random_state(rng)
- self.rng_seed = copy.deepcopy(self.rng)
- self.num_generated = 0
- config = {
- # random-cd
- "n_nochange": 100,
- "n_iters": 10_000,
- "rng": self.rng,
- # lloyd
- "tol": 1e-5,
- "maxiter": 10,
- "qhull_options": None,
- }
- self._optimization = optimization
- self.optimization_method = _select_optimizer(optimization, config)
- @abstractmethod
- def _random(
- self, n: IntNumber = 1, *, workers: IntNumber = 1
- ) -> np.ndarray:
- ...
- def random(
- self, n: IntNumber = 1, *, workers: IntNumber = 1
- ) -> np.ndarray:
- """Draw `n` in the half-open interval ``[0, 1)``.
- Parameters
- ----------
- n : int, optional
- Number of samples to generate in the parameter space.
- Default is 1.
- workers : int, optional
- Only supported with `Halton`.
- Number of workers to use for parallel processing. If -1 is
- given all CPU threads are used. Default is 1. It becomes faster
- than one worker for `n` greater than :math:`10^3`.
- Returns
- -------
- sample : array_like (n, d)
- QMC sample.
- """
- sample = self._random(n, workers=workers)
- if self.optimization_method is not None:
- sample = self.optimization_method(sample)
- self.num_generated += n
- return sample
- def integers(
- self,
- l_bounds: "npt.ArrayLike",
- *,
- u_bounds: "npt.ArrayLike | None" = None,
- n: IntNumber = 1,
- endpoint: bool = False,
- workers: IntNumber = 1
- ) -> np.ndarray:
- r"""
- Draw `n` integers from `l_bounds` (inclusive) to `u_bounds`
- (exclusive), or if endpoint=True, `l_bounds` (inclusive) to
- `u_bounds` (inclusive).
- Parameters
- ----------
- l_bounds : int or array-like of ints
- Lowest (signed) integers to be drawn (unless ``u_bounds=None``,
- in which case this parameter is 0 and this value is used for
- `u_bounds`).
- u_bounds : int or array-like of ints, optional
- If provided, one above the largest (signed) integer to be drawn
- (see above for behavior if ``u_bounds=None``).
- If array-like, must contain integer values.
- n : int, optional
- Number of samples to generate in the parameter space.
- Default is 1.
- endpoint : bool, optional
- If true, sample from the interval ``[l_bounds, u_bounds]`` instead
- of the default ``[l_bounds, u_bounds)``. Defaults is False.
- workers : int, optional
- Number of workers to use for parallel processing. If -1 is
- given all CPU threads are used. Only supported when using `Halton`
- Default is 1.
- Returns
- -------
- sample : array_like (n, d)
- QMC sample.
- Notes
- -----
- It is safe to just use the same ``[0, 1)`` to integer mapping
- with QMC that you would use with MC. You still get unbiasedness,
- a strong law of large numbers, an asymptotically infinite variance
- reduction and a finite sample variance bound.
- To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`,
- with :math:`a` the lower bounds and :math:`b` the upper bounds,
- the following transformation is used:
- .. math::
- \text{floor}((b - a) \cdot \text{sample} + a)
- """
- if u_bounds is None:
- u_bounds = l_bounds
- l_bounds = 0
- u_bounds = np.atleast_1d(u_bounds)
- l_bounds = np.atleast_1d(l_bounds)
- if endpoint:
- u_bounds = u_bounds + 1
- if (not np.issubdtype(l_bounds.dtype, np.integer) or
- not np.issubdtype(u_bounds.dtype, np.integer)):
- message = ("'u_bounds' and 'l_bounds' must be integers or"
- " array-like of integers")
- raise ValueError(message)
- if isinstance(self, Halton):
- sample = self.random(n=n, workers=workers)
- else:
- sample = self.random(n=n)
- sample = scale(sample, l_bounds=l_bounds, u_bounds=u_bounds)
- sample = np.floor(sample).astype(np.int64)
- return sample
- def reset(self) -> "QMCEngine":
- """Reset the engine to base state.
- Returns
- -------
- engine : QMCEngine
- Engine reset to its base state.
- """
- rng = copy.deepcopy(self.rng_seed)
- self.rng = check_random_state(rng)
- self.num_generated = 0
- return self
- def fast_forward(self, n: IntNumber) -> "QMCEngine":
- """Fast-forward the sequence by `n` positions.
- Parameters
- ----------
- n : int
- Number of points to skip in the sequence.
- Returns
- -------
- engine : QMCEngine
- Engine reset to its base state.
- """
- self.random(n=n)
- return self
- class Halton(QMCEngine):
- """Halton sequence.
- Pseudo-random number generator that generalize the Van der Corput sequence
- for multiple dimensions. The Halton sequence uses the base-two Van der
- Corput sequence for the first dimension, base-three for its second and
- base-:math:`p` for its :math:`n`-dimension, with :math:`p` the
- :math:`n`'th prime.
- Parameters
- ----------
- d : int
- Dimension of the parameter space.
- scramble : bool, optional
- If True, use random scrambling from [2]_. Otherwise no scrambling
- is done.
- Default is True.
- optimization : {None, "random-cd", "lloyd"}, optional
- Whether to use an optimization scheme to improve the quality after
- sampling. Note that this is a post-processing step that does not
- guarantee that all properties of the sample will be conserved.
- Default is None.
- * ``random-cd``: random permutations of coordinates to lower the
- centered discrepancy. The best sample based on the centered
- discrepancy is constantly updated. Centered discrepancy-based
- sampling shows better space-filling robustness toward 2D and 3D
- subprojections compared to using other discrepancy measures.
- * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
- The process converges to equally spaced samples.
- .. versionadded:: 1.10.0
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. For an interim period, both keywords will continue to work, although
- only one may be specified at a time. After the interim period, function
- calls using the `seed` keyword will emit warnings. Following a
- deprecation period, the `seed` keyword will be removed.
- Notes
- -----
- The Halton sequence has severe striping artifacts for even modestly
- large dimensions. These can be ameliorated by scrambling. Scrambling
- also supports replication-based error estimates and extends
- applicability to unbounded integrands.
- References
- ----------
- .. [1] Halton, "On the efficiency of certain quasi-random sequences of
- points in evaluating multi-dimensional integrals", Numerische
- Mathematik, 1960.
- .. [2] A. B. Owen. "A randomized Halton algorithm in R",
- :arxiv:`1706.02808`, 2017.
- Examples
- --------
- Generate samples from a low discrepancy sequence of Halton.
- >>> from scipy.stats import qmc
- >>> sampler = qmc.Halton(d=2, scramble=False)
- >>> sample = sampler.random(n=5)
- >>> sample
- array([[0. , 0. ],
- [0.5 , 0.33333333],
- [0.25 , 0.66666667],
- [0.75 , 0.11111111],
- [0.125 , 0.44444444]])
- Compute the quality of the sample using the discrepancy criterion.
- >>> qmc.discrepancy(sample)
- 0.088893711419753
- If some wants to continue an existing design, extra points can be obtained
- by calling again `random`. Alternatively, you can skip some points like:
- >>> _ = sampler.fast_forward(5)
- >>> sample_continued = sampler.random(n=5)
- >>> sample_continued
- array([[0.3125 , 0.37037037],
- [0.8125 , 0.7037037 ],
- [0.1875 , 0.14814815],
- [0.6875 , 0.48148148],
- [0.4375 , 0.81481481]])
- Finally, samples can be scaled to bounds.
- >>> l_bounds = [0, 2]
- >>> u_bounds = [10, 5]
- >>> qmc.scale(sample_continued, l_bounds, u_bounds)
- array([[3.125 , 3.11111111],
- [8.125 , 4.11111111],
- [1.875 , 2.44444444],
- [6.875 , 3.44444444],
- [4.375 , 4.44444444]])
- """
- @_transition_to_rng('seed', replace_doc=False)
- def __init__(
- self, d: IntNumber, *, scramble: bool = True,
- optimization: Literal["random-cd", "lloyd"] | None = None,
- rng: SeedType = None
- ) -> None:
- # Used in `scipy.integrate.qmc_quad`
- self._init_quad = {'d': d, 'scramble': True,
- 'optimization': optimization}
- super()._initialize(d=d, optimization=optimization, rng=rng)
- # important to have ``type(bdim) == int`` for performance reason
- self.base = [int(bdim) for bdim in n_primes(d)]
- self.scramble = scramble
- self._initialize_permutations()
- def _initialize_permutations(self) -> None:
- """Initialize permutations for all Van der Corput sequences.
- Permutations are only needed for scrambling.
- """
- self._permutations: list = [None] * len(self.base)
- if self.scramble:
- for i, bdim in enumerate(self.base):
- permutations = _van_der_corput_permutations(
- base=bdim, rng=self.rng
- )
- self._permutations[i] = permutations
- def _random(
- self, n: IntNumber = 1, *, workers: IntNumber = 1
- ) -> np.ndarray:
- """Draw `n` in the half-open interval ``[0, 1)``.
- Parameters
- ----------
- n : int, optional
- Number of samples to generate in the parameter space. Default is 1.
- workers : int, optional
- Number of workers to use for parallel processing. If -1 is
- given all CPU threads are used. Default is 1. It becomes faster
- than one worker for `n` greater than :math:`10^3`.
- Returns
- -------
- sample : array_like (n, d)
- QMC sample.
- """
- workers = _validate_workers(workers)
- # Generate a sample using a Van der Corput sequence per dimension.
- sample = [van_der_corput(n, bdim, start_index=self.num_generated,
- scramble=self.scramble,
- permutations=self._permutations[i],
- workers=workers)
- for i, bdim in enumerate(self.base)]
- return np.array(sample).T.reshape(n, self.d)
- class LatinHypercube(QMCEngine):
- r"""Latin hypercube sampling (LHS).
- A Latin hypercube sample [1]_ generates :math:`n` points in
- :math:`[0,1)^{d}`. Each univariate marginal distribution is stratified,
- placing exactly one point in :math:`[j/n, (j+1)/n)` for
- :math:`j=0,1,...,n-1`. They are still applicable when :math:`n << d`.
- Parameters
- ----------
- d : int
- Dimension of the parameter space.
- scramble : bool, optional
- When False, center samples within cells of a multi-dimensional grid.
- Otherwise, samples are randomly placed within cells of the grid.
- .. note::
- Setting ``scramble=False`` does not ensure deterministic output.
- For that, use the `rng` parameter.
- Default is True.
- .. versionadded:: 1.10.0
- optimization : {None, "random-cd", "lloyd"}, optional
- Whether to use an optimization scheme to improve the quality after
- sampling. Note that this is a post-processing step that does not
- guarantee that all properties of the sample will be conserved.
- Default is None.
- * ``random-cd``: random permutations of coordinates to lower the
- centered discrepancy. The best sample based on the centered
- discrepancy is constantly updated. Centered discrepancy-based
- sampling shows better space-filling robustness toward 2D and 3D
- subprojections compared to using other discrepancy measures.
- * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
- The process converges to equally spaced samples.
- .. versionadded:: 1.8.0
- .. versionchanged:: 1.10.0
- Add ``lloyd``.
- strength : {1, 2}, optional
- Strength of the LHS. ``strength=1`` produces a plain LHS while
- ``strength=2`` produces an orthogonal array based LHS of strength 2
- [7]_, [8]_. In that case, only ``n=p**2`` points can be sampled,
- with ``p`` a prime number. It also constrains ``d <= p + 1``.
- Default is 1.
- .. versionadded:: 1.8.0
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. For an interim period, both keywords will continue to work, although
- only one may be specified at a time. After the interim period, function
- calls using the `seed` keyword will emit warnings. Following a
- deprecation period, the `seed` keyword will be removed.
- See Also
- --------
- :ref:`quasi-monte-carlo`
- Notes
- -----
- When LHS is used for integrating a function :math:`f` over :math:`n`,
- LHS is extremely effective on integrands that are nearly additive [2]_.
- With a LHS of :math:`n` points, the variance of the integral is always
- lower than plain MC on :math:`n-1` points [3]_. There is a central limit
- theorem for LHS on the mean and variance of the integral [4]_, but not
- necessarily for optimized LHS due to the randomization.
- :math:`A` is called an orthogonal array of strength :math:`t` if in each
- n-row-by-t-column submatrix of :math:`A`: all :math:`p^t` possible
- distinct rows occur the same number of times. The elements of :math:`A`
- are in the set :math:`\{0, 1, ..., p-1\}`, also called symbols.
- The constraint that :math:`p` must be a prime number is to allow modular
- arithmetic. Increasing strength adds some symmetry to the sub-projections
- of a sample. With strength 2, samples are symmetric along the diagonals of
- 2D sub-projections. This may be undesirable, but on the other hand, the
- sample dispersion is improved.
- Strength 1 (plain LHS) brings an advantage over strength 0 (MC) and
- strength 2 is a useful increment over strength 1. Going to strength 3 is
- a smaller increment and scrambled QMC like Sobol', Halton are more
- performant [7]_.
- To create a LHS of strength 2, the orthogonal array :math:`A` is
- randomized by applying a random, bijective map of the set of symbols onto
- itself. For example, in column 0, all 0s might become 2; in column 1,
- all 0s might become 1, etc.
- Then, for each column :math:`i` and symbol :math:`j`, we add a plain,
- one-dimensional LHS of size :math:`p` to the subarray where
- :math:`A^i = j`. The resulting matrix is finally divided by :math:`p`.
- References
- ----------
- .. [1] Mckay et al., "A Comparison of Three Methods for Selecting Values
- of Input Variables in the Analysis of Output from a Computer Code."
- Technometrics, 1979.
- .. [2] M. Stein, "Large sample properties of simulations using Latin
- hypercube sampling." Technometrics 29, no. 2: 143-151, 1987.
- .. [3] A. B. Owen, "Monte Carlo variance of scrambled net quadrature."
- SIAM Journal on Numerical Analysis 34, no. 5: 1884-1910, 1997
- .. [4] Loh, W.-L. "On Latin hypercube sampling." The annals of statistics
- 24, no. 5: 2058-2080, 1996.
- .. [5] Fang et al. "Design and modeling for computer experiments".
- Computer Science and Data Analysis Series, 2006.
- .. [6] Damblin et al., "Numerical studies of space filling designs:
- optimization of Latin Hypercube Samples and subprojection properties."
- Journal of Simulation, 2013.
- .. [7] A. B. Owen , "Orthogonal arrays for computer experiments,
- integration and visualization." Statistica Sinica, 1992.
- .. [8] B. Tang, "Orthogonal Array-Based Latin Hypercubes."
- Journal of the American Statistical Association, 1993.
- .. [9] Seaholm, Susan K. et al. (1988). Latin hypercube sampling and the
- sensitivity analysis of a Monte Carlo epidemic model. Int J Biomed
- Comput, 23(1-2), 97-112. :doi:`10.1016/0020-7101(88)90067-0`
- Examples
- --------
- Generate samples from a Latin hypercube generator.
- >>> from scipy.stats import qmc
- >>> sampler = qmc.LatinHypercube(d=2)
- >>> sample = sampler.random(n=5)
- >>> sample
- array([[0.1545328 , 0.53664833], # random
- [0.84052691, 0.06474907],
- [0.52177809, 0.93343721],
- [0.68033825, 0.36265316],
- [0.26544879, 0.61163943]])
- Compute the quality of the sample using the discrepancy criterion.
- >>> qmc.discrepancy(sample)
- 0.0196... # random
- Samples can be scaled to bounds.
- >>> l_bounds = [0, 2]
- >>> u_bounds = [10, 5]
- >>> qmc.scale(sample, l_bounds, u_bounds)
- array([[1.54532796, 3.609945 ], # random
- [8.40526909, 2.1942472 ],
- [5.2177809 , 4.80031164],
- [6.80338249, 3.08795949],
- [2.65448791, 3.83491828]])
- Below are other examples showing alternative ways to construct LHS with
- even better coverage of the space.
- Using a base LHS as a baseline.
- >>> sampler = qmc.LatinHypercube(d=2)
- >>> sample = sampler.random(n=5)
- >>> qmc.discrepancy(sample)
- 0.0196... # random
- Use the `optimization` keyword argument to produce a LHS with
- lower discrepancy at higher computational cost.
- >>> sampler = qmc.LatinHypercube(d=2, optimization="random-cd")
- >>> sample = sampler.random(n=5)
- >>> qmc.discrepancy(sample)
- 0.0176... # random
- Use the `strength` keyword argument to produce an orthogonal array based
- LHS of strength 2. In this case, the number of sample points must be the
- square of a prime number.
- >>> sampler = qmc.LatinHypercube(d=2, strength=2)
- >>> sample = sampler.random(n=9)
- >>> qmc.discrepancy(sample)
- 0.00526... # random
- Options could be combined to produce an optimized centered
- orthogonal array based LHS. After optimization, the result would not
- be guaranteed to be of strength 2.
- **Real-world example**
- In [9]_, a Latin Hypercube sampling (LHS) strategy was used to sample a
- parameter space to study the importance of each parameter of an epidemic
- model. Such analysis is also called a sensitivity analysis.
- Since the dimensionality of the problem is high (6), it is computationally
- expensive to cover the space. When numerical experiments are costly, QMC
- enables analysis that may not be possible if using a grid.
- The six parameters of the model represented the probability of illness,
- the probability of withdrawal, and four contact probabilities. The
- authors assumed uniform distributions for all parameters and generated
- 50 samples.
- Using `scipy.stats.qmc.LatinHypercube` to replicate the protocol,
- the first step is to create a sample in the unit hypercube:
- >>> from scipy.stats import qmc
- >>> sampler = qmc.LatinHypercube(d=6)
- >>> sample = sampler.random(n=50)
- Then the sample can be scaled to the appropriate bounds:
- >>> l_bounds = [0.000125, 0.01, 0.0025, 0.05, 0.47, 0.7]
- >>> u_bounds = [0.000375, 0.03, 0.0075, 0.15, 0.87, 0.9]
- >>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds)
- Such a sample was used to run the model 50 times, and a polynomial
- response surface was constructed. This allowed the authors to study the
- relative importance of each parameter across the range of possibilities
- of every other parameter.
- In this computer experiment, they showed a 14-fold reduction in the
- number of samples required to maintain an error below 2% on their
- response surface when compared to a grid sampling.
- """
- @_transition_to_rng('seed', replace_doc=False)
- def __init__(
- self, d: IntNumber, *,
- scramble: bool = True,
- strength: int = 1,
- optimization: Literal["random-cd", "lloyd"] | None = None,
- rng: SeedType = None
- ) -> None:
- # Used in `scipy.integrate.qmc_quad`
- self._init_quad = {'d': d, 'scramble': True, 'strength': strength,
- 'optimization': optimization}
- super()._initialize(d=d, rng=rng, optimization=optimization)
- self.scramble = scramble
- lhs_method_strength = {
- 1: self._random_lhs,
- 2: self._random_oa_lhs
- }
- try:
- self.lhs_method: Callable = lhs_method_strength[strength]
- except KeyError as exc:
- message = (f"{strength!r} is not a valid strength. It must be one"
- f" of {set(lhs_method_strength)!r}")
- raise ValueError(message) from exc
- def _random(
- self, n: IntNumber = 1, *, workers: IntNumber = 1
- ) -> np.ndarray:
- lhs = self.lhs_method(n)
- return lhs
- def _random_lhs(self, n: IntNumber = 1) -> np.ndarray:
- """Base LHS algorithm."""
- if not self.scramble:
- samples: np.ndarray | float = 0.5
- else:
- samples = self.rng.uniform(size=(n, self.d))
- perms = np.tile(np.arange(1, n + 1),
- (self.d, 1)) # type: ignore[arg-type]
- for i in range(self.d):
- self.rng.shuffle(perms[i, :])
- perms = perms.T
- samples = (perms - samples) / n
- return samples
- def _random_oa_lhs(self, n: IntNumber = 4) -> np.ndarray:
- """Orthogonal array based LHS of strength 2."""
- p = np.sqrt(n).astype(int)
- n_row = p**2
- n_col = p + 1
- primes = primes_from_2_to(p + 1)
- if p not in primes or n != n_row:
- raise ValueError(
- "n is not the square of a prime number. Close"
- f" values are {primes[-2:]**2}"
- )
- if self.d > p + 1:
- raise ValueError("n is too small for d. Must be n > (d-1)**2")
- oa_sample = np.zeros(shape=(n_row, n_col), dtype=int)
- # OA of strength 2
- arrays = np.tile(np.arange(p), (2, 1))
- oa_sample[:, :2] = np.stack(np.meshgrid(*arrays),
- axis=-1).reshape(-1, 2)
- for p_ in range(1, p):
- oa_sample[:, 2+p_-1] = np.mod(oa_sample[:, 0]
- + p_*oa_sample[:, 1], p)
- # scramble the OA
- oa_sample_ = np.empty(shape=(n_row, n_col), dtype=int)
- for j in range(n_col):
- perms = self.rng.permutation(p)
- oa_sample_[:, j] = perms[oa_sample[:, j]]
- oa_sample = oa_sample_
- # following is making a scrambled OA into an OA-LHS
- oa_lhs_sample = np.zeros(shape=(n_row, n_col))
- lhs_engine = LatinHypercube(d=1, scramble=self.scramble, strength=1,
- rng=self.rng) # type: QMCEngine
- for j in range(n_col):
- for k in range(p):
- idx = oa_sample[:, j] == k
- lhs = lhs_engine.random(p).flatten()
- oa_lhs_sample[:, j][idx] = lhs + oa_sample[:, j][idx]
- oa_lhs_sample /= p
- return oa_lhs_sample[:, :self.d]
- class Sobol(QMCEngine):
- """Engine for generating (scrambled) Sobol' sequences.
- Sobol' sequences are low-discrepancy, quasi-random numbers. Points
- can be drawn using two methods:
- * `random_base2`: safely draw :math:`n=2^m` points. This method
- guarantees the balance properties of the sequence.
- * `random`: draw an arbitrary number of points from the
- sequence. See warning below.
- Parameters
- ----------
- d : int
- Dimensionality of the sequence. Max dimensionality is 21201.
- scramble : bool, optional
- If True, use LMS+shift scrambling. Otherwise, no scrambling is done.
- Default is True.
- bits : int, optional
- Number of bits of the generator. Control the maximum number of points
- that can be generated, which is ``2**bits``. Maximal value is 64.
- It does not correspond to the return type, which is always
- ``np.float64`` to prevent points from repeating themselves.
- Default is None, which for backward compatibility, corresponds to 30.
- .. versionadded:: 1.9.0
- optimization : {None, "random-cd", "lloyd"}, optional
- Whether to use an optimization scheme to improve the quality after
- sampling. Note that this is a post-processing step that does not
- guarantee that all properties of the sample will be conserved.
- Default is None.
- * ``random-cd``: random permutations of coordinates to lower the
- centered discrepancy. The best sample based on the centered
- discrepancy is constantly updated. Centered discrepancy-based
- sampling shows better space-filling robustness toward 2D and 3D
- subprojections compared to using other discrepancy measures.
- * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
- The process converges to equally spaced samples.
- .. versionadded:: 1.10.0
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. For an interim period, both keywords will continue to work, although
- only one may be specified at a time. After the interim period, function
- calls using the `seed` keyword will emit warnings. Following a
- deprecation period, the `seed` keyword will be removed.
- Notes
- -----
- Sobol' sequences [1]_ provide :math:`n=2^m` low discrepancy points in
- :math:`[0,1)^{d}`. Scrambling them [3]_ makes them suitable for singular
- integrands, provides a means of error estimation, and can improve their
- rate of convergence. The scrambling strategy which is implemented is a
- (left) linear matrix scramble (LMS) followed by a digital random shift
- (LMS+shift) [2]_.
- There are many versions of Sobol' sequences depending on their
- 'direction numbers'. This code uses direction numbers from [4]_. Hence,
- the maximum number of dimension is 21201. The direction numbers have been
- precomputed with search criterion 6 and can be retrieved at
- https://web.maths.unsw.edu.au/~fkuo/sobol/.
- .. warning::
- Sobol' sequences are a quadrature rule and they lose their balance
- properties if one uses a sample size that is not a power of 2, or skips
- the first point, or thins the sequence [5]_.
- If :math:`n=2^m` points are not enough then one should take :math:`2^M`
- points for :math:`M>m`. When scrambling, the number R of independent
- replicates does not have to be a power of 2.
- Sobol' sequences are generated to some number :math:`B` of bits.
- After :math:`2^B` points have been generated, the sequence would
- repeat. Hence, an error is raised.
- The number of bits can be controlled with the parameter `bits`.
- References
- ----------
- .. [1] I. M. Sobol', "The distribution of points in a cube and the accurate
- evaluation of integrals." Zh. Vychisl. Mat. i Mat. Phys., 7:784-802,
- 1967.
- .. [2] J. Matousek, "On the L2-discrepancy for anchored boxes."
- J. of Complexity 14, 527-556, 1998.
- .. [3] Art B. Owen, "Scrambling Sobol and Niederreiter-Xing points."
- Journal of Complexity, 14(4):466-489, December 1998.
- .. [4] S. Joe and F. Y. Kuo, "Constructing sobol sequences with better
- two-dimensional projections." SIAM Journal on Scientific Computing,
- 30(5):2635-2654, 2008.
- .. [5] Art B. Owen, "On dropping the first Sobol' point."
- :arxiv:`2008.08051`, 2020.
- Examples
- --------
- Generate samples from a low discrepancy sequence of Sobol'.
- >>> from scipy.stats import qmc
- >>> sampler = qmc.Sobol(d=2, scramble=False)
- >>> sample = sampler.random_base2(m=3)
- >>> sample
- array([[0. , 0. ],
- [0.5 , 0.5 ],
- [0.75 , 0.25 ],
- [0.25 , 0.75 ],
- [0.375, 0.375],
- [0.875, 0.875],
- [0.625, 0.125],
- [0.125, 0.625]])
- Compute the quality of the sample using the discrepancy criterion.
- >>> qmc.discrepancy(sample)
- 0.013882107204860938
- To continue an existing design, extra points can be obtained
- by calling again `random_base2`. Alternatively, you can skip some
- points like:
- >>> _ = sampler.reset()
- >>> _ = sampler.fast_forward(4)
- >>> sample_continued = sampler.random_base2(m=2)
- >>> sample_continued
- array([[0.375, 0.375],
- [0.875, 0.875],
- [0.625, 0.125],
- [0.125, 0.625]])
- Finally, samples can be scaled to bounds.
- >>> l_bounds = [0, 2]
- >>> u_bounds = [10, 5]
- >>> qmc.scale(sample_continued, l_bounds, u_bounds)
- array([[3.75 , 3.125],
- [8.75 , 4.625],
- [6.25 , 2.375],
- [1.25 , 3.875]])
- """
- MAXDIM: ClassVar[int] = _MAXDIM
- @_transition_to_rng('seed', replace_doc=False)
- def __init__(
- self, d: IntNumber, *, scramble: bool = True,
- bits: IntNumber | None = None, rng: SeedType = None,
- optimization: Literal["random-cd", "lloyd"] | None = None
- ) -> None:
- # Used in `scipy.integrate.qmc_quad`
- self._init_quad = {'d': d, 'scramble': True, 'bits': bits,
- 'optimization': optimization}
- super()._initialize(d=d, optimization=optimization, rng=rng)
- if d > self.MAXDIM:
- raise ValueError(
- f"Maximum supported dimensionality is {self.MAXDIM}."
- )
- self.bits = bits
- self.dtype_i: type
- self.scramble = scramble
- if self.bits is None:
- self.bits = 30
- if self.bits <= 32:
- self.dtype_i = np.uint32
- elif 32 < self.bits <= 64:
- self.dtype_i = np.uint64
- else:
- raise ValueError("Maximum supported 'bits' is 64")
- self.maxn = 2**self.bits
- # v is d x maxbit matrix
- self._sv: np.ndarray = np.zeros((d, self.bits), dtype=self.dtype_i)
- _initialize_v(self._sv, dim=d, bits=self.bits)
- if not scramble:
- self._shift: np.ndarray = np.zeros(d, dtype=self.dtype_i)
- else:
- # scramble self._shift and self._sv
- self._scramble()
- self._quasi = self._shift.copy()
- # normalization constant with the largest possible number
- # calculate in Python to not overflow int with 2**64
- self._scale = 1.0 / 2 ** self.bits
- self._first_point = (self._quasi * self._scale).reshape(1, -1)
- # explicit casting to float64
- self._first_point = self._first_point.astype(np.float64)
- def _scramble(self) -> None:
- """Scramble the sequence using LMS+shift."""
- # Generate shift vector
- self._shift = np.dot(
- rng_integers(self.rng, 2, size=(self.d, self.bits),
- dtype=self.dtype_i),
- 2 ** np.arange(self.bits, dtype=self.dtype_i),
- )
- # Generate lower triangular matrices (stacked across dimensions)
- ltm = np.tril(rng_integers(self.rng, 2,
- size=(self.d, self.bits, self.bits),
- dtype=self.dtype_i))
- _cscramble(
- dim=self.d, bits=self.bits, # type: ignore[arg-type]
- ltm=ltm, sv=self._sv
- )
- def _random(
- self, n: IntNumber = 1, *, workers: IntNumber = 1
- ) -> np.ndarray:
- """Draw next point(s) in the Sobol' sequence.
- Parameters
- ----------
- n : int, optional
- Number of samples to generate in the parameter space. Default is 1.
- Returns
- -------
- sample : array_like (n, d)
- Sobol' sample.
- """
- sample: np.ndarray = np.empty((n, self.d), dtype=np.float64)
- if n == 0:
- return sample
- total_n = self.num_generated + n
- if total_n > self.maxn:
- msg = (
- f"At most 2**{self.bits}={self.maxn} distinct points can be "
- f"generated. {self.num_generated} points have been previously "
- f"generated, then: n={self.num_generated}+{n}={total_n}. "
- )
- if self.bits != 64:
- msg += "Consider increasing `bits`."
- raise ValueError(msg)
- if self.num_generated == 0:
- # verify n is 2**n
- if not (n & (n - 1) == 0):
- warnings.warn("The balance properties of Sobol' points require"
- " n to be a power of 2.", stacklevel=3)
- if n == 1:
- sample = self._first_point
- else:
- _draw(
- n=n - 1, num_gen=self.num_generated, dim=self.d,
- scale=self._scale, sv=self._sv, quasi=self._quasi,
- sample=sample
- )
- sample = np.concatenate(
- [self._first_point, sample]
- )[:n]
- else:
- _draw(
- n=n, num_gen=self.num_generated - 1, dim=self.d,
- scale=self._scale, sv=self._sv, quasi=self._quasi,
- sample=sample
- )
- return sample
- def random_base2(self, m: IntNumber) -> np.ndarray:
- """Draw point(s) from the Sobol' sequence.
- This function draws :math:`n=2^m` points in the parameter space
- ensuring the balance properties of the sequence.
- Parameters
- ----------
- m : int
- Logarithm in base 2 of the number of samples; i.e., n = 2^m.
- Returns
- -------
- sample : array_like (n, d)
- Sobol' sample.
- """
- n = 2 ** m
- total_n = self.num_generated + n
- if not (total_n & (total_n - 1) == 0):
- raise ValueError('The balance properties of Sobol\' points require '
- f'n to be a power of 2. {self.num_generated} points '
- 'have been previously generated, then: '
- f'n={self.num_generated}+2**{m}={total_n}. '
- 'If you still want to do this, the function '
- '\'Sobol.random()\' can be used.'
- )
- return self.random(n)
- def reset(self) -> "Sobol":
- """Reset the engine to base state.
- Returns
- -------
- engine : Sobol
- Engine reset to its base state.
- """
- super().reset()
- self._quasi = self._shift.copy()
- return self
- def fast_forward(self, n: IntNumber) -> "Sobol":
- """Fast-forward the sequence by `n` positions.
- Parameters
- ----------
- n : int
- Number of points to skip in the sequence.
- Returns
- -------
- engine : Sobol
- The fast-forwarded engine.
- """
- if self.num_generated == 0:
- _fast_forward(
- n=n - 1, num_gen=self.num_generated, dim=self.d,
- sv=self._sv, quasi=self._quasi
- )
- else:
- _fast_forward(
- n=n, num_gen=self.num_generated - 1, dim=self.d,
- sv=self._sv, quasi=self._quasi
- )
- self.num_generated += n
- return self
- class PoissonDisk(QMCEngine):
- """Poisson disk sampling.
- Parameters
- ----------
- d : int
- Dimension of the parameter space.
- radius : float
- Minimal distance to keep between points when sampling new candidates.
- hypersphere : {"volume", "surface"}, optional
- Sampling strategy to generate potential candidates to be added in the
- final sample. Default is "volume".
- * ``volume``: original Bridson algorithm as described in [1]_.
- New candidates are sampled *within* the hypersphere.
- * ``surface``: only sample the surface of the hypersphere.
- ncandidates : int
- Number of candidates to sample per iteration. More candidates result
- in a denser sampling as more candidates can be accepted per iteration.
- optimization : {None, "random-cd", "lloyd"}, optional
- Whether to use an optimization scheme to improve the quality after
- sampling. Note that this is a post-processing step that does not
- guarantee that all properties of the sample will be conserved.
- Default is None.
- * ``random-cd``: random permutations of coordinates to lower the
- centered discrepancy. The best sample based on the centered
- discrepancy is constantly updated. Centered discrepancy-based
- sampling shows better space-filling robustness toward 2D and 3D
- subprojections compared to using other discrepancy measures.
- * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
- The process converges to equally spaced samples.
- .. versionadded:: 1.10.0
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. For an interim period, both keywords will continue to work, although
- only one may be specified at a time. After the interim period, function
- calls using the `seed` keyword will emit warnings. Following a
- deprecation period, the `seed` keyword will be removed.
- l_bounds, u_bounds : array_like (d,)
- Lower and upper bounds of target sample data.
- Notes
- -----
- Poisson disk sampling is an iterative sampling strategy. Starting from
- a seed sample, `ncandidates` are sampled in the hypersphere
- surrounding the seed. Candidates below a certain `radius` or outside the
- domain are rejected. New samples are added in a pool of sample seed. The
- process stops when the pool is empty or when the number of required
- samples is reached.
- The maximum number of point that a sample can contain is directly linked
- to the `radius`. As the dimension of the space increases, a higher radius
- spreads the points further and help overcome the curse of dimensionality.
- See the :ref:`quasi monte carlo tutorial <quasi-monte-carlo>` for more
- details.
- .. warning::
- The algorithm is more suitable for low dimensions and sampling size
- due to its iterative nature and memory requirements.
- Selecting a small radius with a high dimension would
- mean that the space could contain more samples than using lower
- dimension or a bigger radius.
- Some code taken from [2]_, written consent given on 31.03.2021
- by the original author, Shamis, for free use in SciPy under
- the 3-clause BSD.
- References
- ----------
- .. [1] Robert Bridson, "Fast Poisson Disk Sampling in Arbitrary
- Dimensions." SIGGRAPH, 2007.
- .. [2] `StackOverflow <https://stackoverflow.com/questions/66047540>`__.
- Examples
- --------
- Generate a 2D sample using a `radius` of 0.2.
- >>> import numpy as np
- >>> import matplotlib.pyplot as plt
- >>> from matplotlib.collections import PatchCollection
- >>> from scipy.stats import qmc
- >>>
- >>> rng = np.random.default_rng()
- >>> radius = 0.2
- >>> engine = qmc.PoissonDisk(d=2, radius=radius, rng=rng)
- >>> sample = engine.random(20)
- Visualizing the 2D sample and showing that no points are closer than
- `radius`. ``radius/2`` is used to visualize non-intersecting circles.
- If two samples are exactly at `radius` from each other, then their circle
- of radius ``radius/2`` will touch.
- >>> fig, ax = plt.subplots()
- >>> _ = ax.scatter(sample[:, 0], sample[:, 1])
- >>> circles = [plt.Circle((xi, yi), radius=radius/2, fill=False)
- ... for xi, yi in sample]
- >>> collection = PatchCollection(circles, match_original=True)
- >>> ax.add_collection(collection)
- >>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$',
- ... xlim=[0, 1], ylim=[0, 1])
- >>> plt.show()
- Such visualization can be seen as circle packing: how many circle can
- we put in the space. It is a np-hard problem. The method `fill_space`
- can be used to add samples until no more samples can be added. This is
- a hard problem and parameters may need to be adjusted manually. Beware of
- the dimension: as the dimensionality increases, the number of samples
- required to fill the space increases exponentially
- (curse-of-dimensionality).
- """
- @_transition_to_rng('seed', replace_doc=False)
- def __init__(
- self,
- d: IntNumber,
- *,
- radius: DecimalNumber = 0.05,
- hypersphere: Literal["volume", "surface"] = "volume",
- ncandidates: IntNumber = 30,
- optimization: Literal["random-cd", "lloyd"] | None = None,
- rng: SeedType = None,
- l_bounds: "npt.ArrayLike | None" = None,
- u_bounds: "npt.ArrayLike | None" = None,
- ) -> None:
- # Used in `scipy.integrate.qmc_quad`
- self._init_quad = {'d': d, 'radius': radius,
- 'hypersphere': hypersphere,
- 'ncandidates': ncandidates,
- 'optimization': optimization}
- super()._initialize(d=d, optimization=optimization, rng=rng)
- hypersphere_sample = {
- "volume": self._hypersphere_volume_sample,
- "surface": self._hypersphere_surface_sample
- }
- try:
- self.hypersphere_method = hypersphere_sample[hypersphere]
- except KeyError as exc:
- message = (
- f"{hypersphere!r} is not a valid hypersphere sampling"
- f" method. It must be one of {set(hypersphere_sample)!r}")
- raise ValueError(message) from exc
- # size of the sphere from which the samples are drawn relative to the
- # size of a disk (radius)
- # for the surface sampler, all new points are almost exactly 1 radius
- # away from at least one existing sample +eps to avoid rejection
- self.radius_factor = 2 if hypersphere == "volume" else 1.001
- self.radius = radius
- self.radius_squared = self.radius**2
- # sample to generate per iteration in the hypersphere around center
- self.ncandidates = ncandidates
- if u_bounds is None:
- u_bounds = np.ones(d)
- if l_bounds is None:
- l_bounds = np.zeros(d)
- self.l_bounds, self.u_bounds = _validate_bounds(
- l_bounds=l_bounds, u_bounds=u_bounds, d=int(d)
- )
- with np.errstate(divide='ignore'):
- self.cell_size = self.radius / np.sqrt(self.d)
- self.grid_size = (
- np.ceil((self.u_bounds - self.l_bounds) / self.cell_size)
- ).astype(int)
- self._initialize_grid_pool()
- def _initialize_grid_pool(self):
- """Sampling pool and sample grid."""
- self.sample_pool = []
- # Positions of cells
- # n-dim value for each grid cell
- self.sample_grid = np.empty(
- np.append(self.grid_size, self.d),
- dtype=np.float32
- )
- # Initialise empty cells with NaNs
- self.sample_grid.fill(np.nan)
- def _random(
- self, n: IntNumber = 1, *, workers: IntNumber = 1
- ) -> np.ndarray:
- """Draw `n` in the interval ``[l_bounds, u_bounds]``.
- Note that it can return fewer samples if the space is full.
- See the note section of the class.
- Parameters
- ----------
- n : int, optional
- Number of samples to generate in the parameter space. Default is 1.
- Returns
- -------
- sample : array_like (n, d)
- QMC sample.
- """
- if n == 0 or self.d == 0:
- return np.empty((n, self.d))
- def in_limits(sample: np.ndarray) -> bool:
- for i in range(self.d):
- if (sample[i] > self.u_bounds[i] or sample[i] < self.l_bounds[i]):
- return False
- return True
- def in_neighborhood(candidate: np.ndarray, n: int = 2) -> bool:
- """
- Check if there are samples closer than ``radius_squared`` to the
- `candidate` sample.
- """
- indices = ((candidate - self.l_bounds) / self.cell_size).astype(int)
- ind_min = np.maximum(indices - n, 0)
- ind_max = np.minimum(indices + n + 1, self.grid_size)
- # Check if the center cell is empty
- if not np.isnan(self.sample_grid[tuple(indices)][0]):
- return True
- a = [slice(ind_min[i], ind_max[i]) for i in range(self.d)]
- # guards against: invalid value encountered in less as we are
- # comparing with nan and returns False. Which is wanted.
- with np.errstate(invalid='ignore'):
- if np.any(
- np.sum(
- np.square(candidate - self.sample_grid[tuple(a)]),
- axis=self.d
- ) < self.radius_squared
- ):
- return True
- return False
- def add_sample(candidate: np.ndarray) -> None:
- self.sample_pool.append(candidate)
- indices = ((candidate - self.l_bounds) / self.cell_size).astype(int)
- self.sample_grid[tuple(indices)] = candidate
- curr_sample.append(candidate)
- curr_sample: list[np.ndarray] = []
- if len(self.sample_pool) == 0:
- # the pool is being initialized with a single random sample
- add_sample(self.rng.uniform(self.l_bounds, self.u_bounds))
- num_drawn = 1
- else:
- num_drawn = 0
- # exhaust sample pool to have up to n sample
- while len(self.sample_pool) and num_drawn < n:
- # select a sample from the available pool
- idx_center = rng_integers(self.rng, len(self.sample_pool))
- center = self.sample_pool[idx_center]
- del self.sample_pool[idx_center]
- # generate candidates around the center sample
- candidates = self.hypersphere_method(
- center, self.radius * self.radius_factor, self.ncandidates
- )
- # keep candidates that satisfy some conditions
- for candidate in candidates:
- if in_limits(candidate) and not in_neighborhood(candidate):
- add_sample(candidate)
- num_drawn += 1
- if num_drawn >= n:
- break
- self.num_generated += num_drawn
- return np.array(curr_sample)
- def fill_space(self) -> np.ndarray:
- """Draw ``n`` samples in the interval ``[l_bounds, u_bounds]``.
- Unlike `random`, this method will try to add points until
- the space is full. Depending on ``candidates`` (and to a lesser extent
- other parameters), some empty areas can still be present in the sample.
- .. warning::
- This can be extremely slow in high dimensions or if the
- ``radius`` is very small-with respect to the dimensionality.
- Returns
- -------
- sample : array_like (n, d)
- QMC sample.
- """
- return self.random(np.inf) # type: ignore[arg-type]
- def reset(self) -> "PoissonDisk":
- """Reset the engine to base state.
- Returns
- -------
- engine : PoissonDisk
- Engine reset to its base state.
- """
- super().reset()
- self._initialize_grid_pool()
- return self
- def _hypersphere_volume_sample(
- self, center: np.ndarray, radius: DecimalNumber,
- candidates: IntNumber = 1
- ) -> np.ndarray:
- """Uniform sampling within hypersphere."""
- # should remove samples within r/2
- x = self.rng.standard_normal(size=(candidates, self.d))
- ssq = np.sum(x**2, axis=1)
- fr = radius * gammainc(self.d/2, ssq/2)**(1/self.d) / np.sqrt(ssq)
- fr_tiled = np.tile(
- fr.reshape(-1, 1), (1, self.d) # type: ignore[arg-type]
- )
- p = center + np.multiply(x, fr_tiled)
- return p
- def _hypersphere_surface_sample(
- self, center: np.ndarray, radius: DecimalNumber,
- candidates: IntNumber = 1
- ) -> np.ndarray:
- """Uniform sampling on the hypersphere's surface."""
- vec = self.rng.standard_normal(size=(candidates, self.d))
- vec /= np.linalg.norm(vec, axis=1)[:, None]
- p = center + np.multiply(vec, radius)
- return p
- class MultivariateNormalQMC:
- r"""QMC sampling from a multivariate Normal :math:`N(\mu, \Sigma)`.
- Parameters
- ----------
- mean : array_like (d,)
- The mean vector. Where ``d`` is the dimension.
- cov : array_like (d, d), optional
- The covariance matrix. If omitted, use `cov_root` instead.
- If both `cov` and `cov_root` are omitted, use the identity matrix.
- cov_root : array_like (d, d'), optional
- A root decomposition of the covariance matrix, where ``d'`` may be less
- than ``d`` if the covariance is not full rank. If omitted, use `cov`.
- inv_transform : bool, optional
- If True, use inverse transform instead of Box-Muller. Default is True.
- engine : QMCEngine, optional
- Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. For an interim period, both keywords will continue to work, although
- only one may be specified at a time. After the interim period, function
- calls using the `seed` keyword will emit warnings. Following a
- deprecation period, the `seed` keyword will be removed.
- Examples
- --------
- >>> import matplotlib.pyplot as plt
- >>> from scipy.stats import qmc
- >>> dist = qmc.MultivariateNormalQMC(mean=[0, 5], cov=[[1, 0], [0, 1]])
- >>> sample = dist.random(512)
- >>> _ = plt.scatter(sample[:, 0], sample[:, 1])
- >>> plt.show()
- """
- @_transition_to_rng('seed', replace_doc=False)
- def __init__(
- self,
- mean: "npt.ArrayLike",
- cov: "npt.ArrayLike | None" = None,
- *,
- cov_root: "npt.ArrayLike | None" = None,
- inv_transform: bool = True,
- engine: QMCEngine | None = None,
- rng: SeedType = None,
- ) -> None:
- mean = np.asarray(np.atleast_1d(mean))
- d = mean.shape[0]
- if cov is not None:
- # covariance matrix provided
- cov = np.asarray(np.atleast_2d(cov))
- # check for square/symmetric cov matrix and mean vector has the
- # same d
- if not mean.shape[0] == cov.shape[0]:
- raise ValueError("Dimension mismatch between mean and "
- "covariance.")
- if not np.allclose(cov, cov.transpose()):
- raise ValueError("Covariance matrix is not symmetric.")
- # compute Cholesky decomp; if it fails, do the eigen decomposition
- try:
- cov_root = np.linalg.cholesky(cov).transpose()
- except np.linalg.LinAlgError:
- eigval, eigvec = np.linalg.eigh(cov)
- if not np.all(eigval >= -1.0e-8):
- raise ValueError("Covariance matrix not PSD.")
- eigval = np.clip(eigval, 0.0, None)
- cov_root = (eigvec * np.sqrt(eigval)).transpose()
- elif cov_root is not None:
- # root decomposition provided
- cov_root = np.atleast_2d(cov_root)
- if not mean.shape[0] == cov_root.shape[0]:
- raise ValueError("Dimension mismatch between mean and "
- "covariance.")
- else:
- # corresponds to identity covariance matrix
- cov_root = None
- self._inv_transform = inv_transform
- if not inv_transform:
- # to apply Box-Muller, we need an even number of dimensions
- engine_dim = 2 * math.ceil(d / 2)
- else:
- engine_dim = d
- if engine is None:
- # Need this during SPEC 7 transition to prevent `RandomState`
- # from being passed via `rng`.
- kwarg = "seed" if isinstance(rng, np.random.RandomState) else "rng"
- kwargs = {kwarg: rng}
- self.engine = Sobol(
- d=engine_dim, scramble=True, bits=30, **kwargs
- ) # type: QMCEngine
- elif isinstance(engine, QMCEngine):
- if engine.d != engine_dim:
- raise ValueError("Dimension of `engine` must be consistent"
- " with dimensions of mean and covariance."
- " If `inv_transform` is False, it must be"
- " an even number.")
- self.engine = engine
- else:
- raise ValueError("`engine` must be an instance of "
- "`scipy.stats.qmc.QMCEngine` or `None`.")
- self._mean = mean
- self._corr_matrix = cov_root
- self._d = d
- def random(self, n: IntNumber = 1) -> np.ndarray:
- """Draw `n` QMC samples from the multivariate Normal.
- Parameters
- ----------
- n : int, optional
- Number of samples to generate in the parameter space. Default is 1.
- Returns
- -------
- sample : array_like (n, d)
- Sample.
- """
- base_samples = self._standard_normal_samples(n)
- return self._correlate(base_samples)
- def _correlate(self, base_samples: np.ndarray) -> np.ndarray:
- if self._corr_matrix is not None:
- return base_samples @ self._corr_matrix + self._mean
- else:
- # avoid multiplying with identity here
- return base_samples + self._mean
- def _standard_normal_samples(self, n: IntNumber = 1) -> np.ndarray:
- """Draw `n` QMC samples from the standard Normal :math:`N(0, I_d)`.
- Parameters
- ----------
- n : int, optional
- Number of samples to generate in the parameter space. Default is 1.
- Returns
- -------
- sample : array_like (n, d)
- Sample.
- """
- # get base samples
- samples = self.engine.random(n)
- if self._inv_transform:
- # apply inverse transform
- # (values to close to 0/1 result in inf values)
- return stats.norm.ppf(0.5 + (1 - 1e-10) * (samples - 0.5)) # type: ignore[attr-defined] # noqa: E501
- else:
- # apply Box-Muller transform (note: indexes starting from 1)
- even = np.arange(0, samples.shape[-1], 2)
- Rs = np.sqrt(-2 * np.log(samples[:, even]))
- thetas = 2 * math.pi * samples[:, 1 + even]
- cos = np.cos(thetas)
- sin = np.sin(thetas)
- transf_samples = np.stack([Rs * cos, Rs * sin],
- -1).reshape(n, -1)
- # make sure we only return the number of dimension requested
- return transf_samples[:, : self._d]
- class MultinomialQMC:
- r"""QMC sampling from a multinomial distribution.
- Parameters
- ----------
- pvals : array_like (k,)
- Vector of probabilities of size ``k``, where ``k`` is the number
- of categories. Elements must be non-negative and sum to 1.
- n_trials : int
- Number of trials.
- engine : QMCEngine, optional
- Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.
- rng : `numpy.random.Generator`, optional
- Pseudorandom number generator state. When `rng` is None, a new
- `numpy.random.Generator` is created using entropy from the
- operating system. Types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to
- `rng`. For an interim period, both keywords will continue to work, although
- only one may be specified at a time. After the interim period, function
- calls using the `seed` keyword will emit warnings. Following a
- deprecation period, the `seed` keyword will be removed.
- Examples
- --------
- Let's define 3 categories and for a given sample, the sum of the trials
- of each category is 8. The number of trials per category is determined
- by the `pvals` associated to each category.
- Then, we sample this distribution 64 times.
- >>> import matplotlib.pyplot as plt
- >>> from scipy.stats import qmc
- >>> dist = qmc.MultinomialQMC(
- ... pvals=[0.2, 0.4, 0.4], n_trials=10, engine=qmc.Halton(d=1)
- ... )
- >>> sample = dist.random(64)
- We can plot the sample and verify that the median of number of trials
- for each category is following the `pvals`. That would be
- ``pvals * n_trials = [2, 4, 4]``.
- >>> fig, ax = plt.subplots()
- >>> ax.yaxis.get_major_locator().set_params(integer=True)
- >>> _ = ax.boxplot(sample)
- >>> ax.set(xlabel="Categories", ylabel="Trials")
- >>> plt.show()
- """
- @_transition_to_rng('seed', replace_doc=False)
- def __init__(
- self,
- pvals: "npt.ArrayLike",
- n_trials: IntNumber,
- *,
- engine: QMCEngine | None = None,
- rng: SeedType = None,
- ) -> None:
- self.pvals = np.atleast_1d(np.asarray(pvals))
- if np.min(pvals) < 0:
- raise ValueError('Elements of pvals must be non-negative.')
- if not np.isclose(np.sum(pvals), 1):
- raise ValueError('Elements of pvals must sum to 1.')
- self.n_trials = n_trials
- if engine is None:
- # Need this during SPEC 7 transition to prevent `RandomState`
- # from being passed via `rng`.
- kwarg = "seed" if isinstance(rng, np.random.RandomState) else "rng"
- kwargs = {kwarg: rng}
- self.engine = Sobol(
- d=1, scramble=True, bits=30, **kwargs
- ) # type: QMCEngine
- elif isinstance(engine, QMCEngine):
- if engine.d != 1:
- raise ValueError("Dimension of `engine` must be 1.")
- self.engine = engine
- else:
- raise ValueError("`engine` must be an instance of "
- "`scipy.stats.qmc.QMCEngine` or `None`.")
- def random(self, n: IntNumber = 1) -> np.ndarray:
- """Draw `n` QMC samples from the multinomial distribution.
- Parameters
- ----------
- n : int, optional
- Number of samples to generate in the parameter space. Default is 1.
- Returns
- -------
- samples : array_like (n, pvals)
- Sample.
- """
- sample = np.empty((n, len(self.pvals)))
- for i in range(n):
- base_draws = self.engine.random(self.n_trials).ravel()
- p_cumulative = np.empty_like(self.pvals, dtype=float)
- _fill_p_cumulative(np.array(self.pvals, dtype=float), p_cumulative)
- sample_ = np.zeros_like(self.pvals, dtype=np.intp)
- _categorize(base_draws, p_cumulative, sample_)
- sample[i] = sample_
- return sample
- def _select_optimizer(
- optimization: Literal["random-cd", "lloyd"] | None, config: dict
- ) -> Callable | None:
- """A factory for optimization methods."""
- optimization_method: dict[str, Callable] = {
- "random-cd": _random_cd,
- "lloyd": _lloyd_centroidal_voronoi_tessellation
- }
- optimizer: partial | None
- if optimization is not None:
- try:
- optimization = optimization.lower() # type: ignore[assignment]
- optimizer_ = optimization_method[optimization]
- except KeyError as exc:
- message = (f"{optimization!r} is not a valid optimization"
- f" method. It must be one of"
- f" {set(optimization_method)!r}")
- raise ValueError(message) from exc
- # config
- optimizer = partial(optimizer_, **config)
- else:
- optimizer = None
- return optimizer
- def _random_cd(
- best_sample: np.ndarray, n_iters: int, n_nochange: int, rng: GeneratorType,
- **kwargs: dict
- ) -> np.ndarray:
- """Optimal LHS on CD.
- Create a base LHS and do random permutations of coordinates to
- lower the centered discrepancy.
- Because it starts with a normal LHS, it also works with the
- `scramble` keyword argument.
- Two stopping criterion are used to stop the algorithm: at most,
- `n_iters` iterations are performed; or if there is no improvement
- for `n_nochange` consecutive iterations.
- """
- del kwargs # only use keywords which are defined, needed by factory
- n, d = best_sample.shape
- if d == 0 or n == 0:
- return np.empty((n, d))
- if d == 1 or n == 1:
- # discrepancy measures are invariant under permuting factors and runs
- return best_sample
- best_disc = discrepancy(best_sample)
- bounds = ([0, d - 1],
- [0, n - 1],
- [0, n - 1])
- n_nochange_ = 0
- n_iters_ = 0
- while n_nochange_ < n_nochange and n_iters_ < n_iters:
- n_iters_ += 1
- col = rng_integers(rng, *bounds[0], endpoint=True) # type: ignore[misc]
- row_1 = rng_integers(rng, *bounds[1], endpoint=True) # type: ignore[misc]
- row_2 = rng_integers(rng, *bounds[2], endpoint=True) # type: ignore[misc]
- disc = _perturb_discrepancy(best_sample,
- row_1, row_2, col,
- best_disc)
- if disc < best_disc:
- best_sample[row_1, col], best_sample[row_2, col] = (
- best_sample[row_2, col], best_sample[row_1, col])
- best_disc = disc
- n_nochange_ = 0
- else:
- n_nochange_ += 1
- return best_sample
- def _l1_norm(sample: np.ndarray) -> float:
- return distance.pdist(sample, 'cityblock').min()
- def _lloyd_iteration(
- sample: np.ndarray,
- decay: float,
- qhull_options: str
- ) -> np.ndarray:
- """Lloyd-Max algorithm iteration.
- Based on the implementation of Stéfan van der Walt:
- https://github.com/stefanv/lloyd
- which is:
- Copyright (c) 2021-04-21 Stéfan van der Walt
- https://github.com/stefanv/lloyd
- MIT License
- Parameters
- ----------
- sample : array_like (n, d)
- The sample to iterate on.
- decay : float
- Relaxation decay. A positive value would move the samples toward
- their centroid, and negative value would move them away.
- 1 would move the samples to their centroid.
- qhull_options : str
- Additional options to pass to Qhull. See Qhull manual
- for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and
- "Qbb Qc Qz Qj" otherwise.)
- Returns
- -------
- sample : array_like (n, d)
- The sample after an iteration of Lloyd's algorithm.
- """
- new_sample = np.empty_like(sample)
- voronoi = Voronoi(sample, qhull_options=qhull_options)
- for ii, idx in enumerate(voronoi.point_region):
- # the region is a series of indices into self.voronoi.vertices
- # remove samples at infinity, designated by index -1
- region = [i for i in voronoi.regions[idx] if i != -1]
- # get the vertices for this region
- verts = voronoi.vertices[region]
- # clipping would be wrong, we need to intersect
- # verts = np.clip(verts, 0, 1)
- # move samples towards centroids:
- # Centroid in n-D is the mean for uniformly distributed nodes
- # of a geometry.
- centroid = np.mean(verts, axis=0)
- new_sample[ii] = sample[ii] + (centroid - sample[ii]) * decay
- # only update sample to centroid within the region
- is_valid = np.all(np.logical_and(new_sample >= 0, new_sample <= 1), axis=1)
- sample[is_valid] = new_sample[is_valid]
- return sample
- def _lloyd_centroidal_voronoi_tessellation(
- sample: "npt.ArrayLike",
- *,
- tol: DecimalNumber = 1e-5,
- maxiter: IntNumber = 10,
- qhull_options: str | None = None,
- **kwargs: dict
- ) -> np.ndarray:
- """Approximate Centroidal Voronoi Tessellation.
- Perturb samples in N-dimensions using Lloyd-Max algorithm.
- Parameters
- ----------
- sample : array_like (n, d)
- The sample to iterate on. With ``n`` the number of samples and ``d``
- the dimension. Samples must be in :math:`[0, 1]^d`, with ``d>=2``.
- tol : float, optional
- Tolerance for termination. If the min of the L1-norm over the samples
- changes less than `tol`, it stops the algorithm. Default is 1e-5.
- maxiter : int, optional
- Maximum number of iterations. It will stop the algorithm even if
- `tol` is above the threshold.
- Too many iterations tend to cluster the samples as a hypersphere.
- Default is 10.
- qhull_options : str, optional
- Additional options to pass to Qhull. See Qhull manual
- for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and
- "Qbb Qc Qz Qj" otherwise.)
- Returns
- -------
- sample : array_like (n, d)
- The sample after being processed by Lloyd-Max algorithm.
- Notes
- -----
- Lloyd-Max algorithm is an iterative process with the purpose of improving
- the dispersion of samples. For given sample: (i) compute a Voronoi
- Tessellation; (ii) find the centroid of each Voronoi cell; (iii) move the
- samples toward the centroid of their respective cell. See [1]_, [2]_.
- A relaxation factor is used to control how fast samples can move at each
- iteration. This factor is starting at 2 and ending at 1 after `maxiter`
- following an exponential decay.
- The process converges to equally spaced samples. It implies that measures
- like the discrepancy could suffer from too many iterations. On the other
- hand, L1 and L2 distances should improve. This is especially true with
- QMC methods which tend to favor the discrepancy over other criteria.
- .. note::
- The current implementation does not intersect the Voronoi Tessellation
- with the boundaries. This implies that for a low number of samples,
- empirically below 20, no Voronoi cell is touching the boundaries.
- Hence, samples cannot be moved close to the boundaries.
- Further improvements could consider the samples at infinity so that
- all boundaries are segments of some Voronoi cells. This would fix
- the computation of the centroid position.
- .. warning::
- The Voronoi Tessellation step is expensive and quickly becomes
- intractable with dimensions as low as 10 even for a sample
- of size as low as 1000.
- .. versionadded:: 1.9.0
- References
- ----------
- .. [1] Lloyd. "Least Squares Quantization in PCM".
- IEEE Transactions on Information Theory, 1982.
- .. [2] Max J. "Quantizing for minimum distortion".
- IEEE Transactions on Information Theory, 1960.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.spatial import distance
- >>> from scipy.stats._qmc import _lloyd_centroidal_voronoi_tessellation
- >>> rng = np.random.default_rng()
- >>> sample = rng.random((128, 2))
- .. note::
- The samples need to be in :math:`[0, 1]^d`. `scipy.stats.qmc.scale`
- can be used to scale the samples from their
- original bounds to :math:`[0, 1]^d`. And back to their original bounds.
- Compute the quality of the sample using the L1 criterion.
- >>> def l1_norm(sample):
- ... return distance.pdist(sample, 'cityblock').min()
- >>> l1_norm(sample)
- 0.00161... # random
- Now process the sample using Lloyd's algorithm and check the improvement
- on the L1. The value should increase.
- >>> sample = _lloyd_centroidal_voronoi_tessellation(sample)
- >>> l1_norm(sample)
- 0.0278... # random
- """
- del kwargs # only use keywords which are defined, needed by factory
- sample = np.asarray(sample).copy()
- if not sample.ndim == 2:
- raise ValueError('`sample` is not a 2D array')
- if not sample.shape[1] >= 2:
- raise ValueError('`sample` dimension is not >= 2')
- # Checking that sample is within the hypercube
- if (sample.max() > 1.) or (sample.min() < 0.):
- raise ValueError('`sample` is not in unit hypercube')
- if qhull_options is None:
- qhull_options = 'Qbb Qc Qz QJ'
- if sample.shape[1] >= 5:
- qhull_options += ' Qx'
- # Fit an exponential to be 2 at 0 and 1 at `maxiter`.
- # The decay is used for relaxation.
- # analytical solution for y=exp(-maxiter/x) - 0.1
- root = -maxiter / np.log(0.1)
- decay = [np.exp(-x / root)+0.9 for x in range(maxiter)]
- l1_old = _l1_norm(sample=sample)
- for i in range(maxiter):
- sample = _lloyd_iteration(
- sample=sample, decay=decay[i],
- qhull_options=qhull_options,
- )
- l1_new = _l1_norm(sample=sample)
- if abs(l1_new - l1_old) < tol:
- break
- else:
- l1_old = l1_new
- return sample
- def _validate_workers(workers: IntNumber = 1) -> IntNumber:
- """Validate `workers` based on platform and value.
- Parameters
- ----------
- workers : int, optional
- Number of workers to use for parallel processing. If -1 is
- given all CPU threads are used. Default is 1.
- Returns
- -------
- Workers : int
- Number of CPU used by the algorithm
- """
- workers = int(workers)
- if workers == -1:
- workers = os.cpu_count() # type: ignore[assignment]
- if workers is None:
- raise NotImplementedError(
- "Cannot determine the number of cpus using os.cpu_count(), "
- "cannot use -1 for the number of workers"
- )
- elif workers <= 0:
- raise ValueError(f"Invalid number of workers: {workers}, must be -1 "
- "or > 0")
- return workers
- def _validate_bounds(
- l_bounds: "npt.ArrayLike", u_bounds: "npt.ArrayLike", d: int
- ) -> "tuple[npt.NDArray[np.generic], npt.NDArray[np.generic]]":
- """Bounds input validation.
- Parameters
- ----------
- l_bounds, u_bounds : array_like (d,)
- Lower and upper bounds.
- d : int
- Dimension to use for broadcasting.
- Returns
- -------
- l_bounds, u_bounds : array_like (d,)
- Lower and upper bounds.
- """
- try:
- lower = np.broadcast_to(l_bounds, d)
- upper = np.broadcast_to(u_bounds, d)
- except ValueError as exc:
- msg = ("'l_bounds' and 'u_bounds' must be broadcastable and respect"
- " the sample dimension")
- raise ValueError(msg) from exc
- if not np.all(lower < upper):
- raise ValueError("Bounds are not consistent 'l_bounds' < 'u_bounds'")
- return lower, upper
|