| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939 |
- """
- Distance computations (:mod:`scipy.spatial.distance`)
- =====================================================
- .. sectionauthor:: Damian Eads
- Function reference
- ------------------
- Distance matrix computation from a collection of raw observation vectors
- stored in a rectangular array.
- .. autosummary::
- :toctree: generated/
- pdist -- pairwise distances between observation vectors.
- cdist -- distances between two collections of observation vectors
- squareform -- convert distance matrix to a condensed one and vice versa
- directed_hausdorff -- directed Hausdorff distance between arrays
- Predicates for checking the validity of distance matrices, both
- condensed and redundant. Also contained in this module are functions
- for computing the number of observations in a distance matrix.
- .. autosummary::
- :toctree: generated/
- is_valid_dm -- checks for a valid distance matrix
- is_valid_y -- checks for a valid condensed distance matrix
- num_obs_dm -- # of observations in a distance matrix
- num_obs_y -- # of observations in a condensed distance matrix
- Distance functions between two numeric vectors ``u`` and ``v``. Computing
- distances over a large collection of vectors is inefficient for these
- functions. Use ``pdist`` for this purpose.
- .. autosummary::
- :toctree: generated/
- braycurtis -- the Bray-Curtis distance.
- canberra -- the Canberra distance.
- chebyshev -- the Chebyshev distance.
- cityblock -- the Manhattan distance.
- correlation -- the Correlation distance.
- cosine -- the Cosine distance.
- euclidean -- the Euclidean distance.
- jensenshannon -- the Jensen-Shannon distance.
- mahalanobis -- the Mahalanobis distance.
- minkowski -- the Minkowski distance.
- seuclidean -- the normalized Euclidean distance.
- sqeuclidean -- the squared Euclidean distance.
- Distance functions between two boolean vectors (representing sets) ``u`` and
- ``v``. As in the case of numerical vectors, ``pdist`` is more efficient for
- computing the distances between all pairs.
- .. autosummary::
- :toctree: generated/
- dice -- the Dice dissimilarity.
- hamming -- the Hamming distance.
- jaccard -- the Jaccard distance.
- rogerstanimoto -- the Rogers-Tanimoto dissimilarity.
- russellrao -- the Russell-Rao dissimilarity.
- sokalsneath -- the Sokal-Sneath dissimilarity.
- yule -- the Yule dissimilarity.
- :func:`hamming` also operates over discrete numerical vectors.
- """
- # Copyright (C) Damian Eads, 2007-2008. New BSD License.
- __all__ = [
- 'braycurtis',
- 'canberra',
- 'cdist',
- 'chebyshev',
- 'cityblock',
- 'correlation',
- 'cosine',
- 'dice',
- 'directed_hausdorff',
- 'euclidean',
- 'hamming',
- 'is_valid_dm',
- 'is_valid_y',
- 'jaccard',
- 'jensenshannon',
- 'mahalanobis',
- 'minkowski',
- 'num_obs_dm',
- 'num_obs_y',
- 'pdist',
- 'rogerstanimoto',
- 'russellrao',
- 'seuclidean',
- 'sokalsneath',
- 'sqeuclidean',
- 'squareform',
- 'yule'
- ]
- import math
- import warnings
- import dataclasses
- from collections.abc import Callable
- from functools import partial
- import numpy as np
- from scipy._lib._array_api import _asarray
- from scipy._lib._util import _asarray_validated, _transition_to_rng
- from scipy._lib import array_api_extra as xpx
- from scipy.linalg import norm
- from scipy.special import rel_entr
- from . import _hausdorff, _distance_pybind, _distance_wrap
- def _copy_array_if_base_present(a):
- """Copy the array if its base points to a parent array."""
- if a.base is not None:
- return a.copy()
- return a
- def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
- XA = XA - XA.mean(axis=1, keepdims=True)
- XB = XB - XB.mean(axis=1, keepdims=True)
- _distance_wrap.cdist_cosine_double_wrap(XA, XB, dm, **kwargs)
- def _correlation_pdist_wrap(X, dm, **kwargs):
- X2 = X - X.mean(axis=1, keepdims=True)
- _distance_wrap.pdist_cosine_double_wrap(X2, dm, **kwargs)
- def _convert_to_type(X, out_type):
- return np.ascontiguousarray(X, dtype=out_type)
- def _nbool_correspond_all(u, v, w=None):
- if u.dtype == v.dtype == bool and w is None:
- not_u = ~u
- not_v = ~v
- nff = (not_u & not_v).sum()
- nft = (not_u & v).sum()
- ntf = (u & not_v).sum()
- ntt = (u & v).sum()
- else:
- dtype = np.result_type(int, u.dtype, v.dtype)
- u = u.astype(dtype)
- v = v.astype(dtype)
- not_u = 1.0 - u
- not_v = 1.0 - v
- if w is not None:
- not_u = w * not_u
- u = w * u
- nff = (not_u * not_v).sum()
- nft = (not_u * v).sum()
- ntf = (u * not_v).sum()
- ntt = (u * v).sum()
- return (nff, nft, ntf, ntt)
- def _nbool_correspond_ft_tf(u, v, w=None):
- if u.dtype == v.dtype == bool and w is None:
- not_u = ~u
- not_v = ~v
- nft = (not_u & v).sum()
- ntf = (u & not_v).sum()
- else:
- dtype = np.result_type(int, u.dtype, v.dtype)
- u = u.astype(dtype)
- v = v.astype(dtype)
- not_u = 1.0 - u
- not_v = 1.0 - v
- if w is not None:
- not_u = w * not_u
- u = w * u
- nft = (not_u * v).sum()
- ntf = (u * not_v).sum()
- return (nft, ntf)
- def _validate_cdist_input(XA, XB, mA, mB, n, metric_info, **kwargs):
- # get supported types
- types = metric_info.types
- # choose best type
- typ = types[types.index(XA.dtype)] if XA.dtype in types else types[0]
- # validate data
- XA = _convert_to_type(XA, out_type=typ)
- XB = _convert_to_type(XB, out_type=typ)
- # validate kwargs
- _validate_kwargs = metric_info.validator
- if _validate_kwargs:
- kwargs = _validate_kwargs((XA, XB), mA + mB, n, **kwargs)
- return XA, XB, typ, kwargs
- def _validate_weight_with_size(X, m, n, **kwargs):
- w = kwargs.pop('w', None)
- if w is None:
- return kwargs
- if w.ndim != 1 or w.shape[0] != n:
- raise ValueError("Weights must have same size as input vector. "
- f"{w.shape[0]} vs. {n}")
- kwargs['w'] = _validate_weights(w)
- return kwargs
- def _validate_hamming_kwargs(X, m, n, **kwargs):
- w = kwargs.get('w', np.ones((n,), dtype='double'))
- if w.ndim != 1 or w.shape[0] != n:
- raise ValueError(f"Weights must have same size as input vector. "
- f"{w.shape[0]} vs. {n}")
- kwargs['w'] = _validate_weights(w)
- return kwargs
- def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
- VI = kwargs.pop('VI', None)
- if VI is None:
- if m <= n:
- # There are fewer observations than the dimension of
- # the observations.
- raise ValueError(
- f"The number of observations ({m}) is too small; "
- f"the covariance matrix is singular. For observations "
- f"with {n} dimensions, at least {n + 1} observations are required.")
- if isinstance(X, tuple):
- X = np.vstack(X)
- CV = np.atleast_2d(np.cov(X.astype(np.float64, copy=False).T))
- VI = np.linalg.inv(CV).T.copy()
- kwargs["VI"] = _convert_to_double(VI)
- return kwargs
- def _validate_minkowski_kwargs(X, m, n, **kwargs):
- kwargs = _validate_weight_with_size(X, m, n, **kwargs)
- if 'p' not in kwargs:
- kwargs['p'] = 2.
- else:
- if kwargs['p'] <= 0:
- raise ValueError("p must be greater than 0")
- return kwargs
- def _validate_pdist_input(X, m, n, metric_info, **kwargs):
- # get supported types
- types = metric_info.types
- # choose best type
- typ = types[types.index(X.dtype)] if X.dtype in types else types[0]
- # validate data
- X = _convert_to_type(X, out_type=typ)
- # validate kwargs
- _validate_kwargs = metric_info.validator
- if _validate_kwargs:
- kwargs = _validate_kwargs(X, m, n, **kwargs)
- return X, typ, kwargs
- def _validate_seuclidean_kwargs(X, m, n, **kwargs):
- V = kwargs.pop('V', None)
- if V is None:
- if isinstance(X, tuple):
- X = np.vstack(X)
- V = np.var(X.astype(np.float64, copy=False), axis=0, ddof=1)
- else:
- V = np.asarray(V, order='c')
- if len(V.shape) != 1:
- raise ValueError('Variance vector V must '
- 'be one-dimensional.')
- if V.shape[0] != n:
- raise ValueError('Variance vector V must be of the same '
- 'dimension as the vectors on which the distances '
- 'are computed.')
- kwargs['V'] = _convert_to_double(V)
- return kwargs
- def _validate_vector(u, dtype=None):
- # XXX Is order='c' really necessary?
- u = np.asarray(u, dtype=dtype, order='c')
- if u.ndim == 1:
- return u
- raise ValueError("Input vector should be 1-D.")
- def _validate_weights(w, dtype=np.float64):
- w = _validate_vector(w, dtype=dtype)
- if np.any(w < 0):
- raise ValueError("Input weights should be all non-negative")
- return w
- @_transition_to_rng('seed', position_num=2, replace_doc=False)
- def directed_hausdorff(u, v, rng=0):
- """
- Compute the directed Hausdorff distance between two 2-D arrays.
- Distances between pairs are calculated using a Euclidean metric.
- Parameters
- ----------
- u : (M,N) array_like
- Input array with M points in N dimensions.
- v : (O,N) array_like
- Input array with O points in N dimensions.
- rng : int or `numpy.random.Generator` or None, optional
- Pseudorandom number generator state. Default is 0 so the
- shuffling of `u` and `v` is reproducible.
- If `rng` is passed by keyword, types other than `numpy.random.Generator` are
- passed to `numpy.random.default_rng` to instantiate a ``Generator``.
- If `rng` is already a ``Generator`` instance, then the provided instance is
- used.
- If this argument is passed by position or `seed` is passed by keyword,
- legacy behavior for the argument `seed` applies:
- - If `seed` is None, a new ``RandomState`` instance is used. The state is
- initialized using data from ``/dev/urandom`` (or the Windows analogue)
- if available or from the system clock otherwise.
- - If `seed` is an int, a new ``RandomState`` instance is used,
- seeded with `seed`.
- - If `seed` is already a ``Generator`` or ``RandomState`` instance, then
- that instance is used.
- .. versionchanged:: 1.15.0
- As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
- transition from use of `numpy.random.RandomState` to
- `numpy.random.Generator`, this keyword was changed from `seed` to `rng`.
- For an interim period, both keywords will continue to work, although only
- one may be specified at a time. After the interim period, function calls
- using the `seed` keyword will emit warnings. The behavior of both `seed`
- and `rng` are outlined above, but only the `rng` keyword should be used in
- new code.
- Returns
- -------
- d : double
- The directed Hausdorff distance between arrays `u` and `v`,
- index_1 : int
- index of point contributing to Hausdorff pair in `u`
- index_2 : int
- index of point contributing to Hausdorff pair in `v`
- Raises
- ------
- ValueError
- An exception is thrown if `u` and `v` do not have
- the same number of columns.
- See Also
- --------
- scipy.spatial.procrustes : Another similarity test for two data sets
- Notes
- -----
- Uses the early break technique and the random sampling approach
- described by [1]_. Although worst-case performance is ``O(m * o)``
- (as with the brute force algorithm), this is unlikely in practice
- as the input data would have to require the algorithm to explore
- every single point interaction, and after the algorithm shuffles
- the input points at that. The best case performance is ``O(m)``,
- which is satisfied by selecting an inner loop distance that is
- less than cmax and leads to an early break as often as possible.
- The authors have formally shown that the average runtime is
- closer to ``O(m)``.
- .. versionadded:: 0.19.0
- References
- ----------
- .. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for
- calculating the exact Hausdorff distance." IEEE Transactions On
- Pattern Analysis And Machine Intelligence, vol. 37, no. 11,
- pp. 2153-63, 2015. :doi:`10.1109/TPAMI.2015.2408351`.
- Examples
- --------
- Find the directed Hausdorff distance between two 2-D arrays of
- coordinates:
- >>> from scipy.spatial.distance import directed_hausdorff
- >>> import numpy as np
- >>> u = np.array([(1.0, 0.0),
- ... (0.0, 1.0),
- ... (-1.0, 0.0),
- ... (0.0, -1.0)])
- >>> v = np.array([(2.0, 0.0),
- ... (0.0, 2.0),
- ... (-2.0, 0.0),
- ... (0.0, -4.0)])
- >>> directed_hausdorff(u, v)[0]
- 2.23606797749979
- >>> directed_hausdorff(v, u)[0]
- 3.0
- Find the general (symmetric) Hausdorff distance between two 2-D
- arrays of coordinates:
- >>> max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
- 3.0
- Find the indices of the points that generate the Hausdorff distance
- (the Hausdorff pair):
- >>> directed_hausdorff(v, u)[1:]
- (3, 3)
- """
- u = np.asarray(u, dtype=np.float64, order='c')
- v = np.asarray(v, dtype=np.float64, order='c')
- if u.shape[1] != v.shape[1]:
- raise ValueError('u and v need to have the same '
- 'number of columns')
- result = _hausdorff.directed_hausdorff(u, v, rng)
- return result
- def minkowski(u, v, p=2, w=None):
- """
- Compute the Minkowski distance between two 1-D arrays.
- The Minkowski distance between 1-D arrays `u` and `v`,
- is defined as
- .. math::
- {\\|u-v\\|}_p = (\\sum{|u_i - v_i|^p})^{1/p}.
- \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- p : scalar
- The order of the norm of the difference :math:`{\\|u-v\\|}_p`. Note
- that for :math:`0 < p < 1`, the triangle inequality only holds with
- an additional multiplicative factor, i.e. it is only a quasi-metric.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- minkowski : double
- The Minkowski distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
- 2.0
- >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
- 1.4142135623730951
- >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
- 1.2599210498948732
- >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
- 1.0
- >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
- 1.0
- >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
- 1.0
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if p <= 0:
- raise ValueError("p must be greater than 0")
- u_v = u - v
- if w is not None:
- w = _validate_weights(w)
- if p == 1:
- root_w = w
- elif p == 2:
- # better precision and speed
- root_w = np.sqrt(w)
- elif p == np.inf:
- root_w = (w != 0)
- else:
- root_w = np.power(w, 1/p)
- u_v = root_w * u_v
- dist = norm(u_v, ord=p)
- return dist
- def euclidean(u, v, w=None):
- """
- Computes the Euclidean distance between two 1-D arrays.
- The Euclidean distance between 1-D arrays `u` and `v`, is defined as
- .. math::
- {\\|u-v\\|}_2
- \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)^{1/2}
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- euclidean : double
- The Euclidean distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.euclidean([1, 0, 0], [0, 1, 0])
- 1.4142135623730951
- >>> distance.euclidean([1, 1, 0], [0, 1, 0])
- 1.0
- """
- return minkowski(u, v, p=2, w=w)
- def sqeuclidean(u, v, w=None):
- """
- Compute the squared Euclidean distance between two 1-D arrays.
- The squared Euclidean distance between `u` and `v` is defined as
- .. math::
- \\sum_i{w_i |u_i - v_i|^2}
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- sqeuclidean : double
- The squared Euclidean distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.sqeuclidean([1, 0, 0], [0, 1, 0])
- 2.0
- >>> distance.sqeuclidean([1, 1, 0], [0, 1, 0])
- 1.0
- """
- # Preserve float dtypes, but convert everything else to np.float64
- # for stability.
- utype, vtype = None, None
- if not (hasattr(u, "dtype") and np.issubdtype(u.dtype, np.inexact)):
- utype = np.float64
- if not (hasattr(v, "dtype") and np.issubdtype(v.dtype, np.inexact)):
- vtype = np.float64
- u = _validate_vector(u, dtype=utype)
- v = _validate_vector(v, dtype=vtype)
- u_v = u - v
- u_v_w = u_v # only want weights applied once
- if w is not None:
- w = _validate_weights(w)
- u_v_w = w * u_v
- return np.dot(u_v, u_v_w)
- def correlation(u, v, w=None, centered=True):
- """
- Compute the correlation distance between two 1-D arrays.
- The correlation distance between `u` and `v`, is
- defined as
- .. math::
- 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
- {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}
- where :math:`\\bar{u}` is the mean of the elements of `u`
- and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.
- Parameters
- ----------
- u : (N,) array_like of floats
- Input array.
- v : (N,) array_like of floats
- Input array.
- w : (N,) array_like of floats, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- centered : bool, optional
- If True, `u` and `v` will be centered. Default is True.
- Returns
- -------
- correlation : double
- The correlation distance between 1-D array `u` and `v`.
- Examples
- --------
- Find the correlation between two arrays.
- >>> from scipy.spatial.distance import correlation
- >>> correlation([1, 0, 1], [1, 1, 0])
- 1.5
- Using a weighting array, the correlation can be calculated as:
- >>> correlation([1, 0, 1], [1, 1, 0], w=[0.9, 0.1, 0.1])
- 1.1
- If centering is not needed, the correlation can be calculated as:
- >>> correlation([1, 0, 1], [1, 1, 0], centered=False)
- 0.5
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if np.iscomplexobj(u) or np.iscomplexobj(v):
- msg = "`u` and `v` must be real."
- raise TypeError(msg)
- if w is not None:
- w = _validate_weights(w)
- w = w / w.sum()
- if centered:
- if w is not None:
- umu = np.dot(u, w)
- vmu = np.dot(v, w)
- else:
- umu = np.mean(u)
- vmu = np.mean(v)
- u = u - umu
- v = v - vmu
- if w is not None:
- vw = v * w
- uw = u * w
- else:
- vw, uw = v, u
- uv = np.dot(u, vw)
- uu = np.dot(u, uw)
- vv = np.dot(v, vw)
- dist = 1.0 - uv / math.sqrt(uu * vv)
- # Clip the result to avoid rounding error
- return np.clip(dist, 0.0, 2.0)
- def cosine(u, v, w=None):
- """
- Compute the Cosine distance between 1-D arrays.
- The Cosine distance between `u` and `v`, is defined as
- .. math::
- 1 - \\frac{u \\cdot v}
- {\\|u\\|_2 \\|v\\|_2}.
- where :math:`u \\cdot v` is the dot product of :math:`u` and
- :math:`v`.
- Parameters
- ----------
- u : (N,) array_like of floats
- Input array.
- v : (N,) array_like of floats
- Input array.
- w : (N,) array_like of floats, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- cosine : double
- The Cosine distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.cosine([1, 0, 0], [0, 1, 0])
- 1.0
- >>> distance.cosine([100, 0, 0], [0, 1, 0])
- 1.0
- >>> distance.cosine([1, 1, 0], [0, 1, 0])
- 0.29289321881345254
- """
- # cosine distance is also referred to as 'uncentered correlation',
- # or 'reflective correlation'
- return correlation(u, v, w=w, centered=False)
- def hamming(u, v, w=None):
- """
- Compute the Hamming distance between two 1-D arrays.
- The Hamming distance between 1-D arrays `u` and `v`, is simply the
- proportion of disagreeing components in `u` and `v`. If `u` and `v` are
- boolean vectors, the Hamming distance is
- .. math::
- \\frac{c_{01} + c_{10}}{n}
- where :math:`c_{ij}` is the number of occurrences of
- :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
- :math:`k < n`.
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- hamming : double
- The Hamming distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.hamming([1, 0, 0], [0, 1, 0])
- 0.66666666666666663
- >>> distance.hamming([1, 0, 0], [1, 1, 0])
- 0.33333333333333331
- >>> distance.hamming([1, 0, 0], [2, 0, 0])
- 0.33333333333333331
- >>> distance.hamming([1, 0, 0], [3, 0, 0])
- 0.33333333333333331
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if u.shape != v.shape:
- raise ValueError('The 1d arrays must have equal lengths.')
- u_ne_v = u != v
- if w is not None:
- w = _validate_weights(w)
- if w.shape != u.shape:
- raise ValueError("'w' should have the same length as 'u' and 'v'.")
- w = w / w.sum()
- return np.dot(u_ne_v, w)
- return np.mean(u_ne_v)
- def jaccard(u, v, w=None):
- r"""
- Compute the Jaccard dissimilarity between two boolean vectors.
- Given boolean vectors :math:`u \equiv (u_1, \cdots, u_n)`
- and :math:`v \equiv (v_1, \cdots, v_n)` that are not both zero,
- their *Jaccard dissimilarity* is defined as ([1]_, p. 26)
- .. math::
- d_\textrm{jaccard}(u, v) := \frac{c_{10} + c_{01}}
- {c_{11} + c_{10} + c_{01}}
- where
- .. math::
- c_{ij} := \sum_{1 \le k \le n, u_k=i, v_k=j} 1
- for :math:`i, j \in \{ 0, 1\}`. If :math:`u` and :math:`v` are both zero,
- their Jaccard dissimilarity is defined to be zero. [2]_
- If a (non-negative) weight vector :math:`w \equiv (w_1, \cdots, w_n)`
- is supplied, the *weighted Jaccard dissimilarity* is defined similarly
- but with :math:`c_{ij}` replaced by
- .. math::
- \tilde{c}_{ij} := \sum_{1 \le k \le n, u_k=i, v_k=j} w_k
- Parameters
- ----------
- u : (N,) array_like of bools
- Input vector.
- v : (N,) array_like of bools
- Input vector.
- w : (N,) array_like of floats, optional
- Weights for each pair of :math:`(u_k, v_k)`. Default is ``None``,
- which gives each pair a weight of ``1.0``.
- Returns
- -------
- jaccard : float
- The Jaccard dissimilarity between vectors `u` and `v`, optionally
- weighted by `w` if supplied.
- Notes
- -----
- The Jaccard dissimilarity satisfies the triangle inequality and is
- qualified as a metric. [2]_
- The *Jaccard index*, or *Jaccard similarity coefficient*, is equal to
- one minus the Jaccard dissimilarity. [3]_
- The dissimilarity between general (finite) sets may be computed by
- encoding them as boolean vectors and computing the dissimilarity
- between the encoded vectors.
- For example, subsets :math:`A,B` of :math:`\{ 1, 2, ..., n \}` may be
- encoded into boolean vectors :math:`u, v` by setting
- :math:`u_k := 1_{k \in A}`, :math:`v_k := 1_{k \in B}`
- for :math:`k = 1,2,\cdots,n`.
- .. versionchanged:: 1.2.0
- Previously, if all (positively weighted) elements in `u` and `v` are
- zero, the function would return ``nan``. This was changed to return
- ``0`` instead.
- .. versionchanged:: 1.15.0
- Non-0/1 numeric input used to produce an ad hoc result. Since 1.15.0,
- numeric input is converted to Boolean before computation.
- References
- ----------
- .. [1] Kaufman, L. and Rousseeuw, P. J. (1990). "Finding Groups in Data:
- An Introduction to Cluster Analysis." John Wiley & Sons, Inc.
- .. [2] Kosub, S. (2019). "A note on the triangle inequality for the
- Jaccard distance." *Pattern Recognition Letters*, 120:36-38.
- .. [3] https://en.wikipedia.org/wiki/Jaccard_index
- Examples
- --------
- >>> from scipy.spatial import distance
- Non-zero vectors with no matching 1s have dissimilarity of 1.0:
- >>> distance.jaccard([1, 0, 0], [0, 1, 0])
- 1.0
- Vectors with some matching 1s have dissimilarity less than 1.0:
- >>> distance.jaccard([1, 0, 0, 0], [1, 1, 1, 0])
- 0.6666666666666666
- Identical vectors, including zero vectors, have dissimilarity of 0.0:
- >>> distance.jaccard([1, 0, 0], [1, 0, 0])
- 0.0
- >>> distance.jaccard([0, 0, 0], [0, 0, 0])
- 0.0
- The following example computes the dissimilarity from a confusion matrix
- directly by setting the weight vector to the frequency of True Positive,
- False Negative, False Positive, and True Negative:
- >>> distance.jaccard([1, 1, 0, 0], [1, 0, 1, 0], [31, 41, 59, 26])
- 0.7633587786259542 # (41+59)/(31+41+59)
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- unequal = np.bitwise_xor(u != 0, v != 0)
- nonzero = np.bitwise_or(u != 0, v != 0)
- if w is not None:
- w = _validate_weights(w)
- unequal = w * unequal
- nonzero = w * nonzero
- a = np.float64(unequal.sum())
- b = np.float64(nonzero.sum())
- return (a / b) if b != 0 else np.float64(0)
- def seuclidean(u, v, V):
- """
- Return the standardized Euclidean distance between two 1-D arrays.
- The standardized Euclidean distance between two n-vectors `u` and `v` is
- .. math::
- \\sqrt{\\sum\\limits_i \\frac{1}{V_i} \\left(u_i-v_i \\right)^2}
- ``V`` is the variance vector; ``V[I]`` is the variance computed over all the i-th
- components of the points. If not passed, it is automatically computed.
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- V : (N,) array_like
- `V` is a 1-D array of component variances. It is usually computed
- among a larger collection of vectors.
- Returns
- -------
- seuclidean : double
- The standardized Euclidean distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [0.1, 0.1, 0.1])
- 4.4721359549995796
- >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [1, 0.1, 0.1])
- 3.3166247903553998
- >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [10, 0.1, 0.1])
- 3.1780497164141406
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- V = _validate_vector(V, dtype=np.float64)
- if V.shape[0] != u.shape[0] or u.shape[0] != v.shape[0]:
- raise TypeError('V must be a 1-D array of the same dimension '
- 'as u and v.')
- return euclidean(u, v, w=1/V)
- def cityblock(u, v, w=None):
- """
- Compute the City Block (Manhattan) distance.
- Computes the Manhattan distance between two 1-D arrays `u` and `v`,
- which is defined as
- .. math::
- \\sum_i {\\left| u_i - v_i \\right|}.
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- cityblock : double
- The City Block (Manhattan) distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.cityblock([1, 0, 0], [0, 1, 0])
- 2
- >>> distance.cityblock([1, 0, 0], [0, 2, 0])
- 3
- >>> distance.cityblock([1, 0, 0], [1, 1, 0])
- 1
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- l1_diff = abs(u - v)
- if w is not None:
- w = _validate_weights(w)
- l1_diff = w * l1_diff
- return l1_diff.sum()
- def mahalanobis(u, v, VI):
- """
- Compute the Mahalanobis distance between two 1-D arrays.
- The Mahalanobis distance between 1-D arrays `u` and `v`, is defined as
- .. math::
- \\sqrt{ (u-v) V^{-1} (u-v)^T }
- where ``V`` is the covariance matrix. Note that the argument `VI`
- is the inverse of ``V``.
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- VI : array_like
- The inverse of the covariance matrix.
- Returns
- -------
- mahalanobis : double
- The Mahalanobis distance between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
- >>> distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
- 1.0
- >>> distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
- 1.0
- >>> distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
- 1.7320508075688772
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- VI = np.atleast_2d(VI)
- delta = u - v
- m = np.dot(np.dot(delta, VI), delta)
- return np.sqrt(m)
- def chebyshev(u, v, w=None):
- r"""
- Compute the Chebyshev distance.
- The *Chebyshev distance* between real vectors
- :math:`u \equiv (u_1, \cdots, u_n)` and
- :math:`v \equiv (v_1, \cdots, v_n)` is defined as [1]_
- .. math::
- d_\textrm{chebyshev}(u,v) := \max_{1 \le i \le n} |u_i-v_i|
- If a (non-negative) weight vector :math:`w \equiv (w_1, \cdots, w_n)`
- is supplied, the *weighted Chebyshev distance* is defined to be the
- weighted Minkowski distance of infinite order; that is,
- .. math::
- \begin{align}
- d_\textrm{chebyshev}(u,v;w) &:= \lim_{p\rightarrow \infty}
- \left( \sum_{i=1}^n w_i | u_i-v_i |^p \right)^\frac{1}{p} \\
- &= \max_{1 \le i \le n} 1_{w_i > 0} | u_i - v_i |
- \end{align}
- Parameters
- ----------
- u : (N,) array_like of floats
- Input vector.
- v : (N,) array_like of floats
- Input vector.
- w : (N,) array_like of floats, optional
- Weight vector. Default is ``None``, which gives all pairs
- :math:`(u_i, v_i)` the same weight ``1.0``.
- Returns
- -------
- chebyshev : float
- The Chebyshev distance between vectors `u` and `v`, optionally weighted
- by `w`.
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Chebyshev_distance
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.chebyshev([1, 0, 0], [0, 1, 0])
- 1
- >>> distance.chebyshev([1, 1, 0], [0, 1, 0])
- 1
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if w is not None:
- w = _validate_weights(w)
- return max((w > 0) * abs(u - v))
- return max(abs(u - v))
- def braycurtis(u, v, w=None):
- """
- Compute the Bray-Curtis distance between two 1-D arrays.
- Bray-Curtis distance is defined as
- .. math::
- \\sum{|u_i-v_i|} / \\sum{|u_i+v_i|}
- The Bray-Curtis distance is in the range [0, 1] if all coordinates are
- positive, and is undefined if the inputs are of length zero.
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- braycurtis : double
- The Bray-Curtis distance between 1-D arrays `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.braycurtis([1, 0, 0], [0, 1, 0])
- 1.0
- >>> distance.braycurtis([1, 1, 0], [0, 1, 0])
- 0.33333333333333331
- """
- u = _validate_vector(u)
- v = _validate_vector(v, dtype=np.float64)
- l1_diff = abs(u - v)
- l1_sum = abs(u + v)
- if w is not None:
- w = _validate_weights(w)
- l1_diff = w * l1_diff
- l1_sum = w * l1_sum
- return l1_diff.sum() / l1_sum.sum()
- def canberra(u, v, w=None):
- """
- Compute the Canberra distance between two 1-D arrays.
- The Canberra distance is defined as
- .. math::
- d(u,v) = \\sum_i \\frac{|u_i-v_i|}
- {|u_i|+|v_i|}.
- Parameters
- ----------
- u : (N,) array_like
- Input array.
- v : (N,) array_like
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- canberra : double
- The Canberra distance between vectors `u` and `v`.
- Notes
- -----
- When ``u[i]`` and ``v[i]`` are 0 for given i, then the fraction 0/0 = 0 is
- used in the calculation.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.canberra([1, 0, 0], [0, 1, 0])
- 2.0
- >>> distance.canberra([1, 1, 0], [0, 1, 0])
- 1.0
- """
- u = _validate_vector(u)
- v = _validate_vector(v, dtype=np.float64)
- if w is not None:
- w = _validate_weights(w)
- with np.errstate(invalid='ignore'):
- abs_uv = abs(u - v)
- abs_u = abs(u)
- abs_v = abs(v)
- d = abs_uv / (abs_u + abs_v)
- if w is not None:
- d = w * d
- d = np.nansum(d)
- return d
- def jensenshannon(p, q, base=None, *, axis=0, keepdims=False):
- """
- Compute the Jensen-Shannon distance (metric) between
- two probability arrays. This is the square root
- of the Jensen-Shannon divergence.
- The Jensen-Shannon distance between two probability
- vectors `p` and `q` is defined as,
- .. math::
- \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}
- where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
- and :math:`D` is the Kullback-Leibler divergence.
- This routine will normalize `p` and `q` if they don't sum to 1.0.
- Parameters
- ----------
- p : (N,) array_like
- left probability vector
- q : (N,) array_like
- right probability vector
- base : double, optional
- the base of the logarithm used to compute the output
- if not given, then the routine uses the default base of
- scipy.stats.entropy.
- axis : int, optional
- Axis along which the Jensen-Shannon distances are computed. The default
- is 0.
- .. versionadded:: 1.7.0
- keepdims : bool, optional
- If this is set to `True`, the reduced axes are left in the
- result as dimensions with size one. With this option,
- the result will broadcast correctly against the input array.
- Default is False.
- .. versionadded:: 1.7.0
- Returns
- -------
- js : double or ndarray
- The Jensen-Shannon distances between `p` and `q` along the `axis`.
- Notes
- -----
- .. versionadded:: 1.2.0
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> import numpy as np
- >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
- 1.0
- >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
- 0.46450140402245893
- >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
- 0.0
- >>> a = np.array([[1, 2, 3, 4],
- ... [5, 6, 7, 8],
- ... [9, 10, 11, 12]])
- >>> b = np.array([[13, 14, 15, 16],
- ... [17, 18, 19, 20],
- ... [21, 22, 23, 24]])
- >>> distance.jensenshannon(a, b, axis=0)
- array([0.1954288, 0.1447697, 0.1138377, 0.0927636])
- >>> distance.jensenshannon(a, b, axis=1)
- array([0.1402339, 0.0399106, 0.0201815])
- """
- p = np.asarray(p)
- q = np.asarray(q)
- p = p / np.sum(p, axis=axis, keepdims=True)
- q = q / np.sum(q, axis=axis, keepdims=True)
- m = (p + q) / 2.0
- left = rel_entr(p, m)
- right = rel_entr(q, m)
- left_sum = np.sum(left, axis=axis, keepdims=keepdims)
- right_sum = np.sum(right, axis=axis, keepdims=keepdims)
- js = left_sum + right_sum
- if base is not None:
- js /= np.log(base)
- return np.sqrt(js / 2.0)
- def yule(u, v, w=None):
- """
- Compute the Yule dissimilarity between two boolean 1-D arrays.
- The Yule dissimilarity is defined as
- .. math::
- \\frac{R}{c_{TT} * c_{FF} + \\frac{R}{2}}
- where :math:`c_{ij}` is the number of occurrences of
- :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
- :math:`k < n` and :math:`R = 2.0 * c_{TF} * c_{FT}`.
- Parameters
- ----------
- u : (N,) array_like, bool
- Input array.
- v : (N,) array_like, bool
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- yule : double
- The Yule dissimilarity between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.yule([1, 0, 0], [0, 1, 0])
- 2.0
- >>> distance.yule([1, 1, 0], [0, 1, 0])
- 0.0
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if w is not None:
- w = _validate_weights(w)
- (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v, w=w)
- half_R = ntf * nft
- if half_R == 0:
- return 0.0
- else:
- return float(2.0 * half_R / (ntt * nff + half_R))
- def dice(u, v, w=None):
- """
- Compute the Dice dissimilarity between two boolean 1-D arrays.
- The Dice dissimilarity between `u` and `v`, is
- .. math::
- \\frac{c_{TF} + c_{FT}}
- {2c_{TT} + c_{FT} + c_{TF}}
- where :math:`c_{ij}` is the number of occurrences of
- :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
- :math:`k < n`.
- Parameters
- ----------
- u : (N,) array_like, bool
- Input 1-D array.
- v : (N,) array_like, bool
- Input 1-D array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- dice : double
- The Dice dissimilarity between 1-D arrays `u` and `v`.
- Notes
- -----
- This function computes the Dice dissimilarity index. To compute the
- Dice similarity index, convert one to the other with similarity =
- 1 - dissimilarity.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.dice([1, 0, 0], [0, 1, 0])
- 1.0
- >>> distance.dice([1, 0, 0], [1, 1, 0])
- 0.3333333333333333
- >>> distance.dice([1, 0, 0], [2, 0, 0])
- -0.3333333333333333
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if w is not None:
- w = _validate_weights(w)
- if u.dtype == v.dtype == bool and w is None:
- ntt = (u & v).sum()
- else:
- dtype = np.result_type(int, u.dtype, v.dtype)
- u = u.astype(dtype)
- v = v.astype(dtype)
- if w is None:
- ntt = (u * v).sum()
- else:
- ntt = (u * v * w).sum()
- (nft, ntf) = _nbool_correspond_ft_tf(u, v, w=w)
- return float((ntf + nft) / np.array(2.0 * ntt + ntf + nft))
- def rogerstanimoto(u, v, w=None):
- """
- Compute the Rogers-Tanimoto dissimilarity between two boolean 1-D arrays.
- The Rogers-Tanimoto dissimilarity between two boolean 1-D arrays
- `u` and `v`, is defined as
- .. math::
- \\frac{R}
- {c_{TT} + c_{FF} + R}
- where :math:`c_{ij}` is the number of occurrences of
- :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
- :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.
- Parameters
- ----------
- u : (N,) array_like, bool
- Input array.
- v : (N,) array_like, bool
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- rogerstanimoto : double
- The Rogers-Tanimoto dissimilarity between vectors
- `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.rogerstanimoto([1, 0, 0], [0, 1, 0])
- 0.8
- >>> distance.rogerstanimoto([1, 0, 0], [1, 1, 0])
- 0.5
- >>> distance.rogerstanimoto([1, 0, 0], [2, 0, 0])
- -1.0
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if w is not None:
- w = _validate_weights(w)
- (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v, w=w)
- return float(2.0 * (ntf + nft)) / float(ntt + nff + (2.0 * (ntf + nft)))
- def russellrao(u, v, w=None):
- """
- Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.
- The Russell-Rao dissimilarity between two boolean 1-D arrays, `u` and
- `v`, is defined as
- .. math::
- \\frac{n - c_{TT}}
- {n}
- where :math:`c_{ij}` is the number of occurrences of
- :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
- :math:`k < n`.
- Parameters
- ----------
- u : (N,) array_like, bool
- Input array.
- v : (N,) array_like, bool
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- russellrao : double
- The Russell-Rao dissimilarity between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.russellrao([1, 0, 0], [0, 1, 0])
- 1.0
- >>> distance.russellrao([1, 0, 0], [1, 1, 0])
- 0.6666666666666666
- >>> distance.russellrao([1, 0, 0], [2, 0, 0])
- 0.3333333333333333
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if u.dtype == v.dtype == bool and w is None:
- ntt = (u & v).sum()
- n = float(len(u))
- elif w is None:
- ntt = (u * v).sum()
- n = float(len(u))
- else:
- w = _validate_weights(w)
- ntt = (u * v * w).sum()
- n = w.sum()
- return float(n - ntt) / n
- def sokalsneath(u, v, w=None):
- """
- Compute the Sokal-Sneath dissimilarity between two boolean 1-D arrays.
- The Sokal-Sneath dissimilarity between `u` and `v`,
- .. math::
- \\frac{R}
- {c_{TT} + R}
- where :math:`c_{ij}` is the number of occurrences of
- :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
- :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.
- Parameters
- ----------
- u : (N,) array_like, bool
- Input array.
- v : (N,) array_like, bool
- Input array.
- w : (N,) array_like, optional
- The weights for each value in `u` and `v`. Default is None,
- which gives each value a weight of 1.0
- Returns
- -------
- sokalsneath : double
- The Sokal-Sneath dissimilarity between vectors `u` and `v`.
- Examples
- --------
- >>> from scipy.spatial import distance
- >>> distance.sokalsneath([1, 0, 0], [0, 1, 0])
- 1.0
- >>> distance.sokalsneath([1, 0, 0], [1, 1, 0])
- 0.66666666666666663
- >>> distance.sokalsneath([1, 0, 0], [2, 1, 0])
- 0.0
- >>> distance.sokalsneath([1, 0, 0], [3, 1, 0])
- -2.0
- """
- u = _validate_vector(u)
- v = _validate_vector(v)
- if u.dtype == v.dtype == bool and w is None:
- ntt = (u & v).sum()
- elif w is None:
- ntt = (u * v).sum()
- else:
- w = _validate_weights(w)
- ntt = (u * v * w).sum()
- (nft, ntf) = _nbool_correspond_ft_tf(u, v, w=w)
- denom = np.array(ntt + 2.0 * (ntf + nft))
- if not denom.any():
- raise ValueError('Sokal-Sneath dissimilarity is not defined for '
- 'vectors that are entirely false.')
- return float(2.0 * (ntf + nft)) / denom
- _convert_to_double = partial(_convert_to_type, out_type=np.float64)
- _convert_to_bool = partial(_convert_to_type, out_type=bool)
- # adding python-only wrappers to _distance_wrap module
- _distance_wrap.pdist_correlation_double_wrap = _correlation_pdist_wrap
- _distance_wrap.cdist_correlation_double_wrap = _correlation_cdist_wrap
- @dataclasses.dataclass(frozen=True)
- class CDistMetricWrapper:
- metric_name: str
- def __call__(self, XA, XB, *, out=None, **kwargs):
- XA = np.ascontiguousarray(XA)
- XB = np.ascontiguousarray(XB)
- mA, n = XA.shape
- mB, _ = XB.shape
- metric_name = self.metric_name
- metric_info = _METRICS[metric_name]
- XA, XB, typ, kwargs = _validate_cdist_input(
- XA, XB, mA, mB, n, metric_info, **kwargs)
- w = kwargs.pop('w', None)
- if w is not None:
- metric = metric_info.dist_func
- return _cdist_callable(
- XA, XB, metric=metric, out=out, w=w, **kwargs)
- dm = _prepare_out_argument(out, np.float64, (mA, mB))
- # get cdist wrapper
- cdist_fn = getattr(_distance_wrap, f'cdist_{metric_name}_{typ}_wrap')
- cdist_fn(XA, XB, dm, **kwargs)
- return dm
- @dataclasses.dataclass(frozen=True)
- class PDistMetricWrapper:
- metric_name: str
- def __call__(self, X, *, out=None, **kwargs):
- X = np.ascontiguousarray(X)
- m, n = X.shape
- metric_name = self.metric_name
- metric_info = _METRICS[metric_name]
- X, typ, kwargs = _validate_pdist_input(
- X, m, n, metric_info, **kwargs)
- out_size = (m * (m - 1)) // 2
- w = kwargs.pop('w', None)
- if w is not None:
- metric = metric_info.dist_func
- return _pdist_callable(
- X, metric=metric, out=out, w=w, **kwargs)
- dm = _prepare_out_argument(out, np.float64, (out_size,))
- # get pdist wrapper
- pdist_fn = getattr(_distance_wrap, f'pdist_{metric_name}_{typ}_wrap')
- pdist_fn(X, dm, **kwargs)
- return dm
- @dataclasses.dataclass(frozen=True)
- class MetricInfo:
- # Name of python distance function
- canonical_name: str
- # All aliases, including canonical_name
- aka: set[str]
- # unvectorized distance function
- dist_func: Callable
- # Optimized cdist function
- cdist_func: Callable
- # Optimized pdist function
- pdist_func: Callable
- # function that checks kwargs and computes default values:
- # f(X, m, n, **kwargs)
- validator: Callable | None = None
- # list of supported types:
- # X (pdist) and XA (cdist) are used to choose the type. if there is no
- # match the first type is used. Default double
- types: list[str] = dataclasses.field(default_factory=lambda: ['double'])
- # true if out array must be C-contiguous
- requires_contiguous_out: bool = True
- # Registry of implemented metrics:
- _METRIC_INFOS = [
- MetricInfo(
- canonical_name='braycurtis',
- aka={'braycurtis'},
- dist_func=braycurtis,
- cdist_func=_distance_pybind.cdist_braycurtis,
- pdist_func=_distance_pybind.pdist_braycurtis,
- ),
- MetricInfo(
- canonical_name='canberra',
- aka={'canberra'},
- dist_func=canberra,
- cdist_func=_distance_pybind.cdist_canberra,
- pdist_func=_distance_pybind.pdist_canberra,
- ),
- MetricInfo(
- canonical_name='chebyshev',
- aka={'chebychev', 'chebyshev', 'cheby', 'cheb', 'ch'},
- dist_func=chebyshev,
- cdist_func=_distance_pybind.cdist_chebyshev,
- pdist_func=_distance_pybind.pdist_chebyshev,
- ),
- MetricInfo(
- canonical_name='cityblock',
- aka={'cityblock', 'cblock', 'cb', 'c'},
- dist_func=cityblock,
- cdist_func=_distance_pybind.cdist_cityblock,
- pdist_func=_distance_pybind.pdist_cityblock,
- ),
- MetricInfo(
- canonical_name='correlation',
- aka={'correlation', 'co'},
- dist_func=correlation,
- cdist_func=CDistMetricWrapper('correlation'),
- pdist_func=PDistMetricWrapper('correlation'),
- ),
- MetricInfo(
- canonical_name='cosine',
- aka={'cosine', 'cos'},
- dist_func=cosine,
- cdist_func=CDistMetricWrapper('cosine'),
- pdist_func=PDistMetricWrapper('cosine'),
- ),
- MetricInfo(
- canonical_name='dice',
- aka={'dice'},
- types=['bool'],
- dist_func=dice,
- cdist_func=_distance_pybind.cdist_dice,
- pdist_func=_distance_pybind.pdist_dice,
- ),
- MetricInfo(
- canonical_name='euclidean',
- aka={'euclidean', 'euclid', 'eu', 'e'},
- dist_func=euclidean,
- cdist_func=_distance_pybind.cdist_euclidean,
- pdist_func=_distance_pybind.pdist_euclidean,
- ),
- MetricInfo(
- canonical_name='hamming',
- aka={'matching', 'hamming', 'hamm', 'ha', 'h'},
- types=['double', 'bool'],
- validator=_validate_hamming_kwargs,
- dist_func=hamming,
- cdist_func=_distance_pybind.cdist_hamming,
- pdist_func=_distance_pybind.pdist_hamming,
- ),
- MetricInfo(
- canonical_name='jaccard',
- aka={'jaccard', 'jacc', 'ja', 'j'},
- types=['double', 'bool'],
- dist_func=jaccard,
- cdist_func=_distance_pybind.cdist_jaccard,
- pdist_func=_distance_pybind.pdist_jaccard,
- ),
- MetricInfo(
- canonical_name='jensenshannon',
- aka={'jensenshannon', 'js'},
- dist_func=jensenshannon,
- cdist_func=CDistMetricWrapper('jensenshannon'),
- pdist_func=PDistMetricWrapper('jensenshannon'),
- ),
- MetricInfo(
- canonical_name='mahalanobis',
- aka={'mahalanobis', 'mahal', 'mah'},
- validator=_validate_mahalanobis_kwargs,
- dist_func=mahalanobis,
- cdist_func=CDistMetricWrapper('mahalanobis'),
- pdist_func=PDistMetricWrapper('mahalanobis'),
- ),
- MetricInfo(
- canonical_name='minkowski',
- aka={'minkowski', 'mi', 'm', 'pnorm'},
- validator=_validate_minkowski_kwargs,
- dist_func=minkowski,
- cdist_func=_distance_pybind.cdist_minkowski,
- pdist_func=_distance_pybind.pdist_minkowski,
- ),
- MetricInfo(
- canonical_name='rogerstanimoto',
- aka={'rogerstanimoto'},
- types=['bool'],
- dist_func=rogerstanimoto,
- cdist_func=_distance_pybind.cdist_rogerstanimoto,
- pdist_func=_distance_pybind.pdist_rogerstanimoto,
- ),
- MetricInfo(
- canonical_name='russellrao',
- aka={'russellrao'},
- types=['bool'],
- dist_func=russellrao,
- cdist_func=_distance_pybind.cdist_russellrao,
- pdist_func=_distance_pybind.pdist_russellrao,
- ),
- MetricInfo(
- canonical_name='seuclidean',
- aka={'seuclidean', 'se', 's'},
- validator=_validate_seuclidean_kwargs,
- dist_func=seuclidean,
- cdist_func=CDistMetricWrapper('seuclidean'),
- pdist_func=PDistMetricWrapper('seuclidean'),
- ),
- MetricInfo(
- canonical_name='sokalsneath',
- aka={'sokalsneath'},
- types=['bool'],
- dist_func=sokalsneath,
- cdist_func=_distance_pybind.cdist_sokalsneath,
- pdist_func=_distance_pybind.pdist_sokalsneath,
- ),
- MetricInfo(
- canonical_name='sqeuclidean',
- aka={'sqeuclidean', 'sqe', 'sqeuclid'},
- dist_func=sqeuclidean,
- cdist_func=_distance_pybind.cdist_sqeuclidean,
- pdist_func=_distance_pybind.pdist_sqeuclidean,
- ),
- MetricInfo(
- canonical_name='yule',
- aka={'yule'},
- types=['bool'],
- dist_func=yule,
- cdist_func=_distance_pybind.cdist_yule,
- pdist_func=_distance_pybind.pdist_yule,
- ),
- ]
- _METRICS = {info.canonical_name: info for info in _METRIC_INFOS}
- _METRIC_ALIAS = {alias: info
- for info in _METRIC_INFOS
- for alias in info.aka}
- _METRICS_NAMES = list(_METRICS.keys())
- _TEST_METRICS = {'test_' + info.canonical_name: info for info in _METRIC_INFOS}
- def pdist(X, metric='euclidean', *, out=None, **kwargs):
- """
- Pairwise distances between observations in n-dimensional space.
- See Notes for common calling conventions.
- Parameters
- ----------
- X : array_like
- An m by n array of m original observations in an
- n-dimensional space.
- metric : str or function, optional
- The distance metric to use. The distance function can
- be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
- 'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
- 'jaccard', 'jensenshannon',
- 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
- 'russellrao', 'seuclidean', 'sokalsneath',
- 'sqeuclidean', 'yule'.
- out : ndarray, optional
- The output array.
- If not None, condensed distance matrix Y is stored in this array.
- **kwargs : dict, optional
- Extra arguments to `metric`: refer to each metric documentation for a
- list of all possible arguments.
- Some possible arguments:
- p : scalar
- The p-norm to apply for Minkowski, weighted and unweighted.
- Default: 2.
- w : ndarray
- The weight vector for metrics that support weights (e.g., Minkowski).
- V : ndarray
- The variance vector for standardized Euclidean.
- Default: var(X, axis=0, ddof=1)
- VI : ndarray
- The inverse of the covariance matrix for Mahalanobis.
- Default: inv(cov(X.T)).T
- Returns
- -------
- Y : ndarray
- Returns a condensed distance matrix Y. For each :math:`i` and :math:`j`
- (where :math:`i<j<m`),where m is the number of original observations.
- The metric ``dist(u=X[i], v=X[j])`` is computed and stored in entry ``m
- * i + j - ((i + 2) * (i + 1)) // 2``.
- See Also
- --------
- squareform : converts between condensed distance matrices and
- square distance matrices.
- Notes
- -----
- See ``squareform`` for information on how to calculate the index of
- this entry or to convert the condensed distance matrix to a
- redundant square matrix.
- The following are common calling conventions.
- 1. ``Y = pdist(X, 'euclidean')``
- Computes the distance between m points using Euclidean distance
- (2-norm) as the distance metric between the points. The points
- are arranged as m n-dimensional row vectors in the matrix X.
- 2. ``Y = pdist(X, 'minkowski', p=2.)``
- Computes the distances using the Minkowski distance
- :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
- that this is only a quasi-metric if :math:`0 < p < 1`).
- 3. ``Y = pdist(X, 'cityblock')``
- Computes the city block or Manhattan distance between the
- points.
- 4. ``Y = pdist(X, 'seuclidean', V=None)``
- Computes the standardized Euclidean distance. The standardized
- Euclidean distance between two n-vectors ``u`` and ``v`` is
- .. math::
- \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}
- V is the variance vector; V[i] is the variance computed over all
- the i'th components of the points. If not passed, it is
- automatically computed.
- 5. ``Y = pdist(X, 'sqeuclidean')``
- Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
- the vectors.
- 6. ``Y = pdist(X, 'cosine')``
- Computes the cosine distance between vectors u and v,
- .. math::
- 1 - \\frac{u \\cdot v}
- {{\\|u\\|}_2 {\\|v\\|}_2}
- where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
- :math:`u \\cdot v` is the dot product of ``u`` and ``v``.
- 7. ``Y = pdist(X, 'correlation')``
- Computes the correlation distance between vectors u and v. This is
- .. math::
- 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
- {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}
- where :math:`\\bar{v}` is the mean of the elements of vector v,
- and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.
- 8. ``Y = pdist(X, 'hamming')``
- Computes the normalized Hamming distance, or the proportion of
- those vector elements between two n-vectors ``u`` and ``v``
- which disagree. To save memory, the matrix ``X`` can be of type
- boolean.
- 9. ``Y = pdist(X, 'jaccard')``
- Computes the Jaccard distance between the points. Given two
- vectors, ``u`` and ``v``, the Jaccard distance is the
- proportion of those elements ``u[i]`` and ``v[i]`` that
- disagree.
- 10. ``Y = pdist(X, 'jensenshannon')``
- Computes the Jensen-Shannon distance between two probability arrays.
- Given two probability vectors, :math:`p` and :math:`q`, the
- Jensen-Shannon distance is
- .. math::
- \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}
- where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
- and :math:`D` is the Kullback-Leibler divergence.
- 11. ``Y = pdist(X, 'chebyshev')``
- Computes the Chebyshev distance between the points. The
- Chebyshev distance between two n-vectors ``u`` and ``v`` is the
- maximum norm-1 distance between their respective elements. More
- precisely, the distance is given by
- .. math::
- d(u,v) = \\max_i {|u_i-v_i|}
- 12. ``Y = pdist(X, 'canberra')``
- Computes the Canberra distance between the points. The
- Canberra distance between two points ``u`` and ``v`` is
- .. math::
- d(u,v) = \\sum_i \\frac{|u_i-v_i|}
- {|u_i|+|v_i|}
- 13. ``Y = pdist(X, 'braycurtis')``
- Computes the Bray-Curtis distance between the points. The
- Bray-Curtis distance between two points ``u`` and ``v`` is
- .. math::
- d(u,v) = \\frac{\\sum_i {|u_i-v_i|}}
- {\\sum_i {|u_i+v_i|}}
- 14. ``Y = pdist(X, 'mahalanobis', VI=None)``
- Computes the Mahalanobis distance between the points. The
- Mahalanobis distance between two points ``u`` and ``v`` is
- :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
- variable) is the inverse covariance. If ``VI`` is not None,
- ``VI`` will be used as the inverse covariance matrix.
- 15. ``Y = pdist(X, 'yule')``
- Computes the Yule distance between each pair of boolean
- vectors. (see yule function documentation)
- 16. ``Y = pdist(X, 'matching')``
- Synonym for 'hamming'.
- 17. ``Y = pdist(X, 'dice')``
- Computes the Dice distance between each pair of boolean
- vectors. (see dice function documentation)
- 18. ``Y = pdist(X, 'rogerstanimoto')``
- Computes the Rogers-Tanimoto distance between each pair of
- boolean vectors. (see rogerstanimoto function documentation)
- 19. ``Y = pdist(X, 'russellrao')``
- Computes the Russell-Rao distance between each pair of
- boolean vectors. (see russellrao function documentation)
- 20. ``Y = pdist(X, 'sokalsneath')``
- Computes the Sokal-Sneath distance between each pair of
- boolean vectors. (see sokalsneath function documentation)
- 21. ``Y = pdist(X, f)``
- Computes the distance between all pairs of vectors in X
- using the user supplied 2-arity function f. For example,
- Euclidean distance between the vectors could be computed
- as follows::
- dm = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))
- Note that you should avoid passing a reference to one of
- the distance functions defined in this library. For example,::
- dm = pdist(X, sokalsneath)
- would calculate the pair-wise distances between the vectors in
- X using the Python function sokalsneath. This would result in
- sokalsneath being called :math:`{n \\choose 2}` times, which
- is inefficient. Instead, the optimized C version is more
- efficient, and we call it using the following syntax.::
- dm = pdist(X, 'sokalsneath')
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.spatial.distance import pdist
- ``x`` is an array of five points in three-dimensional space.
- >>> x = np.array([[2, 0, 2], [2, 2, 3], [-2, 4, 5], [0, 1, 9], [2, 2, 4]])
- ``pdist(x)`` with no additional arguments computes the 10 pairwise
- Euclidean distances:
- >>> pdist(x)
- array([2.23606798, 6.40312424, 7.34846923, 2.82842712, 4.89897949,
- 6.40312424, 1. , 5.38516481, 4.58257569, 5.47722558])
- The following computes the pairwise Minkowski distances with ``p = 3.5``:
- >>> pdist(x, metric='minkowski', p=3.5)
- array([2.04898923, 5.1154929 , 7.02700737, 2.43802731, 4.19042714,
- 6.03956994, 1. , 4.45128103, 4.10636143, 5.0619695 ])
- The pairwise city block or Manhattan distances:
- >>> pdist(x, metric='cityblock')
- array([ 3., 11., 10., 4., 8., 9., 1., 9., 7., 8.])
- """
- # You can also call this as:
- # Y = pdist(X, 'test_abc')
- # where 'abc' is the metric being tested. This computes the distance
- # between all pairs of vectors in X using the distance metric 'abc' but
- # with a more succinct, verifiable, but less efficient implementation.
- X = _asarray(X)
- if X.ndim != 2:
- raise ValueError('A 2-dimensional array must be passed. '
- f'(Shape was {X.shape}).')
- n = X.shape[0]
- return xpx.lazy_apply(_np_pdist, X, out,
- # lazy_apply doesn't support Array kwargs
- kwargs.pop('w', None),
- kwargs.pop('V', None),
- kwargs.pop('VI', None),
- # See src/distance_pybind.cpp::pdist
- shape=((n * (n - 1)) // 2, ), dtype=X.dtype,
- as_numpy=True, metric=metric, **kwargs)
- def _np_pdist(X, out, w, V, VI, metric='euclidean', **kwargs):
- X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,
- check_finite=False)
- m, n = X.shape
- if w is not None:
- kwargs["w"] = w
- if V is not None:
- kwargs["V"] = V
- if VI is not None:
- kwargs["VI"] = VI
- if callable(metric):
- mstr = getattr(metric, '__name__', 'UnknownCustomMetric')
- metric_info = _METRIC_ALIAS.get(mstr, None)
- if metric_info is not None:
- X, typ, kwargs = _validate_pdist_input(
- X, m, n, metric_info, **kwargs)
- return _pdist_callable(X, metric=metric, out=out, **kwargs)
- elif isinstance(metric, str):
- mstr = metric.lower()
- metric_info = _METRIC_ALIAS.get(mstr, None)
- if metric_info is not None:
- pdist_fn = metric_info.pdist_func
- return pdist_fn(X, out=out, **kwargs)
- elif mstr.startswith("test_"):
- metric_info = _TEST_METRICS.get(mstr, None)
- if metric_info is None:
- raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}')
- X, typ, kwargs = _validate_pdist_input(
- X, m, n, metric_info, **kwargs)
- return _pdist_callable(
- X, metric=metric_info.dist_func, out=out, **kwargs)
- else:
- raise ValueError(f'Unknown Distance Metric: {mstr}')
- else:
- raise TypeError('2nd argument metric must be a string identifier '
- 'or a function.')
- def squareform(X, force="no", checks=True):
- """
- Convert a vector-form distance vector to a square-form distance
- matrix, and vice-versa.
- Parameters
- ----------
- X : array_like
- Either a condensed or redundant distance matrix.
- force : str, optional
- As with MATLAB(TM), if force is equal to ``'tovector'`` or
- ``'tomatrix'``, the input will be treated as a distance matrix or
- distance vector respectively.
- checks : bool, optional
- If set to False, no checks will be made for matrix
- symmetry nor zero diagonals. This is useful if it is known that
- ``X - X.T1`` is small and ``diag(X)`` is close to zero.
- These values are ignored any way so they do not disrupt the
- squareform transformation.
- Returns
- -------
- Y : ndarray
- If a condensed distance matrix is passed, a redundant one is
- returned, or if a redundant one is passed, a condensed distance
- matrix is returned.
- Notes
- -----
- 1. ``v = squareform(X)``
- Given a square n-by-n symmetric distance matrix ``X``,
- ``v = squareform(X)`` returns an ``n * (n-1) / 2``
- (i.e. binomial coefficient n choose 2) sized vector `v`
- where :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
- is the distance between distinct points ``i`` and ``j``.
- If ``X`` is non-square or asymmetric, an error is raised.
- 2. ``X = squareform(v)``
- Given an ``n * (n-1) / 2`` sized vector ``v``
- for some integer ``n >= 1`` encoding distances as described,
- ``X = squareform(v)`` returns an n-by-n distance matrix ``X``.
- The ``X[i, j]`` and ``X[j, i]`` values are set to
- :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
- and all diagonal elements are zero.
- In SciPy 0.19.0, ``squareform`` stopped casting all input types to
- float64, and started returning arrays of the same dtype as the input.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.spatial.distance import pdist, squareform
- ``x`` is an array of five points in three-dimensional space.
- >>> x = np.array([[2, 0, 2], [2, 2, 3], [-2, 4, 5], [0, 1, 9], [2, 2, 4]])
- ``pdist(x)`` computes the Euclidean distances between each pair of
- points in ``x``. The distances are returned in a one-dimensional
- array with length ``5*(5 - 1)/2 = 10``.
- >>> distvec = pdist(x)
- >>> distvec
- array([2.23606798, 6.40312424, 7.34846923, 2.82842712, 4.89897949,
- 6.40312424, 1. , 5.38516481, 4.58257569, 5.47722558])
- ``squareform(distvec)`` returns the 5x5 distance matrix.
- >>> m = squareform(distvec)
- >>> m
- array([[0. , 2.23606798, 6.40312424, 7.34846923, 2.82842712],
- [2.23606798, 0. , 4.89897949, 6.40312424, 1. ],
- [6.40312424, 4.89897949, 0. , 5.38516481, 4.58257569],
- [7.34846923, 6.40312424, 5.38516481, 0. , 5.47722558],
- [2.82842712, 1. , 4.58257569, 5.47722558, 0. ]])
- When given a square distance matrix ``m``, ``squareform(m)`` returns
- the one-dimensional condensed distance vector associated with the
- matrix. In this case, we recover ``distvec``.
- >>> squareform(m)
- array([2.23606798, 6.40312424, 7.34846923, 2.82842712, 4.89897949,
- 6.40312424, 1. , 5.38516481, 4.58257569, 5.47722558])
- """
- X = np.ascontiguousarray(X)
- s = X.shape
- if force.lower() == 'tomatrix':
- if len(s) != 1:
- raise ValueError("Forcing 'tomatrix' but input X is not a "
- "distance vector.")
- elif force.lower() == 'tovector':
- if len(s) != 2:
- raise ValueError("Forcing 'tovector' but input X is not a "
- "distance matrix.")
- # X = squareform(v)
- if len(s) == 1:
- if s[0] == 0:
- return np.zeros((1, 1), dtype=X.dtype)
- # Grab the closest value to the square root of the number
- # of elements times 2 to see if the number of elements
- # is indeed a binomial coefficient.
- d = int(np.ceil(np.sqrt(s[0] * 2)))
- # Check that v is of valid dimensions.
- if d * (d - 1) != s[0] * 2:
- raise ValueError('Incompatible vector size. It must be a binomial '
- 'coefficient n choose 2 for some integer n >= 2.')
- # Allocate memory for the distance matrix.
- M = np.zeros((d, d), dtype=X.dtype)
- # Since the C code does not support striding using strides.
- # The dimensions are used instead.
- X = _copy_array_if_base_present(X)
- # Fill in the values of the distance matrix.
- _distance_wrap.to_squareform_from_vector_wrap(M, X)
- # Return the distance matrix.
- return M
- elif len(s) == 2:
- if s[0] != s[1]:
- raise ValueError('The matrix argument must be square.')
- if checks:
- is_valid_dm(X, throw=True, name='X')
- # One-side of the dimensions is set here.
- d = s[0]
- if d <= 1:
- return np.array([], dtype=X.dtype)
- # Create a vector.
- v = np.zeros((d * (d - 1)) // 2, dtype=X.dtype)
- # Since the C code does not support striding using strides.
- # The dimensions are used instead.
- X = _copy_array_if_base_present(X)
- # Convert the vector to squareform.
- _distance_wrap.to_vector_from_squareform_wrap(X, v)
- return v
- else:
- raise ValueError("The first argument must be one or two dimensional "
- f"array. A {len(s)}-dimensional array is not permitted")
- def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
- """
- Return True if input array satisfies basic distance matrix properties
- (symmetry and zero diagonal).
- This function checks whether the input is a 2-dimensional square NumPy array
- with a zero diagonal and symmetry within a specified tolerance. These are
- necessary properties for a distance matrix but not sufficient -- in particular,
- this function does **not** check the triangle inequality, which is required
- for a true metric distance matrix.
- The triangle inequality states that for any three points ``i``, ``j``, and ``k``:
- ``D[i,k] <= D[i,j] + D[j,k]``
-
- Parameters
- ----------
- D : array_like
- The candidate object to test for basic distance matrix properties.
- tol : float, optional
- The distance matrix is considered symmetric if the absolute difference
- between entries ``ij`` and ``ji`` is less than or equal to `tol`. The same
- tolerance is used to determine whether diagonal entries are effectively zero.
- throw : bool, optional
- If True, raises an exception when the input is invalid.
- name : str, optional
- The name of the variable to check. This is used in exception messages when
- `throw` is True to identify the offending variable.
- warning : bool, optional
- If True, a warning message is raised instead of throwing an exception.
-
- Returns
- -------
- valid : bool
- True if the input satisfies the symmetry and zero-diagonal conditions.
- Raises
- ------
- ValueError
- If `throw` is True and `D` is not a valid distance matrix.
- UserWarning
- If `warning` is True and `D` is not a valid distance matrix.
- Notes
- -----
- This function does not check the triangle inequality, which is required for
- a complete validation of a metric distance matrix. Only structural properties
- (symmetry and zero diagonal) are verified. Small numerical deviations from symmetry
- or exact zero diagonal are tolerated within the `tol` parameter.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.spatial.distance import is_valid_dm
- This matrix is a valid distance matrix.
- >>> d = np.array([[0.0, 1.1, 1.2, 1.3],
- ... [1.1, 0.0, 1.0, 1.4],
- ... [1.2, 1.0, 0.0, 1.5],
- ... [1.3, 1.4, 1.5, 0.0]])
- >>> is_valid_dm(d)
- True
- In the following examples, the input is not a valid distance matrix.
- Not square:
- >>> is_valid_dm([[0, 2, 2], [2, 0, 2]])
- False
- Nonzero diagonal element:
- >>> is_valid_dm([[0, 1, 1], [1, 2, 3], [1, 3, 0]])
- False
- Not symmetric:
- >>> is_valid_dm([[0, 1, 3], [2, 0, 1], [3, 1, 0]])
- False
- """
- D = np.asarray(D, order='c')
- valid = True
- try:
- s = D.shape
- if len(D.shape) != 2:
- if name:
- raise ValueError(f"Distance matrix '{name}' must have shape=2 "
- "(i.e. be two-dimensional).")
- else:
- raise ValueError('Distance matrix must have shape=2 (i.e. '
- 'be two-dimensional).')
- if tol == 0.0:
- if not (D == D.T).all():
- if name:
- raise ValueError(f"Distance matrix '{name}' must be symmetric.")
- else:
- raise ValueError('Distance matrix must be symmetric.')
- if not (D[range(0, s[0]), range(0, s[0])] == 0).all():
- if name:
- raise ValueError(f"Distance matrix '{name}' diagonal must be zero.")
- else:
- raise ValueError('Distance matrix diagonal must be zero.')
- else:
- if not (D - D.T <= tol).all():
- if name:
- raise ValueError(f'Distance matrix \'{name}\' must be '
- f'symmetric within tolerance {tol:5.5f}.')
- else:
- raise ValueError('Distance matrix must be symmetric within '
- f'tolerance {tol:5.5f}.')
- if not (D[range(0, s[0]), range(0, s[0])] <= tol).all():
- if name:
- raise ValueError(f'Distance matrix \'{name}\' diagonal must be '
- f'close to zero within tolerance {tol:5.5f}.')
- else:
- raise ValueError(('Distance matrix \'{}\' diagonal must be close '
- 'to zero within tolerance {:5.5f}.').format(*tol))
- except Exception as e:
- if throw:
- raise
- if warning:
- warnings.warn(str(e), stacklevel=2)
- valid = False
- return valid
- def is_valid_y(y, warning=False, throw=False, name=None):
- """
- Return True if the input array is a valid condensed distance matrix.
- Condensed distance matrices must be 1-dimensional numpy arrays.
- Their length must be a binomial coefficient :math:`{n \\choose 2}`
- for some positive integer n.
- Parameters
- ----------
- y : array_like
- The condensed distance matrix.
- warning : bool, optional
- Invokes a warning if the variable passed is not a valid
- condensed distance matrix. The warning message explains why
- the distance matrix is not valid. `name` is used when
- referencing the offending variable.
- throw : bool, optional
- Throws an exception if the variable passed is not a valid
- condensed distance matrix.
- name : str, optional
- Used when referencing the offending variable in the
- warning or exception message.
- Returns
- -------
- bool
- True if the input array is a valid condensed distance matrix,
- False otherwise.
- Examples
- --------
- >>> from scipy.spatial.distance import is_valid_y
- This vector is a valid condensed distance matrix. The length is 6,
- which corresponds to ``n = 4``, since ``4*(4 - 1)/2`` is 6.
- >>> v = [1.0, 1.2, 1.0, 0.5, 1.3, 0.9]
- >>> is_valid_y(v)
- True
- An input vector with length, say, 7, is not a valid condensed distance
- matrix.
- >>> is_valid_y([1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7])
- False
- """
- y = _asarray(y)
- name_str = f"'{name}' " if name else ""
- try:
- if len(y.shape) != 1:
- raise ValueError(f"Condensed distance matrix {name_str}must "
- "have shape=1 (i.e. be one-dimensional).")
- n = y.shape[0]
- d = int(np.ceil(np.sqrt(n * 2)))
- if (d * (d - 1) / 2) != n:
- raise ValueError(f"Length n of condensed distance matrix {name_str}"
- "must be a binomial coefficient, i.e. "
- "there must be a k such that (k \\choose 2)=n)!")
- except Exception as e:
- if throw:
- raise
- if warning:
- warnings.warn(str(e), stacklevel=2)
- return False
- return True
- def num_obs_dm(d):
- """
- Return the number of original observations that correspond to a
- square, redundant distance matrix.
- Parameters
- ----------
- d : array_like
- The target distance matrix.
- Returns
- -------
- num_obs_dm : int
- The number of observations in the redundant distance matrix.
- Examples
- --------
- Find the number of original observations corresponding
- to a square redundant distance matrix d.
- >>> from scipy.spatial.distance import num_obs_dm
- >>> d = [[0, 100, 200], [100, 0, 150], [200, 150, 0]]
- >>> num_obs_dm(d)
- 3
- """
- d = np.asarray(d, order='c')
- is_valid_dm(d, tol=np.inf, throw=True, name='d')
- return d.shape[0]
- def num_obs_y(Y):
- """
- Return the number of original observations that correspond to a
- condensed distance matrix.
- Parameters
- ----------
- Y : array_like
- Condensed distance matrix.
- Returns
- -------
- n : int
- The number of observations in the condensed distance matrix `Y`.
- Examples
- --------
- Find the number of original observations corresponding to a
- condensed distance matrix Y.
- >>> from scipy.spatial.distance import num_obs_y
- >>> Y = [1, 2, 3.5, 7, 10, 4]
- >>> num_obs_y(Y)
- 4
- """
- Y = _asarray(Y)
- is_valid_y(Y, throw=True, name='Y')
- k = Y.shape[0]
- if k == 0:
- raise ValueError("The number of observations cannot be determined on "
- "an empty distance matrix.")
- d = int(np.ceil(np.sqrt(k * 2)))
- if (d * (d - 1) / 2) != k:
- raise ValueError("Invalid condensed distance matrix passed. Must be "
- "some k where k=(n choose 2) for some n >= 2.")
- return d
- def _prepare_out_argument(out, dtype, expected_shape):
- if out is None:
- return np.empty(expected_shape, dtype=dtype)
- if out.shape != expected_shape:
- raise ValueError("Output array has incorrect shape.")
- if not out.flags.c_contiguous:
- raise ValueError("Output array must be C-contiguous.")
- if out.dtype != np.float64:
- raise ValueError("Output array must be double type.")
- return out
- def _pdist_callable(X, *, out, metric, **kwargs):
- n = X.shape[0]
- out_size = (n * (n - 1)) // 2
- dm = _prepare_out_argument(out, np.float64, (out_size,))
- k = 0
- for i in range(X.shape[0] - 1):
- for j in range(i + 1, X.shape[0]):
- dm[k] = metric(X[i], X[j], **kwargs)
- k += 1
- return dm
- def _cdist_callable(XA, XB, *, out, metric, **kwargs):
- mA = XA.shape[0]
- mB = XB.shape[0]
- dm = _prepare_out_argument(out, np.float64, (mA, mB))
- for i in range(mA):
- for j in range(mB):
- dm[i, j] = metric(XA[i], XB[j], **kwargs)
- return dm
- def cdist(XA, XB, metric='euclidean', *, out=None, **kwargs):
- """
- Compute distance between each pair of the two collections of inputs.
- See Notes for common calling conventions.
- Parameters
- ----------
- XA : array_like
- An :math:`m_A` by :math:`n` array of :math:`m_A`
- original observations in an :math:`n`-dimensional space.
- Inputs are converted to float type.
- XB : array_like
- An :math:`m_B` by :math:`n` array of :math:`m_B`
- original observations in an :math:`n`-dimensional space.
- Inputs are converted to float type.
- metric : str or callable, optional
- The distance metric to use. If a string, the distance function can be
- 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
- 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
- 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
- 'seuclidean', 'sokalsneath', 'sqeuclidean', 'yule'.
- **kwargs : dict, optional
- Extra arguments to `metric`: refer to each metric documentation for a
- list of all possible arguments.
- Some possible arguments:
- p : scalar
- The p-norm to apply for Minkowski, weighted and unweighted.
- Default: 2.
- w : array_like
- The weight vector for metrics that support weights (e.g., Minkowski).
- V : array_like
- The variance vector for standardized Euclidean.
- Default: var(vstack([XA, XB]), axis=0, ddof=1)
- VI : array_like
- The inverse of the covariance matrix for Mahalanobis.
- Default: inv(cov(vstack([XA, XB].T))).T
- out : ndarray
- The output array
- If not None, the distance matrix Y is stored in this array.
- Returns
- -------
- Y : ndarray
- A :math:`m_A` by :math:`m_B` distance matrix is returned.
- For each :math:`i` and :math:`j`, the metric
- ``dist(u=XA[i], v=XB[j])`` is computed and stored in the
- :math:`ij` th entry.
- Raises
- ------
- ValueError
- An exception is thrown if `XA` and `XB` do not have
- the same number of columns.
- Notes
- -----
- The following are common calling conventions:
- 1. ``Y = cdist(XA, XB, 'euclidean')``
- Computes the distance between :math:`m` points using
- Euclidean distance (2-norm) as the distance metric between the
- points. The points are arranged as :math:`m`
- :math:`n`-dimensional row vectors in the matrix X.
- 2. ``Y = cdist(XA, XB, 'minkowski', p=2.)``
- Computes the distances using the Minkowski distance
- :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
- that this is only a quasi-metric if :math:`0 < p < 1`).
- 3. ``Y = cdist(XA, XB, 'cityblock')``
- Computes the city block or Manhattan distance between the
- points.
- 4. ``Y = cdist(XA, XB, 'seuclidean', V=None)``
- Computes the standardized Euclidean distance. The standardized
- Euclidean distance between two n-vectors ``u`` and ``v`` is
- .. math::
- \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}.
- V is the variance vector; V[i] is the variance computed over all
- the i'th components of the points. If not passed, it is
- automatically computed.
- 5. ``Y = cdist(XA, XB, 'sqeuclidean')``
- Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
- the vectors.
- 6. ``Y = cdist(XA, XB, 'cosine')``
- Computes the cosine distance between vectors u and v,
- .. math::
- 1 - \\frac{u \\cdot v}
- {{\\|u\\|}_2 {\\|v\\|}_2}
- where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
- :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`.
- 7. ``Y = cdist(XA, XB, 'correlation')``
- Computes the correlation distance between vectors u and v. This is
- .. math::
- 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
- {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}
- where :math:`\\bar{v}` is the mean of the elements of vector v,
- and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.
- 8. ``Y = cdist(XA, XB, 'hamming')``
- Computes the normalized Hamming distance, or the proportion of
- those vector elements between two n-vectors ``u`` and ``v``
- which disagree. To save memory, the matrix ``X`` can be of type
- boolean.
- 9. ``Y = cdist(XA, XB, 'jaccard')``
- Computes the Jaccard distance between the points. Given two
- vectors, ``u`` and ``v``, the Jaccard distance is the
- proportion of those elements ``u[i]`` and ``v[i]`` that
- disagree where at least one of them is non-zero.
- 10. ``Y = cdist(XA, XB, 'jensenshannon')``
- Computes the Jensen-Shannon distance between two probability arrays.
- Given two probability vectors, :math:`p` and :math:`q`, the
- Jensen-Shannon distance is
- .. math::
- \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}
- where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
- and :math:`D` is the Kullback-Leibler divergence.
- 11. ``Y = cdist(XA, XB, 'chebyshev')``
- Computes the Chebyshev distance between the points. The
- Chebyshev distance between two n-vectors ``u`` and ``v`` is the
- maximum norm-1 distance between their respective elements. More
- precisely, the distance is given by
- .. math::
- d(u,v) = \\max_i {|u_i-v_i|}.
- 12. ``Y = cdist(XA, XB, 'canberra')``
- Computes the Canberra distance between the points. The
- Canberra distance between two points ``u`` and ``v`` is
- .. math::
- d(u,v) = \\sum_i \\frac{|u_i-v_i|}
- {|u_i|+|v_i|}.
- 13. ``Y = cdist(XA, XB, 'braycurtis')``
- Computes the Bray-Curtis distance between the points. The
- Bray-Curtis distance between two points ``u`` and ``v`` is
- .. math::
- d(u,v) = \\frac{\\sum_i (|u_i-v_i|)}
- {\\sum_i (|u_i+v_i|)}
- 14. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)``
- Computes the Mahalanobis distance between the points. The
- Mahalanobis distance between two points ``u`` and ``v`` is
- :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
- variable) is the inverse covariance. If ``VI`` is not None,
- ``VI`` will be used as the inverse covariance matrix.
- 15. ``Y = cdist(XA, XB, 'yule')``
- Computes the Yule distance between the boolean
- vectors. (see `yule` function documentation)
- 16. ``Y = cdist(XA, XB, 'matching')``
- Synonym for 'hamming'.
- 17. ``Y = cdist(XA, XB, 'dice')``
- Computes the Dice distance between the boolean vectors. (see
- `dice` function documentation).
- 18. ``Y = cdist(XA, XB, 'rogerstanimoto')``
- Computes the Rogers-Tanimoto distance between the boolean
- vectors. (see `rogerstanimoto` function documentation)
- 19. ``Y = cdist(XA, XB, 'russellrao')``
- Computes the Russell-Rao distance between the boolean
- vectors. (see `russellrao` function documentation)
- 20. ``Y = cdist(XA, XB, 'sokalsneath')``
- Computes the Sokal-Sneath distance between the vectors. (see
- `sokalsneath` function documentation)
- 21. ``Y = cdist(XA, XB, f)``
- Computes the distance between all pairs of vectors in X
- using the user supplied 2-arity function f. For example,
- Euclidean distance between the vectors could be computed
- as follows::
- dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum()))
- Note that you should avoid passing a reference to one of
- the distance functions defined in this library. For example,::
- dm = cdist(XA, XB, sokalsneath)
- would calculate the pair-wise distances between the vectors in
- X using the Python function `sokalsneath`. This would result in
- sokalsneath being called :math:`{n \\choose 2}` times, which
- is inefficient. Instead, the optimized C version is more
- efficient, and we call it using the following syntax::
- dm = cdist(XA, XB, 'sokalsneath')
- Examples
- --------
- Find the Euclidean distances between four 2-D coordinates:
- >>> from scipy.spatial import distance
- >>> import numpy as np
- >>> coords = [(35.0456, -85.2672),
- ... (35.1174, -89.9711),
- ... (35.9728, -83.9422),
- ... (36.1667, -86.7833)]
- >>> distance.cdist(coords, coords, 'euclidean')
- array([[ 0. , 4.7044, 1.6172, 1.8856],
- [ 4.7044, 0. , 6.0893, 3.3561],
- [ 1.6172, 6.0893, 0. , 2.8477],
- [ 1.8856, 3.3561, 2.8477, 0. ]])
- Find the Manhattan distance from a 3-D point to the corners of the unit
- cube:
- >>> a = np.array([[0, 0, 0],
- ... [0, 0, 1],
- ... [0, 1, 0],
- ... [0, 1, 1],
- ... [1, 0, 0],
- ... [1, 0, 1],
- ... [1, 1, 0],
- ... [1, 1, 1]])
- >>> b = np.array([[ 0.1, 0.2, 0.4]])
- >>> distance.cdist(a, b, 'cityblock')
- array([[ 0.7],
- [ 0.9],
- [ 1.3],
- [ 1.5],
- [ 1.5],
- [ 1.7],
- [ 2.1],
- [ 2.3]])
- """
- # You can also call this as:
- # Y = cdist(XA, XB, 'test_abc')
- # where 'abc' is the metric being tested. This computes the distance
- # between all pairs of vectors in XA and XB using the distance metric 'abc'
- # but with a more succinct, verifiable, but less efficient implementation.
- XA = np.asarray(XA)
- XB = np.asarray(XB)
- s = XA.shape
- sB = XB.shape
- if len(s) != 2:
- raise ValueError('XA must be a 2-dimensional array.')
- if len(sB) != 2:
- raise ValueError('XB must be a 2-dimensional array.')
- if s[1] != sB[1]:
- raise ValueError('XA and XB must have the same number of columns '
- '(i.e. feature dimension.)')
- mA = s[0]
- mB = sB[0]
- n = s[1]
- if callable(metric):
- mstr = getattr(metric, '__name__', 'Unknown')
- metric_info = _METRIC_ALIAS.get(mstr, None)
- if metric_info is not None:
- XA, XB, typ, kwargs = _validate_cdist_input(
- XA, XB, mA, mB, n, metric_info, **kwargs)
- return _cdist_callable(XA, XB, metric=metric, out=out, **kwargs)
- elif isinstance(metric, str):
- mstr = metric.lower()
- metric_info = _METRIC_ALIAS.get(mstr, None)
- if metric_info is not None:
- cdist_fn = metric_info.cdist_func
- return cdist_fn(XA, XB, out=out, **kwargs)
- elif mstr.startswith("test_"):
- metric_info = _TEST_METRICS.get(mstr, None)
- if metric_info is None:
- raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}')
- XA, XB, typ, kwargs = _validate_cdist_input(
- XA, XB, mA, mB, n, metric_info, **kwargs)
- return _cdist_callable(
- XA, XB, metric=metric_info.dist_func, out=out, **kwargs)
- else:
- raise ValueError(f'Unknown Distance Metric: {mstr}')
- else:
- raise TypeError('2nd argument metric must be a string identifier '
- 'or a function.')
|