distance.py 90 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939
  1. """
  2. Distance computations (:mod:`scipy.spatial.distance`)
  3. =====================================================
  4. .. sectionauthor:: Damian Eads
  5. Function reference
  6. ------------------
  7. Distance matrix computation from a collection of raw observation vectors
  8. stored in a rectangular array.
  9. .. autosummary::
  10. :toctree: generated/
  11. pdist -- pairwise distances between observation vectors.
  12. cdist -- distances between two collections of observation vectors
  13. squareform -- convert distance matrix to a condensed one and vice versa
  14. directed_hausdorff -- directed Hausdorff distance between arrays
  15. Predicates for checking the validity of distance matrices, both
  16. condensed and redundant. Also contained in this module are functions
  17. for computing the number of observations in a distance matrix.
  18. .. autosummary::
  19. :toctree: generated/
  20. is_valid_dm -- checks for a valid distance matrix
  21. is_valid_y -- checks for a valid condensed distance matrix
  22. num_obs_dm -- # of observations in a distance matrix
  23. num_obs_y -- # of observations in a condensed distance matrix
  24. Distance functions between two numeric vectors ``u`` and ``v``. Computing
  25. distances over a large collection of vectors is inefficient for these
  26. functions. Use ``pdist`` for this purpose.
  27. .. autosummary::
  28. :toctree: generated/
  29. braycurtis -- the Bray-Curtis distance.
  30. canberra -- the Canberra distance.
  31. chebyshev -- the Chebyshev distance.
  32. cityblock -- the Manhattan distance.
  33. correlation -- the Correlation distance.
  34. cosine -- the Cosine distance.
  35. euclidean -- the Euclidean distance.
  36. jensenshannon -- the Jensen-Shannon distance.
  37. mahalanobis -- the Mahalanobis distance.
  38. minkowski -- the Minkowski distance.
  39. seuclidean -- the normalized Euclidean distance.
  40. sqeuclidean -- the squared Euclidean distance.
  41. Distance functions between two boolean vectors (representing sets) ``u`` and
  42. ``v``. As in the case of numerical vectors, ``pdist`` is more efficient for
  43. computing the distances between all pairs.
  44. .. autosummary::
  45. :toctree: generated/
  46. dice -- the Dice dissimilarity.
  47. hamming -- the Hamming distance.
  48. jaccard -- the Jaccard distance.
  49. rogerstanimoto -- the Rogers-Tanimoto dissimilarity.
  50. russellrao -- the Russell-Rao dissimilarity.
  51. sokalsneath -- the Sokal-Sneath dissimilarity.
  52. yule -- the Yule dissimilarity.
  53. :func:`hamming` also operates over discrete numerical vectors.
  54. """
  55. # Copyright (C) Damian Eads, 2007-2008. New BSD License.
  56. __all__ = [
  57. 'braycurtis',
  58. 'canberra',
  59. 'cdist',
  60. 'chebyshev',
  61. 'cityblock',
  62. 'correlation',
  63. 'cosine',
  64. 'dice',
  65. 'directed_hausdorff',
  66. 'euclidean',
  67. 'hamming',
  68. 'is_valid_dm',
  69. 'is_valid_y',
  70. 'jaccard',
  71. 'jensenshannon',
  72. 'mahalanobis',
  73. 'minkowski',
  74. 'num_obs_dm',
  75. 'num_obs_y',
  76. 'pdist',
  77. 'rogerstanimoto',
  78. 'russellrao',
  79. 'seuclidean',
  80. 'sokalsneath',
  81. 'sqeuclidean',
  82. 'squareform',
  83. 'yule'
  84. ]
  85. import math
  86. import warnings
  87. import dataclasses
  88. from collections.abc import Callable
  89. from functools import partial
  90. import numpy as np
  91. from scipy._lib._array_api import _asarray
  92. from scipy._lib._util import _asarray_validated, _transition_to_rng
  93. from scipy._lib import array_api_extra as xpx
  94. from scipy.linalg import norm
  95. from scipy.special import rel_entr
  96. from . import _hausdorff, _distance_pybind, _distance_wrap
  97. def _copy_array_if_base_present(a):
  98. """Copy the array if its base points to a parent array."""
  99. if a.base is not None:
  100. return a.copy()
  101. return a
  102. def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
  103. XA = XA - XA.mean(axis=1, keepdims=True)
  104. XB = XB - XB.mean(axis=1, keepdims=True)
  105. _distance_wrap.cdist_cosine_double_wrap(XA, XB, dm, **kwargs)
  106. def _correlation_pdist_wrap(X, dm, **kwargs):
  107. X2 = X - X.mean(axis=1, keepdims=True)
  108. _distance_wrap.pdist_cosine_double_wrap(X2, dm, **kwargs)
  109. def _convert_to_type(X, out_type):
  110. return np.ascontiguousarray(X, dtype=out_type)
  111. def _nbool_correspond_all(u, v, w=None):
  112. if u.dtype == v.dtype == bool and w is None:
  113. not_u = ~u
  114. not_v = ~v
  115. nff = (not_u & not_v).sum()
  116. nft = (not_u & v).sum()
  117. ntf = (u & not_v).sum()
  118. ntt = (u & v).sum()
  119. else:
  120. dtype = np.result_type(int, u.dtype, v.dtype)
  121. u = u.astype(dtype)
  122. v = v.astype(dtype)
  123. not_u = 1.0 - u
  124. not_v = 1.0 - v
  125. if w is not None:
  126. not_u = w * not_u
  127. u = w * u
  128. nff = (not_u * not_v).sum()
  129. nft = (not_u * v).sum()
  130. ntf = (u * not_v).sum()
  131. ntt = (u * v).sum()
  132. return (nff, nft, ntf, ntt)
  133. def _nbool_correspond_ft_tf(u, v, w=None):
  134. if u.dtype == v.dtype == bool and w is None:
  135. not_u = ~u
  136. not_v = ~v
  137. nft = (not_u & v).sum()
  138. ntf = (u & not_v).sum()
  139. else:
  140. dtype = np.result_type(int, u.dtype, v.dtype)
  141. u = u.astype(dtype)
  142. v = v.astype(dtype)
  143. not_u = 1.0 - u
  144. not_v = 1.0 - v
  145. if w is not None:
  146. not_u = w * not_u
  147. u = w * u
  148. nft = (not_u * v).sum()
  149. ntf = (u * not_v).sum()
  150. return (nft, ntf)
  151. def _validate_cdist_input(XA, XB, mA, mB, n, metric_info, **kwargs):
  152. # get supported types
  153. types = metric_info.types
  154. # choose best type
  155. typ = types[types.index(XA.dtype)] if XA.dtype in types else types[0]
  156. # validate data
  157. XA = _convert_to_type(XA, out_type=typ)
  158. XB = _convert_to_type(XB, out_type=typ)
  159. # validate kwargs
  160. _validate_kwargs = metric_info.validator
  161. if _validate_kwargs:
  162. kwargs = _validate_kwargs((XA, XB), mA + mB, n, **kwargs)
  163. return XA, XB, typ, kwargs
  164. def _validate_weight_with_size(X, m, n, **kwargs):
  165. w = kwargs.pop('w', None)
  166. if w is None:
  167. return kwargs
  168. if w.ndim != 1 or w.shape[0] != n:
  169. raise ValueError("Weights must have same size as input vector. "
  170. f"{w.shape[0]} vs. {n}")
  171. kwargs['w'] = _validate_weights(w)
  172. return kwargs
  173. def _validate_hamming_kwargs(X, m, n, **kwargs):
  174. w = kwargs.get('w', np.ones((n,), dtype='double'))
  175. if w.ndim != 1 or w.shape[0] != n:
  176. raise ValueError(f"Weights must have same size as input vector. "
  177. f"{w.shape[0]} vs. {n}")
  178. kwargs['w'] = _validate_weights(w)
  179. return kwargs
  180. def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
  181. VI = kwargs.pop('VI', None)
  182. if VI is None:
  183. if m <= n:
  184. # There are fewer observations than the dimension of
  185. # the observations.
  186. raise ValueError(
  187. f"The number of observations ({m}) is too small; "
  188. f"the covariance matrix is singular. For observations "
  189. f"with {n} dimensions, at least {n + 1} observations are required.")
  190. if isinstance(X, tuple):
  191. X = np.vstack(X)
  192. CV = np.atleast_2d(np.cov(X.astype(np.float64, copy=False).T))
  193. VI = np.linalg.inv(CV).T.copy()
  194. kwargs["VI"] = _convert_to_double(VI)
  195. return kwargs
  196. def _validate_minkowski_kwargs(X, m, n, **kwargs):
  197. kwargs = _validate_weight_with_size(X, m, n, **kwargs)
  198. if 'p' not in kwargs:
  199. kwargs['p'] = 2.
  200. else:
  201. if kwargs['p'] <= 0:
  202. raise ValueError("p must be greater than 0")
  203. return kwargs
  204. def _validate_pdist_input(X, m, n, metric_info, **kwargs):
  205. # get supported types
  206. types = metric_info.types
  207. # choose best type
  208. typ = types[types.index(X.dtype)] if X.dtype in types else types[0]
  209. # validate data
  210. X = _convert_to_type(X, out_type=typ)
  211. # validate kwargs
  212. _validate_kwargs = metric_info.validator
  213. if _validate_kwargs:
  214. kwargs = _validate_kwargs(X, m, n, **kwargs)
  215. return X, typ, kwargs
  216. def _validate_seuclidean_kwargs(X, m, n, **kwargs):
  217. V = kwargs.pop('V', None)
  218. if V is None:
  219. if isinstance(X, tuple):
  220. X = np.vstack(X)
  221. V = np.var(X.astype(np.float64, copy=False), axis=0, ddof=1)
  222. else:
  223. V = np.asarray(V, order='c')
  224. if len(V.shape) != 1:
  225. raise ValueError('Variance vector V must '
  226. 'be one-dimensional.')
  227. if V.shape[0] != n:
  228. raise ValueError('Variance vector V must be of the same '
  229. 'dimension as the vectors on which the distances '
  230. 'are computed.')
  231. kwargs['V'] = _convert_to_double(V)
  232. return kwargs
  233. def _validate_vector(u, dtype=None):
  234. # XXX Is order='c' really necessary?
  235. u = np.asarray(u, dtype=dtype, order='c')
  236. if u.ndim == 1:
  237. return u
  238. raise ValueError("Input vector should be 1-D.")
  239. def _validate_weights(w, dtype=np.float64):
  240. w = _validate_vector(w, dtype=dtype)
  241. if np.any(w < 0):
  242. raise ValueError("Input weights should be all non-negative")
  243. return w
  244. @_transition_to_rng('seed', position_num=2, replace_doc=False)
  245. def directed_hausdorff(u, v, rng=0):
  246. """
  247. Compute the directed Hausdorff distance between two 2-D arrays.
  248. Distances between pairs are calculated using a Euclidean metric.
  249. Parameters
  250. ----------
  251. u : (M,N) array_like
  252. Input array with M points in N dimensions.
  253. v : (O,N) array_like
  254. Input array with O points in N dimensions.
  255. rng : int or `numpy.random.Generator` or None, optional
  256. Pseudorandom number generator state. Default is 0 so the
  257. shuffling of `u` and `v` is reproducible.
  258. If `rng` is passed by keyword, types other than `numpy.random.Generator` are
  259. passed to `numpy.random.default_rng` to instantiate a ``Generator``.
  260. If `rng` is already a ``Generator`` instance, then the provided instance is
  261. used.
  262. If this argument is passed by position or `seed` is passed by keyword,
  263. legacy behavior for the argument `seed` applies:
  264. - If `seed` is None, a new ``RandomState`` instance is used. The state is
  265. initialized using data from ``/dev/urandom`` (or the Windows analogue)
  266. if available or from the system clock otherwise.
  267. - If `seed` is an int, a new ``RandomState`` instance is used,
  268. seeded with `seed`.
  269. - If `seed` is already a ``Generator`` or ``RandomState`` instance, then
  270. that instance is used.
  271. .. versionchanged:: 1.15.0
  272. As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
  273. transition from use of `numpy.random.RandomState` to
  274. `numpy.random.Generator`, this keyword was changed from `seed` to `rng`.
  275. For an interim period, both keywords will continue to work, although only
  276. one may be specified at a time. After the interim period, function calls
  277. using the `seed` keyword will emit warnings. The behavior of both `seed`
  278. and `rng` are outlined above, but only the `rng` keyword should be used in
  279. new code.
  280. Returns
  281. -------
  282. d : double
  283. The directed Hausdorff distance between arrays `u` and `v`,
  284. index_1 : int
  285. index of point contributing to Hausdorff pair in `u`
  286. index_2 : int
  287. index of point contributing to Hausdorff pair in `v`
  288. Raises
  289. ------
  290. ValueError
  291. An exception is thrown if `u` and `v` do not have
  292. the same number of columns.
  293. See Also
  294. --------
  295. scipy.spatial.procrustes : Another similarity test for two data sets
  296. Notes
  297. -----
  298. Uses the early break technique and the random sampling approach
  299. described by [1]_. Although worst-case performance is ``O(m * o)``
  300. (as with the brute force algorithm), this is unlikely in practice
  301. as the input data would have to require the algorithm to explore
  302. every single point interaction, and after the algorithm shuffles
  303. the input points at that. The best case performance is ``O(m)``,
  304. which is satisfied by selecting an inner loop distance that is
  305. less than cmax and leads to an early break as often as possible.
  306. The authors have formally shown that the average runtime is
  307. closer to ``O(m)``.
  308. .. versionadded:: 0.19.0
  309. References
  310. ----------
  311. .. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for
  312. calculating the exact Hausdorff distance." IEEE Transactions On
  313. Pattern Analysis And Machine Intelligence, vol. 37, no. 11,
  314. pp. 2153-63, 2015. :doi:`10.1109/TPAMI.2015.2408351`.
  315. Examples
  316. --------
  317. Find the directed Hausdorff distance between two 2-D arrays of
  318. coordinates:
  319. >>> from scipy.spatial.distance import directed_hausdorff
  320. >>> import numpy as np
  321. >>> u = np.array([(1.0, 0.0),
  322. ... (0.0, 1.0),
  323. ... (-1.0, 0.0),
  324. ... (0.0, -1.0)])
  325. >>> v = np.array([(2.0, 0.0),
  326. ... (0.0, 2.0),
  327. ... (-2.0, 0.0),
  328. ... (0.0, -4.0)])
  329. >>> directed_hausdorff(u, v)[0]
  330. 2.23606797749979
  331. >>> directed_hausdorff(v, u)[0]
  332. 3.0
  333. Find the general (symmetric) Hausdorff distance between two 2-D
  334. arrays of coordinates:
  335. >>> max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
  336. 3.0
  337. Find the indices of the points that generate the Hausdorff distance
  338. (the Hausdorff pair):
  339. >>> directed_hausdorff(v, u)[1:]
  340. (3, 3)
  341. """
  342. u = np.asarray(u, dtype=np.float64, order='c')
  343. v = np.asarray(v, dtype=np.float64, order='c')
  344. if u.shape[1] != v.shape[1]:
  345. raise ValueError('u and v need to have the same '
  346. 'number of columns')
  347. result = _hausdorff.directed_hausdorff(u, v, rng)
  348. return result
  349. def minkowski(u, v, p=2, w=None):
  350. """
  351. Compute the Minkowski distance between two 1-D arrays.
  352. The Minkowski distance between 1-D arrays `u` and `v`,
  353. is defined as
  354. .. math::
  355. {\\|u-v\\|}_p = (\\sum{|u_i - v_i|^p})^{1/p}.
  356. \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.
  357. Parameters
  358. ----------
  359. u : (N,) array_like
  360. Input array.
  361. v : (N,) array_like
  362. Input array.
  363. p : scalar
  364. The order of the norm of the difference :math:`{\\|u-v\\|}_p`. Note
  365. that for :math:`0 < p < 1`, the triangle inequality only holds with
  366. an additional multiplicative factor, i.e. it is only a quasi-metric.
  367. w : (N,) array_like, optional
  368. The weights for each value in `u` and `v`. Default is None,
  369. which gives each value a weight of 1.0
  370. Returns
  371. -------
  372. minkowski : double
  373. The Minkowski distance between vectors `u` and `v`.
  374. Examples
  375. --------
  376. >>> from scipy.spatial import distance
  377. >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
  378. 2.0
  379. >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
  380. 1.4142135623730951
  381. >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
  382. 1.2599210498948732
  383. >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
  384. 1.0
  385. >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
  386. 1.0
  387. >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
  388. 1.0
  389. """
  390. u = _validate_vector(u)
  391. v = _validate_vector(v)
  392. if p <= 0:
  393. raise ValueError("p must be greater than 0")
  394. u_v = u - v
  395. if w is not None:
  396. w = _validate_weights(w)
  397. if p == 1:
  398. root_w = w
  399. elif p == 2:
  400. # better precision and speed
  401. root_w = np.sqrt(w)
  402. elif p == np.inf:
  403. root_w = (w != 0)
  404. else:
  405. root_w = np.power(w, 1/p)
  406. u_v = root_w * u_v
  407. dist = norm(u_v, ord=p)
  408. return dist
  409. def euclidean(u, v, w=None):
  410. """
  411. Computes the Euclidean distance between two 1-D arrays.
  412. The Euclidean distance between 1-D arrays `u` and `v`, is defined as
  413. .. math::
  414. {\\|u-v\\|}_2
  415. \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)^{1/2}
  416. Parameters
  417. ----------
  418. u : (N,) array_like
  419. Input array.
  420. v : (N,) array_like
  421. Input array.
  422. w : (N,) array_like, optional
  423. The weights for each value in `u` and `v`. Default is None,
  424. which gives each value a weight of 1.0
  425. Returns
  426. -------
  427. euclidean : double
  428. The Euclidean distance between vectors `u` and `v`.
  429. Examples
  430. --------
  431. >>> from scipy.spatial import distance
  432. >>> distance.euclidean([1, 0, 0], [0, 1, 0])
  433. 1.4142135623730951
  434. >>> distance.euclidean([1, 1, 0], [0, 1, 0])
  435. 1.0
  436. """
  437. return minkowski(u, v, p=2, w=w)
  438. def sqeuclidean(u, v, w=None):
  439. """
  440. Compute the squared Euclidean distance between two 1-D arrays.
  441. The squared Euclidean distance between `u` and `v` is defined as
  442. .. math::
  443. \\sum_i{w_i |u_i - v_i|^2}
  444. Parameters
  445. ----------
  446. u : (N,) array_like
  447. Input array.
  448. v : (N,) array_like
  449. Input array.
  450. w : (N,) array_like, optional
  451. The weights for each value in `u` and `v`. Default is None,
  452. which gives each value a weight of 1.0
  453. Returns
  454. -------
  455. sqeuclidean : double
  456. The squared Euclidean distance between vectors `u` and `v`.
  457. Examples
  458. --------
  459. >>> from scipy.spatial import distance
  460. >>> distance.sqeuclidean([1, 0, 0], [0, 1, 0])
  461. 2.0
  462. >>> distance.sqeuclidean([1, 1, 0], [0, 1, 0])
  463. 1.0
  464. """
  465. # Preserve float dtypes, but convert everything else to np.float64
  466. # for stability.
  467. utype, vtype = None, None
  468. if not (hasattr(u, "dtype") and np.issubdtype(u.dtype, np.inexact)):
  469. utype = np.float64
  470. if not (hasattr(v, "dtype") and np.issubdtype(v.dtype, np.inexact)):
  471. vtype = np.float64
  472. u = _validate_vector(u, dtype=utype)
  473. v = _validate_vector(v, dtype=vtype)
  474. u_v = u - v
  475. u_v_w = u_v # only want weights applied once
  476. if w is not None:
  477. w = _validate_weights(w)
  478. u_v_w = w * u_v
  479. return np.dot(u_v, u_v_w)
  480. def correlation(u, v, w=None, centered=True):
  481. """
  482. Compute the correlation distance between two 1-D arrays.
  483. The correlation distance between `u` and `v`, is
  484. defined as
  485. .. math::
  486. 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
  487. {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}
  488. where :math:`\\bar{u}` is the mean of the elements of `u`
  489. and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.
  490. Parameters
  491. ----------
  492. u : (N,) array_like of floats
  493. Input array.
  494. v : (N,) array_like of floats
  495. Input array.
  496. w : (N,) array_like of floats, optional
  497. The weights for each value in `u` and `v`. Default is None,
  498. which gives each value a weight of 1.0
  499. centered : bool, optional
  500. If True, `u` and `v` will be centered. Default is True.
  501. Returns
  502. -------
  503. correlation : double
  504. The correlation distance between 1-D array `u` and `v`.
  505. Examples
  506. --------
  507. Find the correlation between two arrays.
  508. >>> from scipy.spatial.distance import correlation
  509. >>> correlation([1, 0, 1], [1, 1, 0])
  510. 1.5
  511. Using a weighting array, the correlation can be calculated as:
  512. >>> correlation([1, 0, 1], [1, 1, 0], w=[0.9, 0.1, 0.1])
  513. 1.1
  514. If centering is not needed, the correlation can be calculated as:
  515. >>> correlation([1, 0, 1], [1, 1, 0], centered=False)
  516. 0.5
  517. """
  518. u = _validate_vector(u)
  519. v = _validate_vector(v)
  520. if np.iscomplexobj(u) or np.iscomplexobj(v):
  521. msg = "`u` and `v` must be real."
  522. raise TypeError(msg)
  523. if w is not None:
  524. w = _validate_weights(w)
  525. w = w / w.sum()
  526. if centered:
  527. if w is not None:
  528. umu = np.dot(u, w)
  529. vmu = np.dot(v, w)
  530. else:
  531. umu = np.mean(u)
  532. vmu = np.mean(v)
  533. u = u - umu
  534. v = v - vmu
  535. if w is not None:
  536. vw = v * w
  537. uw = u * w
  538. else:
  539. vw, uw = v, u
  540. uv = np.dot(u, vw)
  541. uu = np.dot(u, uw)
  542. vv = np.dot(v, vw)
  543. dist = 1.0 - uv / math.sqrt(uu * vv)
  544. # Clip the result to avoid rounding error
  545. return np.clip(dist, 0.0, 2.0)
  546. def cosine(u, v, w=None):
  547. """
  548. Compute the Cosine distance between 1-D arrays.
  549. The Cosine distance between `u` and `v`, is defined as
  550. .. math::
  551. 1 - \\frac{u \\cdot v}
  552. {\\|u\\|_2 \\|v\\|_2}.
  553. where :math:`u \\cdot v` is the dot product of :math:`u` and
  554. :math:`v`.
  555. Parameters
  556. ----------
  557. u : (N,) array_like of floats
  558. Input array.
  559. v : (N,) array_like of floats
  560. Input array.
  561. w : (N,) array_like of floats, optional
  562. The weights for each value in `u` and `v`. Default is None,
  563. which gives each value a weight of 1.0
  564. Returns
  565. -------
  566. cosine : double
  567. The Cosine distance between vectors `u` and `v`.
  568. Examples
  569. --------
  570. >>> from scipy.spatial import distance
  571. >>> distance.cosine([1, 0, 0], [0, 1, 0])
  572. 1.0
  573. >>> distance.cosine([100, 0, 0], [0, 1, 0])
  574. 1.0
  575. >>> distance.cosine([1, 1, 0], [0, 1, 0])
  576. 0.29289321881345254
  577. """
  578. # cosine distance is also referred to as 'uncentered correlation',
  579. # or 'reflective correlation'
  580. return correlation(u, v, w=w, centered=False)
  581. def hamming(u, v, w=None):
  582. """
  583. Compute the Hamming distance between two 1-D arrays.
  584. The Hamming distance between 1-D arrays `u` and `v`, is simply the
  585. proportion of disagreeing components in `u` and `v`. If `u` and `v` are
  586. boolean vectors, the Hamming distance is
  587. .. math::
  588. \\frac{c_{01} + c_{10}}{n}
  589. where :math:`c_{ij}` is the number of occurrences of
  590. :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
  591. :math:`k < n`.
  592. Parameters
  593. ----------
  594. u : (N,) array_like
  595. Input array.
  596. v : (N,) array_like
  597. Input array.
  598. w : (N,) array_like, optional
  599. The weights for each value in `u` and `v`. Default is None,
  600. which gives each value a weight of 1.0
  601. Returns
  602. -------
  603. hamming : double
  604. The Hamming distance between vectors `u` and `v`.
  605. Examples
  606. --------
  607. >>> from scipy.spatial import distance
  608. >>> distance.hamming([1, 0, 0], [0, 1, 0])
  609. 0.66666666666666663
  610. >>> distance.hamming([1, 0, 0], [1, 1, 0])
  611. 0.33333333333333331
  612. >>> distance.hamming([1, 0, 0], [2, 0, 0])
  613. 0.33333333333333331
  614. >>> distance.hamming([1, 0, 0], [3, 0, 0])
  615. 0.33333333333333331
  616. """
  617. u = _validate_vector(u)
  618. v = _validate_vector(v)
  619. if u.shape != v.shape:
  620. raise ValueError('The 1d arrays must have equal lengths.')
  621. u_ne_v = u != v
  622. if w is not None:
  623. w = _validate_weights(w)
  624. if w.shape != u.shape:
  625. raise ValueError("'w' should have the same length as 'u' and 'v'.")
  626. w = w / w.sum()
  627. return np.dot(u_ne_v, w)
  628. return np.mean(u_ne_v)
  629. def jaccard(u, v, w=None):
  630. r"""
  631. Compute the Jaccard dissimilarity between two boolean vectors.
  632. Given boolean vectors :math:`u \equiv (u_1, \cdots, u_n)`
  633. and :math:`v \equiv (v_1, \cdots, v_n)` that are not both zero,
  634. their *Jaccard dissimilarity* is defined as ([1]_, p. 26)
  635. .. math::
  636. d_\textrm{jaccard}(u, v) := \frac{c_{10} + c_{01}}
  637. {c_{11} + c_{10} + c_{01}}
  638. where
  639. .. math::
  640. c_{ij} := \sum_{1 \le k \le n, u_k=i, v_k=j} 1
  641. for :math:`i, j \in \{ 0, 1\}`. If :math:`u` and :math:`v` are both zero,
  642. their Jaccard dissimilarity is defined to be zero. [2]_
  643. If a (non-negative) weight vector :math:`w \equiv (w_1, \cdots, w_n)`
  644. is supplied, the *weighted Jaccard dissimilarity* is defined similarly
  645. but with :math:`c_{ij}` replaced by
  646. .. math::
  647. \tilde{c}_{ij} := \sum_{1 \le k \le n, u_k=i, v_k=j} w_k
  648. Parameters
  649. ----------
  650. u : (N,) array_like of bools
  651. Input vector.
  652. v : (N,) array_like of bools
  653. Input vector.
  654. w : (N,) array_like of floats, optional
  655. Weights for each pair of :math:`(u_k, v_k)`. Default is ``None``,
  656. which gives each pair a weight of ``1.0``.
  657. Returns
  658. -------
  659. jaccard : float
  660. The Jaccard dissimilarity between vectors `u` and `v`, optionally
  661. weighted by `w` if supplied.
  662. Notes
  663. -----
  664. The Jaccard dissimilarity satisfies the triangle inequality and is
  665. qualified as a metric. [2]_
  666. The *Jaccard index*, or *Jaccard similarity coefficient*, is equal to
  667. one minus the Jaccard dissimilarity. [3]_
  668. The dissimilarity between general (finite) sets may be computed by
  669. encoding them as boolean vectors and computing the dissimilarity
  670. between the encoded vectors.
  671. For example, subsets :math:`A,B` of :math:`\{ 1, 2, ..., n \}` may be
  672. encoded into boolean vectors :math:`u, v` by setting
  673. :math:`u_k := 1_{k \in A}`, :math:`v_k := 1_{k \in B}`
  674. for :math:`k = 1,2,\cdots,n`.
  675. .. versionchanged:: 1.2.0
  676. Previously, if all (positively weighted) elements in `u` and `v` are
  677. zero, the function would return ``nan``. This was changed to return
  678. ``0`` instead.
  679. .. versionchanged:: 1.15.0
  680. Non-0/1 numeric input used to produce an ad hoc result. Since 1.15.0,
  681. numeric input is converted to Boolean before computation.
  682. References
  683. ----------
  684. .. [1] Kaufman, L. and Rousseeuw, P. J. (1990). "Finding Groups in Data:
  685. An Introduction to Cluster Analysis." John Wiley & Sons, Inc.
  686. .. [2] Kosub, S. (2019). "A note on the triangle inequality for the
  687. Jaccard distance." *Pattern Recognition Letters*, 120:36-38.
  688. .. [3] https://en.wikipedia.org/wiki/Jaccard_index
  689. Examples
  690. --------
  691. >>> from scipy.spatial import distance
  692. Non-zero vectors with no matching 1s have dissimilarity of 1.0:
  693. >>> distance.jaccard([1, 0, 0], [0, 1, 0])
  694. 1.0
  695. Vectors with some matching 1s have dissimilarity less than 1.0:
  696. >>> distance.jaccard([1, 0, 0, 0], [1, 1, 1, 0])
  697. 0.6666666666666666
  698. Identical vectors, including zero vectors, have dissimilarity of 0.0:
  699. >>> distance.jaccard([1, 0, 0], [1, 0, 0])
  700. 0.0
  701. >>> distance.jaccard([0, 0, 0], [0, 0, 0])
  702. 0.0
  703. The following example computes the dissimilarity from a confusion matrix
  704. directly by setting the weight vector to the frequency of True Positive,
  705. False Negative, False Positive, and True Negative:
  706. >>> distance.jaccard([1, 1, 0, 0], [1, 0, 1, 0], [31, 41, 59, 26])
  707. 0.7633587786259542 # (41+59)/(31+41+59)
  708. """
  709. u = _validate_vector(u)
  710. v = _validate_vector(v)
  711. unequal = np.bitwise_xor(u != 0, v != 0)
  712. nonzero = np.bitwise_or(u != 0, v != 0)
  713. if w is not None:
  714. w = _validate_weights(w)
  715. unequal = w * unequal
  716. nonzero = w * nonzero
  717. a = np.float64(unequal.sum())
  718. b = np.float64(nonzero.sum())
  719. return (a / b) if b != 0 else np.float64(0)
  720. def seuclidean(u, v, V):
  721. """
  722. Return the standardized Euclidean distance between two 1-D arrays.
  723. The standardized Euclidean distance between two n-vectors `u` and `v` is
  724. .. math::
  725. \\sqrt{\\sum\\limits_i \\frac{1}{V_i} \\left(u_i-v_i \\right)^2}
  726. ``V`` is the variance vector; ``V[I]`` is the variance computed over all the i-th
  727. components of the points. If not passed, it is automatically computed.
  728. Parameters
  729. ----------
  730. u : (N,) array_like
  731. Input array.
  732. v : (N,) array_like
  733. Input array.
  734. V : (N,) array_like
  735. `V` is a 1-D array of component variances. It is usually computed
  736. among a larger collection of vectors.
  737. Returns
  738. -------
  739. seuclidean : double
  740. The standardized Euclidean distance between vectors `u` and `v`.
  741. Examples
  742. --------
  743. >>> from scipy.spatial import distance
  744. >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [0.1, 0.1, 0.1])
  745. 4.4721359549995796
  746. >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [1, 0.1, 0.1])
  747. 3.3166247903553998
  748. >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [10, 0.1, 0.1])
  749. 3.1780497164141406
  750. """
  751. u = _validate_vector(u)
  752. v = _validate_vector(v)
  753. V = _validate_vector(V, dtype=np.float64)
  754. if V.shape[0] != u.shape[0] or u.shape[0] != v.shape[0]:
  755. raise TypeError('V must be a 1-D array of the same dimension '
  756. 'as u and v.')
  757. return euclidean(u, v, w=1/V)
  758. def cityblock(u, v, w=None):
  759. """
  760. Compute the City Block (Manhattan) distance.
  761. Computes the Manhattan distance between two 1-D arrays `u` and `v`,
  762. which is defined as
  763. .. math::
  764. \\sum_i {\\left| u_i - v_i \\right|}.
  765. Parameters
  766. ----------
  767. u : (N,) array_like
  768. Input array.
  769. v : (N,) array_like
  770. Input array.
  771. w : (N,) array_like, optional
  772. The weights for each value in `u` and `v`. Default is None,
  773. which gives each value a weight of 1.0
  774. Returns
  775. -------
  776. cityblock : double
  777. The City Block (Manhattan) distance between vectors `u` and `v`.
  778. Examples
  779. --------
  780. >>> from scipy.spatial import distance
  781. >>> distance.cityblock([1, 0, 0], [0, 1, 0])
  782. 2
  783. >>> distance.cityblock([1, 0, 0], [0, 2, 0])
  784. 3
  785. >>> distance.cityblock([1, 0, 0], [1, 1, 0])
  786. 1
  787. """
  788. u = _validate_vector(u)
  789. v = _validate_vector(v)
  790. l1_diff = abs(u - v)
  791. if w is not None:
  792. w = _validate_weights(w)
  793. l1_diff = w * l1_diff
  794. return l1_diff.sum()
  795. def mahalanobis(u, v, VI):
  796. """
  797. Compute the Mahalanobis distance between two 1-D arrays.
  798. The Mahalanobis distance between 1-D arrays `u` and `v`, is defined as
  799. .. math::
  800. \\sqrt{ (u-v) V^{-1} (u-v)^T }
  801. where ``V`` is the covariance matrix. Note that the argument `VI`
  802. is the inverse of ``V``.
  803. Parameters
  804. ----------
  805. u : (N,) array_like
  806. Input array.
  807. v : (N,) array_like
  808. Input array.
  809. VI : array_like
  810. The inverse of the covariance matrix.
  811. Returns
  812. -------
  813. mahalanobis : double
  814. The Mahalanobis distance between vectors `u` and `v`.
  815. Examples
  816. --------
  817. >>> from scipy.spatial import distance
  818. >>> iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
  819. >>> distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
  820. 1.0
  821. >>> distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
  822. 1.0
  823. >>> distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
  824. 1.7320508075688772
  825. """
  826. u = _validate_vector(u)
  827. v = _validate_vector(v)
  828. VI = np.atleast_2d(VI)
  829. delta = u - v
  830. m = np.dot(np.dot(delta, VI), delta)
  831. return np.sqrt(m)
  832. def chebyshev(u, v, w=None):
  833. r"""
  834. Compute the Chebyshev distance.
  835. The *Chebyshev distance* between real vectors
  836. :math:`u \equiv (u_1, \cdots, u_n)` and
  837. :math:`v \equiv (v_1, \cdots, v_n)` is defined as [1]_
  838. .. math::
  839. d_\textrm{chebyshev}(u,v) := \max_{1 \le i \le n} |u_i-v_i|
  840. If a (non-negative) weight vector :math:`w \equiv (w_1, \cdots, w_n)`
  841. is supplied, the *weighted Chebyshev distance* is defined to be the
  842. weighted Minkowski distance of infinite order; that is,
  843. .. math::
  844. \begin{align}
  845. d_\textrm{chebyshev}(u,v;w) &:= \lim_{p\rightarrow \infty}
  846. \left( \sum_{i=1}^n w_i | u_i-v_i |^p \right)^\frac{1}{p} \\
  847. &= \max_{1 \le i \le n} 1_{w_i > 0} | u_i - v_i |
  848. \end{align}
  849. Parameters
  850. ----------
  851. u : (N,) array_like of floats
  852. Input vector.
  853. v : (N,) array_like of floats
  854. Input vector.
  855. w : (N,) array_like of floats, optional
  856. Weight vector. Default is ``None``, which gives all pairs
  857. :math:`(u_i, v_i)` the same weight ``1.0``.
  858. Returns
  859. -------
  860. chebyshev : float
  861. The Chebyshev distance between vectors `u` and `v`, optionally weighted
  862. by `w`.
  863. References
  864. ----------
  865. .. [1] https://en.wikipedia.org/wiki/Chebyshev_distance
  866. Examples
  867. --------
  868. >>> from scipy.spatial import distance
  869. >>> distance.chebyshev([1, 0, 0], [0, 1, 0])
  870. 1
  871. >>> distance.chebyshev([1, 1, 0], [0, 1, 0])
  872. 1
  873. """
  874. u = _validate_vector(u)
  875. v = _validate_vector(v)
  876. if w is not None:
  877. w = _validate_weights(w)
  878. return max((w > 0) * abs(u - v))
  879. return max(abs(u - v))
  880. def braycurtis(u, v, w=None):
  881. """
  882. Compute the Bray-Curtis distance between two 1-D arrays.
  883. Bray-Curtis distance is defined as
  884. .. math::
  885. \\sum{|u_i-v_i|} / \\sum{|u_i+v_i|}
  886. The Bray-Curtis distance is in the range [0, 1] if all coordinates are
  887. positive, and is undefined if the inputs are of length zero.
  888. Parameters
  889. ----------
  890. u : (N,) array_like
  891. Input array.
  892. v : (N,) array_like
  893. Input array.
  894. w : (N,) array_like, optional
  895. The weights for each value in `u` and `v`. Default is None,
  896. which gives each value a weight of 1.0
  897. Returns
  898. -------
  899. braycurtis : double
  900. The Bray-Curtis distance between 1-D arrays `u` and `v`.
  901. Examples
  902. --------
  903. >>> from scipy.spatial import distance
  904. >>> distance.braycurtis([1, 0, 0], [0, 1, 0])
  905. 1.0
  906. >>> distance.braycurtis([1, 1, 0], [0, 1, 0])
  907. 0.33333333333333331
  908. """
  909. u = _validate_vector(u)
  910. v = _validate_vector(v, dtype=np.float64)
  911. l1_diff = abs(u - v)
  912. l1_sum = abs(u + v)
  913. if w is not None:
  914. w = _validate_weights(w)
  915. l1_diff = w * l1_diff
  916. l1_sum = w * l1_sum
  917. return l1_diff.sum() / l1_sum.sum()
  918. def canberra(u, v, w=None):
  919. """
  920. Compute the Canberra distance between two 1-D arrays.
  921. The Canberra distance is defined as
  922. .. math::
  923. d(u,v) = \\sum_i \\frac{|u_i-v_i|}
  924. {|u_i|+|v_i|}.
  925. Parameters
  926. ----------
  927. u : (N,) array_like
  928. Input array.
  929. v : (N,) array_like
  930. Input array.
  931. w : (N,) array_like, optional
  932. The weights for each value in `u` and `v`. Default is None,
  933. which gives each value a weight of 1.0
  934. Returns
  935. -------
  936. canberra : double
  937. The Canberra distance between vectors `u` and `v`.
  938. Notes
  939. -----
  940. When ``u[i]`` and ``v[i]`` are 0 for given i, then the fraction 0/0 = 0 is
  941. used in the calculation.
  942. Examples
  943. --------
  944. >>> from scipy.spatial import distance
  945. >>> distance.canberra([1, 0, 0], [0, 1, 0])
  946. 2.0
  947. >>> distance.canberra([1, 1, 0], [0, 1, 0])
  948. 1.0
  949. """
  950. u = _validate_vector(u)
  951. v = _validate_vector(v, dtype=np.float64)
  952. if w is not None:
  953. w = _validate_weights(w)
  954. with np.errstate(invalid='ignore'):
  955. abs_uv = abs(u - v)
  956. abs_u = abs(u)
  957. abs_v = abs(v)
  958. d = abs_uv / (abs_u + abs_v)
  959. if w is not None:
  960. d = w * d
  961. d = np.nansum(d)
  962. return d
  963. def jensenshannon(p, q, base=None, *, axis=0, keepdims=False):
  964. """
  965. Compute the Jensen-Shannon distance (metric) between
  966. two probability arrays. This is the square root
  967. of the Jensen-Shannon divergence.
  968. The Jensen-Shannon distance between two probability
  969. vectors `p` and `q` is defined as,
  970. .. math::
  971. \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}
  972. where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
  973. and :math:`D` is the Kullback-Leibler divergence.
  974. This routine will normalize `p` and `q` if they don't sum to 1.0.
  975. Parameters
  976. ----------
  977. p : (N,) array_like
  978. left probability vector
  979. q : (N,) array_like
  980. right probability vector
  981. base : double, optional
  982. the base of the logarithm used to compute the output
  983. if not given, then the routine uses the default base of
  984. scipy.stats.entropy.
  985. axis : int, optional
  986. Axis along which the Jensen-Shannon distances are computed. The default
  987. is 0.
  988. .. versionadded:: 1.7.0
  989. keepdims : bool, optional
  990. If this is set to `True`, the reduced axes are left in the
  991. result as dimensions with size one. With this option,
  992. the result will broadcast correctly against the input array.
  993. Default is False.
  994. .. versionadded:: 1.7.0
  995. Returns
  996. -------
  997. js : double or ndarray
  998. The Jensen-Shannon distances between `p` and `q` along the `axis`.
  999. Notes
  1000. -----
  1001. .. versionadded:: 1.2.0
  1002. Examples
  1003. --------
  1004. >>> from scipy.spatial import distance
  1005. >>> import numpy as np
  1006. >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
  1007. 1.0
  1008. >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
  1009. 0.46450140402245893
  1010. >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
  1011. 0.0
  1012. >>> a = np.array([[1, 2, 3, 4],
  1013. ... [5, 6, 7, 8],
  1014. ... [9, 10, 11, 12]])
  1015. >>> b = np.array([[13, 14, 15, 16],
  1016. ... [17, 18, 19, 20],
  1017. ... [21, 22, 23, 24]])
  1018. >>> distance.jensenshannon(a, b, axis=0)
  1019. array([0.1954288, 0.1447697, 0.1138377, 0.0927636])
  1020. >>> distance.jensenshannon(a, b, axis=1)
  1021. array([0.1402339, 0.0399106, 0.0201815])
  1022. """
  1023. p = np.asarray(p)
  1024. q = np.asarray(q)
  1025. p = p / np.sum(p, axis=axis, keepdims=True)
  1026. q = q / np.sum(q, axis=axis, keepdims=True)
  1027. m = (p + q) / 2.0
  1028. left = rel_entr(p, m)
  1029. right = rel_entr(q, m)
  1030. left_sum = np.sum(left, axis=axis, keepdims=keepdims)
  1031. right_sum = np.sum(right, axis=axis, keepdims=keepdims)
  1032. js = left_sum + right_sum
  1033. if base is not None:
  1034. js /= np.log(base)
  1035. return np.sqrt(js / 2.0)
  1036. def yule(u, v, w=None):
  1037. """
  1038. Compute the Yule dissimilarity between two boolean 1-D arrays.
  1039. The Yule dissimilarity is defined as
  1040. .. math::
  1041. \\frac{R}{c_{TT} * c_{FF} + \\frac{R}{2}}
  1042. where :math:`c_{ij}` is the number of occurrences of
  1043. :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
  1044. :math:`k < n` and :math:`R = 2.0 * c_{TF} * c_{FT}`.
  1045. Parameters
  1046. ----------
  1047. u : (N,) array_like, bool
  1048. Input array.
  1049. v : (N,) array_like, bool
  1050. Input array.
  1051. w : (N,) array_like, optional
  1052. The weights for each value in `u` and `v`. Default is None,
  1053. which gives each value a weight of 1.0
  1054. Returns
  1055. -------
  1056. yule : double
  1057. The Yule dissimilarity between vectors `u` and `v`.
  1058. Examples
  1059. --------
  1060. >>> from scipy.spatial import distance
  1061. >>> distance.yule([1, 0, 0], [0, 1, 0])
  1062. 2.0
  1063. >>> distance.yule([1, 1, 0], [0, 1, 0])
  1064. 0.0
  1065. """
  1066. u = _validate_vector(u)
  1067. v = _validate_vector(v)
  1068. if w is not None:
  1069. w = _validate_weights(w)
  1070. (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v, w=w)
  1071. half_R = ntf * nft
  1072. if half_R == 0:
  1073. return 0.0
  1074. else:
  1075. return float(2.0 * half_R / (ntt * nff + half_R))
  1076. def dice(u, v, w=None):
  1077. """
  1078. Compute the Dice dissimilarity between two boolean 1-D arrays.
  1079. The Dice dissimilarity between `u` and `v`, is
  1080. .. math::
  1081. \\frac{c_{TF} + c_{FT}}
  1082. {2c_{TT} + c_{FT} + c_{TF}}
  1083. where :math:`c_{ij}` is the number of occurrences of
  1084. :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
  1085. :math:`k < n`.
  1086. Parameters
  1087. ----------
  1088. u : (N,) array_like, bool
  1089. Input 1-D array.
  1090. v : (N,) array_like, bool
  1091. Input 1-D array.
  1092. w : (N,) array_like, optional
  1093. The weights for each value in `u` and `v`. Default is None,
  1094. which gives each value a weight of 1.0
  1095. Returns
  1096. -------
  1097. dice : double
  1098. The Dice dissimilarity between 1-D arrays `u` and `v`.
  1099. Notes
  1100. -----
  1101. This function computes the Dice dissimilarity index. To compute the
  1102. Dice similarity index, convert one to the other with similarity =
  1103. 1 - dissimilarity.
  1104. Examples
  1105. --------
  1106. >>> from scipy.spatial import distance
  1107. >>> distance.dice([1, 0, 0], [0, 1, 0])
  1108. 1.0
  1109. >>> distance.dice([1, 0, 0], [1, 1, 0])
  1110. 0.3333333333333333
  1111. >>> distance.dice([1, 0, 0], [2, 0, 0])
  1112. -0.3333333333333333
  1113. """
  1114. u = _validate_vector(u)
  1115. v = _validate_vector(v)
  1116. if w is not None:
  1117. w = _validate_weights(w)
  1118. if u.dtype == v.dtype == bool and w is None:
  1119. ntt = (u & v).sum()
  1120. else:
  1121. dtype = np.result_type(int, u.dtype, v.dtype)
  1122. u = u.astype(dtype)
  1123. v = v.astype(dtype)
  1124. if w is None:
  1125. ntt = (u * v).sum()
  1126. else:
  1127. ntt = (u * v * w).sum()
  1128. (nft, ntf) = _nbool_correspond_ft_tf(u, v, w=w)
  1129. return float((ntf + nft) / np.array(2.0 * ntt + ntf + nft))
  1130. def rogerstanimoto(u, v, w=None):
  1131. """
  1132. Compute the Rogers-Tanimoto dissimilarity between two boolean 1-D arrays.
  1133. The Rogers-Tanimoto dissimilarity between two boolean 1-D arrays
  1134. `u` and `v`, is defined as
  1135. .. math::
  1136. \\frac{R}
  1137. {c_{TT} + c_{FF} + R}
  1138. where :math:`c_{ij}` is the number of occurrences of
  1139. :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
  1140. :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.
  1141. Parameters
  1142. ----------
  1143. u : (N,) array_like, bool
  1144. Input array.
  1145. v : (N,) array_like, bool
  1146. Input array.
  1147. w : (N,) array_like, optional
  1148. The weights for each value in `u` and `v`. Default is None,
  1149. which gives each value a weight of 1.0
  1150. Returns
  1151. -------
  1152. rogerstanimoto : double
  1153. The Rogers-Tanimoto dissimilarity between vectors
  1154. `u` and `v`.
  1155. Examples
  1156. --------
  1157. >>> from scipy.spatial import distance
  1158. >>> distance.rogerstanimoto([1, 0, 0], [0, 1, 0])
  1159. 0.8
  1160. >>> distance.rogerstanimoto([1, 0, 0], [1, 1, 0])
  1161. 0.5
  1162. >>> distance.rogerstanimoto([1, 0, 0], [2, 0, 0])
  1163. -1.0
  1164. """
  1165. u = _validate_vector(u)
  1166. v = _validate_vector(v)
  1167. if w is not None:
  1168. w = _validate_weights(w)
  1169. (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v, w=w)
  1170. return float(2.0 * (ntf + nft)) / float(ntt + nff + (2.0 * (ntf + nft)))
  1171. def russellrao(u, v, w=None):
  1172. """
  1173. Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.
  1174. The Russell-Rao dissimilarity between two boolean 1-D arrays, `u` and
  1175. `v`, is defined as
  1176. .. math::
  1177. \\frac{n - c_{TT}}
  1178. {n}
  1179. where :math:`c_{ij}` is the number of occurrences of
  1180. :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
  1181. :math:`k < n`.
  1182. Parameters
  1183. ----------
  1184. u : (N,) array_like, bool
  1185. Input array.
  1186. v : (N,) array_like, bool
  1187. Input array.
  1188. w : (N,) array_like, optional
  1189. The weights for each value in `u` and `v`. Default is None,
  1190. which gives each value a weight of 1.0
  1191. Returns
  1192. -------
  1193. russellrao : double
  1194. The Russell-Rao dissimilarity between vectors `u` and `v`.
  1195. Examples
  1196. --------
  1197. >>> from scipy.spatial import distance
  1198. >>> distance.russellrao([1, 0, 0], [0, 1, 0])
  1199. 1.0
  1200. >>> distance.russellrao([1, 0, 0], [1, 1, 0])
  1201. 0.6666666666666666
  1202. >>> distance.russellrao([1, 0, 0], [2, 0, 0])
  1203. 0.3333333333333333
  1204. """
  1205. u = _validate_vector(u)
  1206. v = _validate_vector(v)
  1207. if u.dtype == v.dtype == bool and w is None:
  1208. ntt = (u & v).sum()
  1209. n = float(len(u))
  1210. elif w is None:
  1211. ntt = (u * v).sum()
  1212. n = float(len(u))
  1213. else:
  1214. w = _validate_weights(w)
  1215. ntt = (u * v * w).sum()
  1216. n = w.sum()
  1217. return float(n - ntt) / n
  1218. def sokalsneath(u, v, w=None):
  1219. """
  1220. Compute the Sokal-Sneath dissimilarity between two boolean 1-D arrays.
  1221. The Sokal-Sneath dissimilarity between `u` and `v`,
  1222. .. math::
  1223. \\frac{R}
  1224. {c_{TT} + R}
  1225. where :math:`c_{ij}` is the number of occurrences of
  1226. :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
  1227. :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.
  1228. Parameters
  1229. ----------
  1230. u : (N,) array_like, bool
  1231. Input array.
  1232. v : (N,) array_like, bool
  1233. Input array.
  1234. w : (N,) array_like, optional
  1235. The weights for each value in `u` and `v`. Default is None,
  1236. which gives each value a weight of 1.0
  1237. Returns
  1238. -------
  1239. sokalsneath : double
  1240. The Sokal-Sneath dissimilarity between vectors `u` and `v`.
  1241. Examples
  1242. --------
  1243. >>> from scipy.spatial import distance
  1244. >>> distance.sokalsneath([1, 0, 0], [0, 1, 0])
  1245. 1.0
  1246. >>> distance.sokalsneath([1, 0, 0], [1, 1, 0])
  1247. 0.66666666666666663
  1248. >>> distance.sokalsneath([1, 0, 0], [2, 1, 0])
  1249. 0.0
  1250. >>> distance.sokalsneath([1, 0, 0], [3, 1, 0])
  1251. -2.0
  1252. """
  1253. u = _validate_vector(u)
  1254. v = _validate_vector(v)
  1255. if u.dtype == v.dtype == bool and w is None:
  1256. ntt = (u & v).sum()
  1257. elif w is None:
  1258. ntt = (u * v).sum()
  1259. else:
  1260. w = _validate_weights(w)
  1261. ntt = (u * v * w).sum()
  1262. (nft, ntf) = _nbool_correspond_ft_tf(u, v, w=w)
  1263. denom = np.array(ntt + 2.0 * (ntf + nft))
  1264. if not denom.any():
  1265. raise ValueError('Sokal-Sneath dissimilarity is not defined for '
  1266. 'vectors that are entirely false.')
  1267. return float(2.0 * (ntf + nft)) / denom
  1268. _convert_to_double = partial(_convert_to_type, out_type=np.float64)
  1269. _convert_to_bool = partial(_convert_to_type, out_type=bool)
  1270. # adding python-only wrappers to _distance_wrap module
  1271. _distance_wrap.pdist_correlation_double_wrap = _correlation_pdist_wrap
  1272. _distance_wrap.cdist_correlation_double_wrap = _correlation_cdist_wrap
  1273. @dataclasses.dataclass(frozen=True)
  1274. class CDistMetricWrapper:
  1275. metric_name: str
  1276. def __call__(self, XA, XB, *, out=None, **kwargs):
  1277. XA = np.ascontiguousarray(XA)
  1278. XB = np.ascontiguousarray(XB)
  1279. mA, n = XA.shape
  1280. mB, _ = XB.shape
  1281. metric_name = self.metric_name
  1282. metric_info = _METRICS[metric_name]
  1283. XA, XB, typ, kwargs = _validate_cdist_input(
  1284. XA, XB, mA, mB, n, metric_info, **kwargs)
  1285. w = kwargs.pop('w', None)
  1286. if w is not None:
  1287. metric = metric_info.dist_func
  1288. return _cdist_callable(
  1289. XA, XB, metric=metric, out=out, w=w, **kwargs)
  1290. dm = _prepare_out_argument(out, np.float64, (mA, mB))
  1291. # get cdist wrapper
  1292. cdist_fn = getattr(_distance_wrap, f'cdist_{metric_name}_{typ}_wrap')
  1293. cdist_fn(XA, XB, dm, **kwargs)
  1294. return dm
  1295. @dataclasses.dataclass(frozen=True)
  1296. class PDistMetricWrapper:
  1297. metric_name: str
  1298. def __call__(self, X, *, out=None, **kwargs):
  1299. X = np.ascontiguousarray(X)
  1300. m, n = X.shape
  1301. metric_name = self.metric_name
  1302. metric_info = _METRICS[metric_name]
  1303. X, typ, kwargs = _validate_pdist_input(
  1304. X, m, n, metric_info, **kwargs)
  1305. out_size = (m * (m - 1)) // 2
  1306. w = kwargs.pop('w', None)
  1307. if w is not None:
  1308. metric = metric_info.dist_func
  1309. return _pdist_callable(
  1310. X, metric=metric, out=out, w=w, **kwargs)
  1311. dm = _prepare_out_argument(out, np.float64, (out_size,))
  1312. # get pdist wrapper
  1313. pdist_fn = getattr(_distance_wrap, f'pdist_{metric_name}_{typ}_wrap')
  1314. pdist_fn(X, dm, **kwargs)
  1315. return dm
  1316. @dataclasses.dataclass(frozen=True)
  1317. class MetricInfo:
  1318. # Name of python distance function
  1319. canonical_name: str
  1320. # All aliases, including canonical_name
  1321. aka: set[str]
  1322. # unvectorized distance function
  1323. dist_func: Callable
  1324. # Optimized cdist function
  1325. cdist_func: Callable
  1326. # Optimized pdist function
  1327. pdist_func: Callable
  1328. # function that checks kwargs and computes default values:
  1329. # f(X, m, n, **kwargs)
  1330. validator: Callable | None = None
  1331. # list of supported types:
  1332. # X (pdist) and XA (cdist) are used to choose the type. if there is no
  1333. # match the first type is used. Default double
  1334. types: list[str] = dataclasses.field(default_factory=lambda: ['double'])
  1335. # true if out array must be C-contiguous
  1336. requires_contiguous_out: bool = True
  1337. # Registry of implemented metrics:
  1338. _METRIC_INFOS = [
  1339. MetricInfo(
  1340. canonical_name='braycurtis',
  1341. aka={'braycurtis'},
  1342. dist_func=braycurtis,
  1343. cdist_func=_distance_pybind.cdist_braycurtis,
  1344. pdist_func=_distance_pybind.pdist_braycurtis,
  1345. ),
  1346. MetricInfo(
  1347. canonical_name='canberra',
  1348. aka={'canberra'},
  1349. dist_func=canberra,
  1350. cdist_func=_distance_pybind.cdist_canberra,
  1351. pdist_func=_distance_pybind.pdist_canberra,
  1352. ),
  1353. MetricInfo(
  1354. canonical_name='chebyshev',
  1355. aka={'chebychev', 'chebyshev', 'cheby', 'cheb', 'ch'},
  1356. dist_func=chebyshev,
  1357. cdist_func=_distance_pybind.cdist_chebyshev,
  1358. pdist_func=_distance_pybind.pdist_chebyshev,
  1359. ),
  1360. MetricInfo(
  1361. canonical_name='cityblock',
  1362. aka={'cityblock', 'cblock', 'cb', 'c'},
  1363. dist_func=cityblock,
  1364. cdist_func=_distance_pybind.cdist_cityblock,
  1365. pdist_func=_distance_pybind.pdist_cityblock,
  1366. ),
  1367. MetricInfo(
  1368. canonical_name='correlation',
  1369. aka={'correlation', 'co'},
  1370. dist_func=correlation,
  1371. cdist_func=CDistMetricWrapper('correlation'),
  1372. pdist_func=PDistMetricWrapper('correlation'),
  1373. ),
  1374. MetricInfo(
  1375. canonical_name='cosine',
  1376. aka={'cosine', 'cos'},
  1377. dist_func=cosine,
  1378. cdist_func=CDistMetricWrapper('cosine'),
  1379. pdist_func=PDistMetricWrapper('cosine'),
  1380. ),
  1381. MetricInfo(
  1382. canonical_name='dice',
  1383. aka={'dice'},
  1384. types=['bool'],
  1385. dist_func=dice,
  1386. cdist_func=_distance_pybind.cdist_dice,
  1387. pdist_func=_distance_pybind.pdist_dice,
  1388. ),
  1389. MetricInfo(
  1390. canonical_name='euclidean',
  1391. aka={'euclidean', 'euclid', 'eu', 'e'},
  1392. dist_func=euclidean,
  1393. cdist_func=_distance_pybind.cdist_euclidean,
  1394. pdist_func=_distance_pybind.pdist_euclidean,
  1395. ),
  1396. MetricInfo(
  1397. canonical_name='hamming',
  1398. aka={'matching', 'hamming', 'hamm', 'ha', 'h'},
  1399. types=['double', 'bool'],
  1400. validator=_validate_hamming_kwargs,
  1401. dist_func=hamming,
  1402. cdist_func=_distance_pybind.cdist_hamming,
  1403. pdist_func=_distance_pybind.pdist_hamming,
  1404. ),
  1405. MetricInfo(
  1406. canonical_name='jaccard',
  1407. aka={'jaccard', 'jacc', 'ja', 'j'},
  1408. types=['double', 'bool'],
  1409. dist_func=jaccard,
  1410. cdist_func=_distance_pybind.cdist_jaccard,
  1411. pdist_func=_distance_pybind.pdist_jaccard,
  1412. ),
  1413. MetricInfo(
  1414. canonical_name='jensenshannon',
  1415. aka={'jensenshannon', 'js'},
  1416. dist_func=jensenshannon,
  1417. cdist_func=CDistMetricWrapper('jensenshannon'),
  1418. pdist_func=PDistMetricWrapper('jensenshannon'),
  1419. ),
  1420. MetricInfo(
  1421. canonical_name='mahalanobis',
  1422. aka={'mahalanobis', 'mahal', 'mah'},
  1423. validator=_validate_mahalanobis_kwargs,
  1424. dist_func=mahalanobis,
  1425. cdist_func=CDistMetricWrapper('mahalanobis'),
  1426. pdist_func=PDistMetricWrapper('mahalanobis'),
  1427. ),
  1428. MetricInfo(
  1429. canonical_name='minkowski',
  1430. aka={'minkowski', 'mi', 'm', 'pnorm'},
  1431. validator=_validate_minkowski_kwargs,
  1432. dist_func=minkowski,
  1433. cdist_func=_distance_pybind.cdist_minkowski,
  1434. pdist_func=_distance_pybind.pdist_minkowski,
  1435. ),
  1436. MetricInfo(
  1437. canonical_name='rogerstanimoto',
  1438. aka={'rogerstanimoto'},
  1439. types=['bool'],
  1440. dist_func=rogerstanimoto,
  1441. cdist_func=_distance_pybind.cdist_rogerstanimoto,
  1442. pdist_func=_distance_pybind.pdist_rogerstanimoto,
  1443. ),
  1444. MetricInfo(
  1445. canonical_name='russellrao',
  1446. aka={'russellrao'},
  1447. types=['bool'],
  1448. dist_func=russellrao,
  1449. cdist_func=_distance_pybind.cdist_russellrao,
  1450. pdist_func=_distance_pybind.pdist_russellrao,
  1451. ),
  1452. MetricInfo(
  1453. canonical_name='seuclidean',
  1454. aka={'seuclidean', 'se', 's'},
  1455. validator=_validate_seuclidean_kwargs,
  1456. dist_func=seuclidean,
  1457. cdist_func=CDistMetricWrapper('seuclidean'),
  1458. pdist_func=PDistMetricWrapper('seuclidean'),
  1459. ),
  1460. MetricInfo(
  1461. canonical_name='sokalsneath',
  1462. aka={'sokalsneath'},
  1463. types=['bool'],
  1464. dist_func=sokalsneath,
  1465. cdist_func=_distance_pybind.cdist_sokalsneath,
  1466. pdist_func=_distance_pybind.pdist_sokalsneath,
  1467. ),
  1468. MetricInfo(
  1469. canonical_name='sqeuclidean',
  1470. aka={'sqeuclidean', 'sqe', 'sqeuclid'},
  1471. dist_func=sqeuclidean,
  1472. cdist_func=_distance_pybind.cdist_sqeuclidean,
  1473. pdist_func=_distance_pybind.pdist_sqeuclidean,
  1474. ),
  1475. MetricInfo(
  1476. canonical_name='yule',
  1477. aka={'yule'},
  1478. types=['bool'],
  1479. dist_func=yule,
  1480. cdist_func=_distance_pybind.cdist_yule,
  1481. pdist_func=_distance_pybind.pdist_yule,
  1482. ),
  1483. ]
  1484. _METRICS = {info.canonical_name: info for info in _METRIC_INFOS}
  1485. _METRIC_ALIAS = {alias: info
  1486. for info in _METRIC_INFOS
  1487. for alias in info.aka}
  1488. _METRICS_NAMES = list(_METRICS.keys())
  1489. _TEST_METRICS = {'test_' + info.canonical_name: info for info in _METRIC_INFOS}
  1490. def pdist(X, metric='euclidean', *, out=None, **kwargs):
  1491. """
  1492. Pairwise distances between observations in n-dimensional space.
  1493. See Notes for common calling conventions.
  1494. Parameters
  1495. ----------
  1496. X : array_like
  1497. An m by n array of m original observations in an
  1498. n-dimensional space.
  1499. metric : str or function, optional
  1500. The distance metric to use. The distance function can
  1501. be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
  1502. 'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
  1503. 'jaccard', 'jensenshannon',
  1504. 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
  1505. 'russellrao', 'seuclidean', 'sokalsneath',
  1506. 'sqeuclidean', 'yule'.
  1507. out : ndarray, optional
  1508. The output array.
  1509. If not None, condensed distance matrix Y is stored in this array.
  1510. **kwargs : dict, optional
  1511. Extra arguments to `metric`: refer to each metric documentation for a
  1512. list of all possible arguments.
  1513. Some possible arguments:
  1514. p : scalar
  1515. The p-norm to apply for Minkowski, weighted and unweighted.
  1516. Default: 2.
  1517. w : ndarray
  1518. The weight vector for metrics that support weights (e.g., Minkowski).
  1519. V : ndarray
  1520. The variance vector for standardized Euclidean.
  1521. Default: var(X, axis=0, ddof=1)
  1522. VI : ndarray
  1523. The inverse of the covariance matrix for Mahalanobis.
  1524. Default: inv(cov(X.T)).T
  1525. Returns
  1526. -------
  1527. Y : ndarray
  1528. Returns a condensed distance matrix Y. For each :math:`i` and :math:`j`
  1529. (where :math:`i<j<m`),where m is the number of original observations.
  1530. The metric ``dist(u=X[i], v=X[j])`` is computed and stored in entry ``m
  1531. * i + j - ((i + 2) * (i + 1)) // 2``.
  1532. See Also
  1533. --------
  1534. squareform : converts between condensed distance matrices and
  1535. square distance matrices.
  1536. Notes
  1537. -----
  1538. See ``squareform`` for information on how to calculate the index of
  1539. this entry or to convert the condensed distance matrix to a
  1540. redundant square matrix.
  1541. The following are common calling conventions.
  1542. 1. ``Y = pdist(X, 'euclidean')``
  1543. Computes the distance between m points using Euclidean distance
  1544. (2-norm) as the distance metric between the points. The points
  1545. are arranged as m n-dimensional row vectors in the matrix X.
  1546. 2. ``Y = pdist(X, 'minkowski', p=2.)``
  1547. Computes the distances using the Minkowski distance
  1548. :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
  1549. that this is only a quasi-metric if :math:`0 < p < 1`).
  1550. 3. ``Y = pdist(X, 'cityblock')``
  1551. Computes the city block or Manhattan distance between the
  1552. points.
  1553. 4. ``Y = pdist(X, 'seuclidean', V=None)``
  1554. Computes the standardized Euclidean distance. The standardized
  1555. Euclidean distance between two n-vectors ``u`` and ``v`` is
  1556. .. math::
  1557. \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}
  1558. V is the variance vector; V[i] is the variance computed over all
  1559. the i'th components of the points. If not passed, it is
  1560. automatically computed.
  1561. 5. ``Y = pdist(X, 'sqeuclidean')``
  1562. Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
  1563. the vectors.
  1564. 6. ``Y = pdist(X, 'cosine')``
  1565. Computes the cosine distance between vectors u and v,
  1566. .. math::
  1567. 1 - \\frac{u \\cdot v}
  1568. {{\\|u\\|}_2 {\\|v\\|}_2}
  1569. where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
  1570. :math:`u \\cdot v` is the dot product of ``u`` and ``v``.
  1571. 7. ``Y = pdist(X, 'correlation')``
  1572. Computes the correlation distance between vectors u and v. This is
  1573. .. math::
  1574. 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
  1575. {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}
  1576. where :math:`\\bar{v}` is the mean of the elements of vector v,
  1577. and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.
  1578. 8. ``Y = pdist(X, 'hamming')``
  1579. Computes the normalized Hamming distance, or the proportion of
  1580. those vector elements between two n-vectors ``u`` and ``v``
  1581. which disagree. To save memory, the matrix ``X`` can be of type
  1582. boolean.
  1583. 9. ``Y = pdist(X, 'jaccard')``
  1584. Computes the Jaccard distance between the points. Given two
  1585. vectors, ``u`` and ``v``, the Jaccard distance is the
  1586. proportion of those elements ``u[i]`` and ``v[i]`` that
  1587. disagree.
  1588. 10. ``Y = pdist(X, 'jensenshannon')``
  1589. Computes the Jensen-Shannon distance between two probability arrays.
  1590. Given two probability vectors, :math:`p` and :math:`q`, the
  1591. Jensen-Shannon distance is
  1592. .. math::
  1593. \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}
  1594. where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
  1595. and :math:`D` is the Kullback-Leibler divergence.
  1596. 11. ``Y = pdist(X, 'chebyshev')``
  1597. Computes the Chebyshev distance between the points. The
  1598. Chebyshev distance between two n-vectors ``u`` and ``v`` is the
  1599. maximum norm-1 distance between their respective elements. More
  1600. precisely, the distance is given by
  1601. .. math::
  1602. d(u,v) = \\max_i {|u_i-v_i|}
  1603. 12. ``Y = pdist(X, 'canberra')``
  1604. Computes the Canberra distance between the points. The
  1605. Canberra distance between two points ``u`` and ``v`` is
  1606. .. math::
  1607. d(u,v) = \\sum_i \\frac{|u_i-v_i|}
  1608. {|u_i|+|v_i|}
  1609. 13. ``Y = pdist(X, 'braycurtis')``
  1610. Computes the Bray-Curtis distance between the points. The
  1611. Bray-Curtis distance between two points ``u`` and ``v`` is
  1612. .. math::
  1613. d(u,v) = \\frac{\\sum_i {|u_i-v_i|}}
  1614. {\\sum_i {|u_i+v_i|}}
  1615. 14. ``Y = pdist(X, 'mahalanobis', VI=None)``
  1616. Computes the Mahalanobis distance between the points. The
  1617. Mahalanobis distance between two points ``u`` and ``v`` is
  1618. :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
  1619. variable) is the inverse covariance. If ``VI`` is not None,
  1620. ``VI`` will be used as the inverse covariance matrix.
  1621. 15. ``Y = pdist(X, 'yule')``
  1622. Computes the Yule distance between each pair of boolean
  1623. vectors. (see yule function documentation)
  1624. 16. ``Y = pdist(X, 'matching')``
  1625. Synonym for 'hamming'.
  1626. 17. ``Y = pdist(X, 'dice')``
  1627. Computes the Dice distance between each pair of boolean
  1628. vectors. (see dice function documentation)
  1629. 18. ``Y = pdist(X, 'rogerstanimoto')``
  1630. Computes the Rogers-Tanimoto distance between each pair of
  1631. boolean vectors. (see rogerstanimoto function documentation)
  1632. 19. ``Y = pdist(X, 'russellrao')``
  1633. Computes the Russell-Rao distance between each pair of
  1634. boolean vectors. (see russellrao function documentation)
  1635. 20. ``Y = pdist(X, 'sokalsneath')``
  1636. Computes the Sokal-Sneath distance between each pair of
  1637. boolean vectors. (see sokalsneath function documentation)
  1638. 21. ``Y = pdist(X, f)``
  1639. Computes the distance between all pairs of vectors in X
  1640. using the user supplied 2-arity function f. For example,
  1641. Euclidean distance between the vectors could be computed
  1642. as follows::
  1643. dm = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))
  1644. Note that you should avoid passing a reference to one of
  1645. the distance functions defined in this library. For example,::
  1646. dm = pdist(X, sokalsneath)
  1647. would calculate the pair-wise distances between the vectors in
  1648. X using the Python function sokalsneath. This would result in
  1649. sokalsneath being called :math:`{n \\choose 2}` times, which
  1650. is inefficient. Instead, the optimized C version is more
  1651. efficient, and we call it using the following syntax.::
  1652. dm = pdist(X, 'sokalsneath')
  1653. Examples
  1654. --------
  1655. >>> import numpy as np
  1656. >>> from scipy.spatial.distance import pdist
  1657. ``x`` is an array of five points in three-dimensional space.
  1658. >>> x = np.array([[2, 0, 2], [2, 2, 3], [-2, 4, 5], [0, 1, 9], [2, 2, 4]])
  1659. ``pdist(x)`` with no additional arguments computes the 10 pairwise
  1660. Euclidean distances:
  1661. >>> pdist(x)
  1662. array([2.23606798, 6.40312424, 7.34846923, 2.82842712, 4.89897949,
  1663. 6.40312424, 1. , 5.38516481, 4.58257569, 5.47722558])
  1664. The following computes the pairwise Minkowski distances with ``p = 3.5``:
  1665. >>> pdist(x, metric='minkowski', p=3.5)
  1666. array([2.04898923, 5.1154929 , 7.02700737, 2.43802731, 4.19042714,
  1667. 6.03956994, 1. , 4.45128103, 4.10636143, 5.0619695 ])
  1668. The pairwise city block or Manhattan distances:
  1669. >>> pdist(x, metric='cityblock')
  1670. array([ 3., 11., 10., 4., 8., 9., 1., 9., 7., 8.])
  1671. """
  1672. # You can also call this as:
  1673. # Y = pdist(X, 'test_abc')
  1674. # where 'abc' is the metric being tested. This computes the distance
  1675. # between all pairs of vectors in X using the distance metric 'abc' but
  1676. # with a more succinct, verifiable, but less efficient implementation.
  1677. X = _asarray(X)
  1678. if X.ndim != 2:
  1679. raise ValueError('A 2-dimensional array must be passed. '
  1680. f'(Shape was {X.shape}).')
  1681. n = X.shape[0]
  1682. return xpx.lazy_apply(_np_pdist, X, out,
  1683. # lazy_apply doesn't support Array kwargs
  1684. kwargs.pop('w', None),
  1685. kwargs.pop('V', None),
  1686. kwargs.pop('VI', None),
  1687. # See src/distance_pybind.cpp::pdist
  1688. shape=((n * (n - 1)) // 2, ), dtype=X.dtype,
  1689. as_numpy=True, metric=metric, **kwargs)
  1690. def _np_pdist(X, out, w, V, VI, metric='euclidean', **kwargs):
  1691. X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,
  1692. check_finite=False)
  1693. m, n = X.shape
  1694. if w is not None:
  1695. kwargs["w"] = w
  1696. if V is not None:
  1697. kwargs["V"] = V
  1698. if VI is not None:
  1699. kwargs["VI"] = VI
  1700. if callable(metric):
  1701. mstr = getattr(metric, '__name__', 'UnknownCustomMetric')
  1702. metric_info = _METRIC_ALIAS.get(mstr, None)
  1703. if metric_info is not None:
  1704. X, typ, kwargs = _validate_pdist_input(
  1705. X, m, n, metric_info, **kwargs)
  1706. return _pdist_callable(X, metric=metric, out=out, **kwargs)
  1707. elif isinstance(metric, str):
  1708. mstr = metric.lower()
  1709. metric_info = _METRIC_ALIAS.get(mstr, None)
  1710. if metric_info is not None:
  1711. pdist_fn = metric_info.pdist_func
  1712. return pdist_fn(X, out=out, **kwargs)
  1713. elif mstr.startswith("test_"):
  1714. metric_info = _TEST_METRICS.get(mstr, None)
  1715. if metric_info is None:
  1716. raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}')
  1717. X, typ, kwargs = _validate_pdist_input(
  1718. X, m, n, metric_info, **kwargs)
  1719. return _pdist_callable(
  1720. X, metric=metric_info.dist_func, out=out, **kwargs)
  1721. else:
  1722. raise ValueError(f'Unknown Distance Metric: {mstr}')
  1723. else:
  1724. raise TypeError('2nd argument metric must be a string identifier '
  1725. 'or a function.')
  1726. def squareform(X, force="no", checks=True):
  1727. """
  1728. Convert a vector-form distance vector to a square-form distance
  1729. matrix, and vice-versa.
  1730. Parameters
  1731. ----------
  1732. X : array_like
  1733. Either a condensed or redundant distance matrix.
  1734. force : str, optional
  1735. As with MATLAB(TM), if force is equal to ``'tovector'`` or
  1736. ``'tomatrix'``, the input will be treated as a distance matrix or
  1737. distance vector respectively.
  1738. checks : bool, optional
  1739. If set to False, no checks will be made for matrix
  1740. symmetry nor zero diagonals. This is useful if it is known that
  1741. ``X - X.T1`` is small and ``diag(X)`` is close to zero.
  1742. These values are ignored any way so they do not disrupt the
  1743. squareform transformation.
  1744. Returns
  1745. -------
  1746. Y : ndarray
  1747. If a condensed distance matrix is passed, a redundant one is
  1748. returned, or if a redundant one is passed, a condensed distance
  1749. matrix is returned.
  1750. Notes
  1751. -----
  1752. 1. ``v = squareform(X)``
  1753. Given a square n-by-n symmetric distance matrix ``X``,
  1754. ``v = squareform(X)`` returns an ``n * (n-1) / 2``
  1755. (i.e. binomial coefficient n choose 2) sized vector `v`
  1756. where :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
  1757. is the distance between distinct points ``i`` and ``j``.
  1758. If ``X`` is non-square or asymmetric, an error is raised.
  1759. 2. ``X = squareform(v)``
  1760. Given an ``n * (n-1) / 2`` sized vector ``v``
  1761. for some integer ``n >= 1`` encoding distances as described,
  1762. ``X = squareform(v)`` returns an n-by-n distance matrix ``X``.
  1763. The ``X[i, j]`` and ``X[j, i]`` values are set to
  1764. :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
  1765. and all diagonal elements are zero.
  1766. In SciPy 0.19.0, ``squareform`` stopped casting all input types to
  1767. float64, and started returning arrays of the same dtype as the input.
  1768. Examples
  1769. --------
  1770. >>> import numpy as np
  1771. >>> from scipy.spatial.distance import pdist, squareform
  1772. ``x`` is an array of five points in three-dimensional space.
  1773. >>> x = np.array([[2, 0, 2], [2, 2, 3], [-2, 4, 5], [0, 1, 9], [2, 2, 4]])
  1774. ``pdist(x)`` computes the Euclidean distances between each pair of
  1775. points in ``x``. The distances are returned in a one-dimensional
  1776. array with length ``5*(5 - 1)/2 = 10``.
  1777. >>> distvec = pdist(x)
  1778. >>> distvec
  1779. array([2.23606798, 6.40312424, 7.34846923, 2.82842712, 4.89897949,
  1780. 6.40312424, 1. , 5.38516481, 4.58257569, 5.47722558])
  1781. ``squareform(distvec)`` returns the 5x5 distance matrix.
  1782. >>> m = squareform(distvec)
  1783. >>> m
  1784. array([[0. , 2.23606798, 6.40312424, 7.34846923, 2.82842712],
  1785. [2.23606798, 0. , 4.89897949, 6.40312424, 1. ],
  1786. [6.40312424, 4.89897949, 0. , 5.38516481, 4.58257569],
  1787. [7.34846923, 6.40312424, 5.38516481, 0. , 5.47722558],
  1788. [2.82842712, 1. , 4.58257569, 5.47722558, 0. ]])
  1789. When given a square distance matrix ``m``, ``squareform(m)`` returns
  1790. the one-dimensional condensed distance vector associated with the
  1791. matrix. In this case, we recover ``distvec``.
  1792. >>> squareform(m)
  1793. array([2.23606798, 6.40312424, 7.34846923, 2.82842712, 4.89897949,
  1794. 6.40312424, 1. , 5.38516481, 4.58257569, 5.47722558])
  1795. """
  1796. X = np.ascontiguousarray(X)
  1797. s = X.shape
  1798. if force.lower() == 'tomatrix':
  1799. if len(s) != 1:
  1800. raise ValueError("Forcing 'tomatrix' but input X is not a "
  1801. "distance vector.")
  1802. elif force.lower() == 'tovector':
  1803. if len(s) != 2:
  1804. raise ValueError("Forcing 'tovector' but input X is not a "
  1805. "distance matrix.")
  1806. # X = squareform(v)
  1807. if len(s) == 1:
  1808. if s[0] == 0:
  1809. return np.zeros((1, 1), dtype=X.dtype)
  1810. # Grab the closest value to the square root of the number
  1811. # of elements times 2 to see if the number of elements
  1812. # is indeed a binomial coefficient.
  1813. d = int(np.ceil(np.sqrt(s[0] * 2)))
  1814. # Check that v is of valid dimensions.
  1815. if d * (d - 1) != s[0] * 2:
  1816. raise ValueError('Incompatible vector size. It must be a binomial '
  1817. 'coefficient n choose 2 for some integer n >= 2.')
  1818. # Allocate memory for the distance matrix.
  1819. M = np.zeros((d, d), dtype=X.dtype)
  1820. # Since the C code does not support striding using strides.
  1821. # The dimensions are used instead.
  1822. X = _copy_array_if_base_present(X)
  1823. # Fill in the values of the distance matrix.
  1824. _distance_wrap.to_squareform_from_vector_wrap(M, X)
  1825. # Return the distance matrix.
  1826. return M
  1827. elif len(s) == 2:
  1828. if s[0] != s[1]:
  1829. raise ValueError('The matrix argument must be square.')
  1830. if checks:
  1831. is_valid_dm(X, throw=True, name='X')
  1832. # One-side of the dimensions is set here.
  1833. d = s[0]
  1834. if d <= 1:
  1835. return np.array([], dtype=X.dtype)
  1836. # Create a vector.
  1837. v = np.zeros((d * (d - 1)) // 2, dtype=X.dtype)
  1838. # Since the C code does not support striding using strides.
  1839. # The dimensions are used instead.
  1840. X = _copy_array_if_base_present(X)
  1841. # Convert the vector to squareform.
  1842. _distance_wrap.to_vector_from_squareform_wrap(X, v)
  1843. return v
  1844. else:
  1845. raise ValueError("The first argument must be one or two dimensional "
  1846. f"array. A {len(s)}-dimensional array is not permitted")
  1847. def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
  1848. """
  1849. Return True if input array satisfies basic distance matrix properties
  1850. (symmetry and zero diagonal).
  1851. This function checks whether the input is a 2-dimensional square NumPy array
  1852. with a zero diagonal and symmetry within a specified tolerance. These are
  1853. necessary properties for a distance matrix but not sufficient -- in particular,
  1854. this function does **not** check the triangle inequality, which is required
  1855. for a true metric distance matrix.
  1856. The triangle inequality states that for any three points ``i``, ``j``, and ``k``:
  1857. ``D[i,k] <= D[i,j] + D[j,k]``
  1858. Parameters
  1859. ----------
  1860. D : array_like
  1861. The candidate object to test for basic distance matrix properties.
  1862. tol : float, optional
  1863. The distance matrix is considered symmetric if the absolute difference
  1864. between entries ``ij`` and ``ji`` is less than or equal to `tol`. The same
  1865. tolerance is used to determine whether diagonal entries are effectively zero.
  1866. throw : bool, optional
  1867. If True, raises an exception when the input is invalid.
  1868. name : str, optional
  1869. The name of the variable to check. This is used in exception messages when
  1870. `throw` is True to identify the offending variable.
  1871. warning : bool, optional
  1872. If True, a warning message is raised instead of throwing an exception.
  1873. Returns
  1874. -------
  1875. valid : bool
  1876. True if the input satisfies the symmetry and zero-diagonal conditions.
  1877. Raises
  1878. ------
  1879. ValueError
  1880. If `throw` is True and `D` is not a valid distance matrix.
  1881. UserWarning
  1882. If `warning` is True and `D` is not a valid distance matrix.
  1883. Notes
  1884. -----
  1885. This function does not check the triangle inequality, which is required for
  1886. a complete validation of a metric distance matrix. Only structural properties
  1887. (symmetry and zero diagonal) are verified. Small numerical deviations from symmetry
  1888. or exact zero diagonal are tolerated within the `tol` parameter.
  1889. Examples
  1890. --------
  1891. >>> import numpy as np
  1892. >>> from scipy.spatial.distance import is_valid_dm
  1893. This matrix is a valid distance matrix.
  1894. >>> d = np.array([[0.0, 1.1, 1.2, 1.3],
  1895. ... [1.1, 0.0, 1.0, 1.4],
  1896. ... [1.2, 1.0, 0.0, 1.5],
  1897. ... [1.3, 1.4, 1.5, 0.0]])
  1898. >>> is_valid_dm(d)
  1899. True
  1900. In the following examples, the input is not a valid distance matrix.
  1901. Not square:
  1902. >>> is_valid_dm([[0, 2, 2], [2, 0, 2]])
  1903. False
  1904. Nonzero diagonal element:
  1905. >>> is_valid_dm([[0, 1, 1], [1, 2, 3], [1, 3, 0]])
  1906. False
  1907. Not symmetric:
  1908. >>> is_valid_dm([[0, 1, 3], [2, 0, 1], [3, 1, 0]])
  1909. False
  1910. """
  1911. D = np.asarray(D, order='c')
  1912. valid = True
  1913. try:
  1914. s = D.shape
  1915. if len(D.shape) != 2:
  1916. if name:
  1917. raise ValueError(f"Distance matrix '{name}' must have shape=2 "
  1918. "(i.e. be two-dimensional).")
  1919. else:
  1920. raise ValueError('Distance matrix must have shape=2 (i.e. '
  1921. 'be two-dimensional).')
  1922. if tol == 0.0:
  1923. if not (D == D.T).all():
  1924. if name:
  1925. raise ValueError(f"Distance matrix '{name}' must be symmetric.")
  1926. else:
  1927. raise ValueError('Distance matrix must be symmetric.')
  1928. if not (D[range(0, s[0]), range(0, s[0])] == 0).all():
  1929. if name:
  1930. raise ValueError(f"Distance matrix '{name}' diagonal must be zero.")
  1931. else:
  1932. raise ValueError('Distance matrix diagonal must be zero.')
  1933. else:
  1934. if not (D - D.T <= tol).all():
  1935. if name:
  1936. raise ValueError(f'Distance matrix \'{name}\' must be '
  1937. f'symmetric within tolerance {tol:5.5f}.')
  1938. else:
  1939. raise ValueError('Distance matrix must be symmetric within '
  1940. f'tolerance {tol:5.5f}.')
  1941. if not (D[range(0, s[0]), range(0, s[0])] <= tol).all():
  1942. if name:
  1943. raise ValueError(f'Distance matrix \'{name}\' diagonal must be '
  1944. f'close to zero within tolerance {tol:5.5f}.')
  1945. else:
  1946. raise ValueError(('Distance matrix \'{}\' diagonal must be close '
  1947. 'to zero within tolerance {:5.5f}.').format(*tol))
  1948. except Exception as e:
  1949. if throw:
  1950. raise
  1951. if warning:
  1952. warnings.warn(str(e), stacklevel=2)
  1953. valid = False
  1954. return valid
  1955. def is_valid_y(y, warning=False, throw=False, name=None):
  1956. """
  1957. Return True if the input array is a valid condensed distance matrix.
  1958. Condensed distance matrices must be 1-dimensional numpy arrays.
  1959. Their length must be a binomial coefficient :math:`{n \\choose 2}`
  1960. for some positive integer n.
  1961. Parameters
  1962. ----------
  1963. y : array_like
  1964. The condensed distance matrix.
  1965. warning : bool, optional
  1966. Invokes a warning if the variable passed is not a valid
  1967. condensed distance matrix. The warning message explains why
  1968. the distance matrix is not valid. `name` is used when
  1969. referencing the offending variable.
  1970. throw : bool, optional
  1971. Throws an exception if the variable passed is not a valid
  1972. condensed distance matrix.
  1973. name : str, optional
  1974. Used when referencing the offending variable in the
  1975. warning or exception message.
  1976. Returns
  1977. -------
  1978. bool
  1979. True if the input array is a valid condensed distance matrix,
  1980. False otherwise.
  1981. Examples
  1982. --------
  1983. >>> from scipy.spatial.distance import is_valid_y
  1984. This vector is a valid condensed distance matrix. The length is 6,
  1985. which corresponds to ``n = 4``, since ``4*(4 - 1)/2`` is 6.
  1986. >>> v = [1.0, 1.2, 1.0, 0.5, 1.3, 0.9]
  1987. >>> is_valid_y(v)
  1988. True
  1989. An input vector with length, say, 7, is not a valid condensed distance
  1990. matrix.
  1991. >>> is_valid_y([1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7])
  1992. False
  1993. """
  1994. y = _asarray(y)
  1995. name_str = f"'{name}' " if name else ""
  1996. try:
  1997. if len(y.shape) != 1:
  1998. raise ValueError(f"Condensed distance matrix {name_str}must "
  1999. "have shape=1 (i.e. be one-dimensional).")
  2000. n = y.shape[0]
  2001. d = int(np.ceil(np.sqrt(n * 2)))
  2002. if (d * (d - 1) / 2) != n:
  2003. raise ValueError(f"Length n of condensed distance matrix {name_str}"
  2004. "must be a binomial coefficient, i.e. "
  2005. "there must be a k such that (k \\choose 2)=n)!")
  2006. except Exception as e:
  2007. if throw:
  2008. raise
  2009. if warning:
  2010. warnings.warn(str(e), stacklevel=2)
  2011. return False
  2012. return True
  2013. def num_obs_dm(d):
  2014. """
  2015. Return the number of original observations that correspond to a
  2016. square, redundant distance matrix.
  2017. Parameters
  2018. ----------
  2019. d : array_like
  2020. The target distance matrix.
  2021. Returns
  2022. -------
  2023. num_obs_dm : int
  2024. The number of observations in the redundant distance matrix.
  2025. Examples
  2026. --------
  2027. Find the number of original observations corresponding
  2028. to a square redundant distance matrix d.
  2029. >>> from scipy.spatial.distance import num_obs_dm
  2030. >>> d = [[0, 100, 200], [100, 0, 150], [200, 150, 0]]
  2031. >>> num_obs_dm(d)
  2032. 3
  2033. """
  2034. d = np.asarray(d, order='c')
  2035. is_valid_dm(d, tol=np.inf, throw=True, name='d')
  2036. return d.shape[0]
  2037. def num_obs_y(Y):
  2038. """
  2039. Return the number of original observations that correspond to a
  2040. condensed distance matrix.
  2041. Parameters
  2042. ----------
  2043. Y : array_like
  2044. Condensed distance matrix.
  2045. Returns
  2046. -------
  2047. n : int
  2048. The number of observations in the condensed distance matrix `Y`.
  2049. Examples
  2050. --------
  2051. Find the number of original observations corresponding to a
  2052. condensed distance matrix Y.
  2053. >>> from scipy.spatial.distance import num_obs_y
  2054. >>> Y = [1, 2, 3.5, 7, 10, 4]
  2055. >>> num_obs_y(Y)
  2056. 4
  2057. """
  2058. Y = _asarray(Y)
  2059. is_valid_y(Y, throw=True, name='Y')
  2060. k = Y.shape[0]
  2061. if k == 0:
  2062. raise ValueError("The number of observations cannot be determined on "
  2063. "an empty distance matrix.")
  2064. d = int(np.ceil(np.sqrt(k * 2)))
  2065. if (d * (d - 1) / 2) != k:
  2066. raise ValueError("Invalid condensed distance matrix passed. Must be "
  2067. "some k where k=(n choose 2) for some n >= 2.")
  2068. return d
  2069. def _prepare_out_argument(out, dtype, expected_shape):
  2070. if out is None:
  2071. return np.empty(expected_shape, dtype=dtype)
  2072. if out.shape != expected_shape:
  2073. raise ValueError("Output array has incorrect shape.")
  2074. if not out.flags.c_contiguous:
  2075. raise ValueError("Output array must be C-contiguous.")
  2076. if out.dtype != np.float64:
  2077. raise ValueError("Output array must be double type.")
  2078. return out
  2079. def _pdist_callable(X, *, out, metric, **kwargs):
  2080. n = X.shape[0]
  2081. out_size = (n * (n - 1)) // 2
  2082. dm = _prepare_out_argument(out, np.float64, (out_size,))
  2083. k = 0
  2084. for i in range(X.shape[0] - 1):
  2085. for j in range(i + 1, X.shape[0]):
  2086. dm[k] = metric(X[i], X[j], **kwargs)
  2087. k += 1
  2088. return dm
  2089. def _cdist_callable(XA, XB, *, out, metric, **kwargs):
  2090. mA = XA.shape[0]
  2091. mB = XB.shape[0]
  2092. dm = _prepare_out_argument(out, np.float64, (mA, mB))
  2093. for i in range(mA):
  2094. for j in range(mB):
  2095. dm[i, j] = metric(XA[i], XB[j], **kwargs)
  2096. return dm
  2097. def cdist(XA, XB, metric='euclidean', *, out=None, **kwargs):
  2098. """
  2099. Compute distance between each pair of the two collections of inputs.
  2100. See Notes for common calling conventions.
  2101. Parameters
  2102. ----------
  2103. XA : array_like
  2104. An :math:`m_A` by :math:`n` array of :math:`m_A`
  2105. original observations in an :math:`n`-dimensional space.
  2106. Inputs are converted to float type.
  2107. XB : array_like
  2108. An :math:`m_B` by :math:`n` array of :math:`m_B`
  2109. original observations in an :math:`n`-dimensional space.
  2110. Inputs are converted to float type.
  2111. metric : str or callable, optional
  2112. The distance metric to use. If a string, the distance function can be
  2113. 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
  2114. 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
  2115. 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
  2116. 'seuclidean', 'sokalsneath', 'sqeuclidean', 'yule'.
  2117. **kwargs : dict, optional
  2118. Extra arguments to `metric`: refer to each metric documentation for a
  2119. list of all possible arguments.
  2120. Some possible arguments:
  2121. p : scalar
  2122. The p-norm to apply for Minkowski, weighted and unweighted.
  2123. Default: 2.
  2124. w : array_like
  2125. The weight vector for metrics that support weights (e.g., Minkowski).
  2126. V : array_like
  2127. The variance vector for standardized Euclidean.
  2128. Default: var(vstack([XA, XB]), axis=0, ddof=1)
  2129. VI : array_like
  2130. The inverse of the covariance matrix for Mahalanobis.
  2131. Default: inv(cov(vstack([XA, XB].T))).T
  2132. out : ndarray
  2133. The output array
  2134. If not None, the distance matrix Y is stored in this array.
  2135. Returns
  2136. -------
  2137. Y : ndarray
  2138. A :math:`m_A` by :math:`m_B` distance matrix is returned.
  2139. For each :math:`i` and :math:`j`, the metric
  2140. ``dist(u=XA[i], v=XB[j])`` is computed and stored in the
  2141. :math:`ij` th entry.
  2142. Raises
  2143. ------
  2144. ValueError
  2145. An exception is thrown if `XA` and `XB` do not have
  2146. the same number of columns.
  2147. Notes
  2148. -----
  2149. The following are common calling conventions:
  2150. 1. ``Y = cdist(XA, XB, 'euclidean')``
  2151. Computes the distance between :math:`m` points using
  2152. Euclidean distance (2-norm) as the distance metric between the
  2153. points. The points are arranged as :math:`m`
  2154. :math:`n`-dimensional row vectors in the matrix X.
  2155. 2. ``Y = cdist(XA, XB, 'minkowski', p=2.)``
  2156. Computes the distances using the Minkowski distance
  2157. :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
  2158. that this is only a quasi-metric if :math:`0 < p < 1`).
  2159. 3. ``Y = cdist(XA, XB, 'cityblock')``
  2160. Computes the city block or Manhattan distance between the
  2161. points.
  2162. 4. ``Y = cdist(XA, XB, 'seuclidean', V=None)``
  2163. Computes the standardized Euclidean distance. The standardized
  2164. Euclidean distance between two n-vectors ``u`` and ``v`` is
  2165. .. math::
  2166. \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}.
  2167. V is the variance vector; V[i] is the variance computed over all
  2168. the i'th components of the points. If not passed, it is
  2169. automatically computed.
  2170. 5. ``Y = cdist(XA, XB, 'sqeuclidean')``
  2171. Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
  2172. the vectors.
  2173. 6. ``Y = cdist(XA, XB, 'cosine')``
  2174. Computes the cosine distance between vectors u and v,
  2175. .. math::
  2176. 1 - \\frac{u \\cdot v}
  2177. {{\\|u\\|}_2 {\\|v\\|}_2}
  2178. where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
  2179. :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`.
  2180. 7. ``Y = cdist(XA, XB, 'correlation')``
  2181. Computes the correlation distance between vectors u and v. This is
  2182. .. math::
  2183. 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
  2184. {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}
  2185. where :math:`\\bar{v}` is the mean of the elements of vector v,
  2186. and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.
  2187. 8. ``Y = cdist(XA, XB, 'hamming')``
  2188. Computes the normalized Hamming distance, or the proportion of
  2189. those vector elements between two n-vectors ``u`` and ``v``
  2190. which disagree. To save memory, the matrix ``X`` can be of type
  2191. boolean.
  2192. 9. ``Y = cdist(XA, XB, 'jaccard')``
  2193. Computes the Jaccard distance between the points. Given two
  2194. vectors, ``u`` and ``v``, the Jaccard distance is the
  2195. proportion of those elements ``u[i]`` and ``v[i]`` that
  2196. disagree where at least one of them is non-zero.
  2197. 10. ``Y = cdist(XA, XB, 'jensenshannon')``
  2198. Computes the Jensen-Shannon distance between two probability arrays.
  2199. Given two probability vectors, :math:`p` and :math:`q`, the
  2200. Jensen-Shannon distance is
  2201. .. math::
  2202. \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}
  2203. where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
  2204. and :math:`D` is the Kullback-Leibler divergence.
  2205. 11. ``Y = cdist(XA, XB, 'chebyshev')``
  2206. Computes the Chebyshev distance between the points. The
  2207. Chebyshev distance between two n-vectors ``u`` and ``v`` is the
  2208. maximum norm-1 distance between their respective elements. More
  2209. precisely, the distance is given by
  2210. .. math::
  2211. d(u,v) = \\max_i {|u_i-v_i|}.
  2212. 12. ``Y = cdist(XA, XB, 'canberra')``
  2213. Computes the Canberra distance between the points. The
  2214. Canberra distance between two points ``u`` and ``v`` is
  2215. .. math::
  2216. d(u,v) = \\sum_i \\frac{|u_i-v_i|}
  2217. {|u_i|+|v_i|}.
  2218. 13. ``Y = cdist(XA, XB, 'braycurtis')``
  2219. Computes the Bray-Curtis distance between the points. The
  2220. Bray-Curtis distance between two points ``u`` and ``v`` is
  2221. .. math::
  2222. d(u,v) = \\frac{\\sum_i (|u_i-v_i|)}
  2223. {\\sum_i (|u_i+v_i|)}
  2224. 14. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)``
  2225. Computes the Mahalanobis distance between the points. The
  2226. Mahalanobis distance between two points ``u`` and ``v`` is
  2227. :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
  2228. variable) is the inverse covariance. If ``VI`` is not None,
  2229. ``VI`` will be used as the inverse covariance matrix.
  2230. 15. ``Y = cdist(XA, XB, 'yule')``
  2231. Computes the Yule distance between the boolean
  2232. vectors. (see `yule` function documentation)
  2233. 16. ``Y = cdist(XA, XB, 'matching')``
  2234. Synonym for 'hamming'.
  2235. 17. ``Y = cdist(XA, XB, 'dice')``
  2236. Computes the Dice distance between the boolean vectors. (see
  2237. `dice` function documentation).
  2238. 18. ``Y = cdist(XA, XB, 'rogerstanimoto')``
  2239. Computes the Rogers-Tanimoto distance between the boolean
  2240. vectors. (see `rogerstanimoto` function documentation)
  2241. 19. ``Y = cdist(XA, XB, 'russellrao')``
  2242. Computes the Russell-Rao distance between the boolean
  2243. vectors. (see `russellrao` function documentation)
  2244. 20. ``Y = cdist(XA, XB, 'sokalsneath')``
  2245. Computes the Sokal-Sneath distance between the vectors. (see
  2246. `sokalsneath` function documentation)
  2247. 21. ``Y = cdist(XA, XB, f)``
  2248. Computes the distance between all pairs of vectors in X
  2249. using the user supplied 2-arity function f. For example,
  2250. Euclidean distance between the vectors could be computed
  2251. as follows::
  2252. dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum()))
  2253. Note that you should avoid passing a reference to one of
  2254. the distance functions defined in this library. For example,::
  2255. dm = cdist(XA, XB, sokalsneath)
  2256. would calculate the pair-wise distances between the vectors in
  2257. X using the Python function `sokalsneath`. This would result in
  2258. sokalsneath being called :math:`{n \\choose 2}` times, which
  2259. is inefficient. Instead, the optimized C version is more
  2260. efficient, and we call it using the following syntax::
  2261. dm = cdist(XA, XB, 'sokalsneath')
  2262. Examples
  2263. --------
  2264. Find the Euclidean distances between four 2-D coordinates:
  2265. >>> from scipy.spatial import distance
  2266. >>> import numpy as np
  2267. >>> coords = [(35.0456, -85.2672),
  2268. ... (35.1174, -89.9711),
  2269. ... (35.9728, -83.9422),
  2270. ... (36.1667, -86.7833)]
  2271. >>> distance.cdist(coords, coords, 'euclidean')
  2272. array([[ 0. , 4.7044, 1.6172, 1.8856],
  2273. [ 4.7044, 0. , 6.0893, 3.3561],
  2274. [ 1.6172, 6.0893, 0. , 2.8477],
  2275. [ 1.8856, 3.3561, 2.8477, 0. ]])
  2276. Find the Manhattan distance from a 3-D point to the corners of the unit
  2277. cube:
  2278. >>> a = np.array([[0, 0, 0],
  2279. ... [0, 0, 1],
  2280. ... [0, 1, 0],
  2281. ... [0, 1, 1],
  2282. ... [1, 0, 0],
  2283. ... [1, 0, 1],
  2284. ... [1, 1, 0],
  2285. ... [1, 1, 1]])
  2286. >>> b = np.array([[ 0.1, 0.2, 0.4]])
  2287. >>> distance.cdist(a, b, 'cityblock')
  2288. array([[ 0.7],
  2289. [ 0.9],
  2290. [ 1.3],
  2291. [ 1.5],
  2292. [ 1.5],
  2293. [ 1.7],
  2294. [ 2.1],
  2295. [ 2.3]])
  2296. """
  2297. # You can also call this as:
  2298. # Y = cdist(XA, XB, 'test_abc')
  2299. # where 'abc' is the metric being tested. This computes the distance
  2300. # between all pairs of vectors in XA and XB using the distance metric 'abc'
  2301. # but with a more succinct, verifiable, but less efficient implementation.
  2302. XA = np.asarray(XA)
  2303. XB = np.asarray(XB)
  2304. s = XA.shape
  2305. sB = XB.shape
  2306. if len(s) != 2:
  2307. raise ValueError('XA must be a 2-dimensional array.')
  2308. if len(sB) != 2:
  2309. raise ValueError('XB must be a 2-dimensional array.')
  2310. if s[1] != sB[1]:
  2311. raise ValueError('XA and XB must have the same number of columns '
  2312. '(i.e. feature dimension.)')
  2313. mA = s[0]
  2314. mB = sB[0]
  2315. n = s[1]
  2316. if callable(metric):
  2317. mstr = getattr(metric, '__name__', 'Unknown')
  2318. metric_info = _METRIC_ALIAS.get(mstr, None)
  2319. if metric_info is not None:
  2320. XA, XB, typ, kwargs = _validate_cdist_input(
  2321. XA, XB, mA, mB, n, metric_info, **kwargs)
  2322. return _cdist_callable(XA, XB, metric=metric, out=out, **kwargs)
  2323. elif isinstance(metric, str):
  2324. mstr = metric.lower()
  2325. metric_info = _METRIC_ALIAS.get(mstr, None)
  2326. if metric_info is not None:
  2327. cdist_fn = metric_info.cdist_func
  2328. return cdist_fn(XA, XB, out=out, **kwargs)
  2329. elif mstr.startswith("test_"):
  2330. metric_info = _TEST_METRICS.get(mstr, None)
  2331. if metric_info is None:
  2332. raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}')
  2333. XA, XB, typ, kwargs = _validate_cdist_input(
  2334. XA, XB, mA, mB, n, metric_info, **kwargs)
  2335. return _cdist_callable(
  2336. XA, XB, metric=metric_info.dist_func, out=out, **kwargs)
  2337. else:
  2338. raise ValueError(f'Unknown Distance Metric: {mstr}')
  2339. else:
  2340. raise TypeError('2nd argument metric must be a string identifier '
  2341. 'or a function.')