hierarchy.py 153 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338
  1. """
  2. Hierarchical clustering (:mod:`scipy.cluster.hierarchy`)
  3. ========================================================
  4. .. currentmodule:: scipy.cluster.hierarchy
  5. These functions cut hierarchical clusterings into flat clusterings
  6. or find the roots of the forest formed by a cut by providing the flat
  7. cluster ids of each observation.
  8. .. autosummary::
  9. :toctree: generated/
  10. fcluster
  11. fclusterdata
  12. leaders
  13. These are routines for agglomerative clustering.
  14. .. autosummary::
  15. :toctree: generated/
  16. linkage
  17. single
  18. complete
  19. average
  20. weighted
  21. centroid
  22. median
  23. ward
  24. These routines compute statistics on hierarchies.
  25. .. autosummary::
  26. :toctree: generated/
  27. cophenet
  28. from_mlab_linkage
  29. inconsistent
  30. maxinconsts
  31. maxdists
  32. maxRstat
  33. to_mlab_linkage
  34. Routines for visualizing flat clusters.
  35. .. autosummary::
  36. :toctree: generated/
  37. dendrogram
  38. These are data structures and routines for representing hierarchies as
  39. tree objects.
  40. .. autosummary::
  41. :toctree: generated/
  42. ClusterNode
  43. leaves_list
  44. to_tree
  45. cut_tree
  46. optimal_leaf_ordering
  47. These are predicates for checking the validity of linkage and
  48. inconsistency matrices as well as for checking isomorphism of two
  49. flat cluster assignments.
  50. .. autosummary::
  51. :toctree: generated/
  52. is_valid_im
  53. is_valid_linkage
  54. is_isomorphic
  55. is_monotonic
  56. correspond
  57. num_obs_linkage
  58. Utility routines for plotting:
  59. .. autosummary::
  60. :toctree: generated/
  61. set_link_color_palette
  62. Utility classes:
  63. .. autosummary::
  64. :toctree: generated/
  65. DisjointSet -- data structure for incremental connectivity queries
  66. """
  67. # Copyright (C) Damian Eads, 2007-2008. New BSD License.
  68. # hierarchy.py (derived from cluster.py, http://scipy-cluster.googlecode.com)
  69. #
  70. # Author: Damian Eads
  71. # Date: September 22, 2007
  72. #
  73. # Copyright (c) 2007, 2008, Damian Eads
  74. #
  75. # All rights reserved.
  76. #
  77. # Redistribution and use in source and binary forms, with or without
  78. # modification, are permitted provided that the following conditions
  79. # are met:
  80. # - Redistributions of source code must retain the above
  81. # copyright notice, this list of conditions and the
  82. # following disclaimer.
  83. # - Redistributions in binary form must reproduce the above copyright
  84. # notice, this list of conditions and the following disclaimer
  85. # in the documentation and/or other materials provided with the
  86. # distribution.
  87. # - Neither the name of the author nor the names of its
  88. # contributors may be used to endorse or promote products derived
  89. # from this software without specific prior written permission.
  90. #
  91. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  92. # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  93. # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  94. # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  95. # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  96. # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  97. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  98. # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  99. # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  100. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  101. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  102. import warnings
  103. import bisect
  104. from collections import deque
  105. import numpy as np
  106. from . import _hierarchy, _optimal_leaf_ordering
  107. import scipy.spatial.distance as distance
  108. from scipy._lib._array_api import (_asarray, array_namespace, is_dask,
  109. is_lazy_array, xp_capabilities, xp_copy)
  110. from scipy._lib._disjoint_set import DisjointSet
  111. import scipy._lib.array_api_extra as xpx
  112. _LINKAGE_METHODS = {'single': 0, 'complete': 1, 'average': 2, 'centroid': 3,
  113. 'median': 4, 'ward': 5, 'weighted': 6}
  114. _EUCLIDEAN_METHODS = ('centroid', 'median', 'ward')
  115. __all__ = ['ClusterNode', 'DisjointSet', 'average', 'centroid', 'complete',
  116. 'cophenet', 'correspond', 'cut_tree', 'dendrogram', 'fcluster',
  117. 'fclusterdata', 'from_mlab_linkage', 'inconsistent',
  118. 'is_isomorphic', 'is_monotonic', 'is_valid_im', 'is_valid_linkage',
  119. 'leaders', 'leaves_list', 'linkage', 'maxRstat', 'maxdists',
  120. 'maxinconsts', 'median', 'num_obs_linkage', 'optimal_leaf_ordering',
  121. 'set_link_color_palette', 'single', 'to_mlab_linkage', 'to_tree',
  122. 'ward', 'weighted']
  123. class ClusterWarning(UserWarning):
  124. pass
  125. def _warning(s):
  126. warnings.warn(f'scipy.cluster: {s}', ClusterWarning, stacklevel=3)
  127. def int_floor(arr, xp):
  128. # array_api_strict is strict about not allowing `int()` on a float array.
  129. # That's typically not needed, here it is - so explicitly convert
  130. return int(xp.asarray(arr, dtype=xp.int64))
  131. lazy_cython = xp_capabilities(
  132. cpu_only=True, reason="Cython code",
  133. warnings=[("dask.array", "merges chunks")])
  134. @lazy_cython
  135. def single(y):
  136. """
  137. Perform single/min/nearest linkage on the condensed distance matrix ``y``.
  138. Parameters
  139. ----------
  140. y : ndarray
  141. The upper triangular of the distance matrix. The result of
  142. ``pdist`` is returned in this form.
  143. Returns
  144. -------
  145. Z : ndarray
  146. The linkage matrix.
  147. See Also
  148. --------
  149. linkage : for advanced creation of hierarchical clusterings.
  150. scipy.spatial.distance.pdist : pairwise distance metrics
  151. Examples
  152. --------
  153. >>> from scipy.cluster.hierarchy import single, fcluster
  154. >>> from scipy.spatial.distance import pdist
  155. First, we need a toy dataset to play with::
  156. x x x x
  157. x x
  158. x x
  159. x x x x
  160. >>> X = [[0, 0], [0, 1], [1, 0],
  161. ... [0, 4], [0, 3], [1, 4],
  162. ... [4, 0], [3, 0], [4, 1],
  163. ... [4, 4], [3, 4], [4, 3]]
  164. Then, we get a condensed distance matrix from this dataset:
  165. >>> y = pdist(X)
  166. Finally, we can perform the clustering:
  167. >>> Z = single(y)
  168. >>> Z
  169. array([[ 0., 1., 1., 2.],
  170. [ 2., 12., 1., 3.],
  171. [ 3., 4., 1., 2.],
  172. [ 5., 14., 1., 3.],
  173. [ 6., 7., 1., 2.],
  174. [ 8., 16., 1., 3.],
  175. [ 9., 10., 1., 2.],
  176. [11., 18., 1., 3.],
  177. [13., 15., 2., 6.],
  178. [17., 20., 2., 9.],
  179. [19., 21., 2., 12.]])
  180. The linkage matrix ``Z`` represents a dendrogram - see
  181. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  182. contents.
  183. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  184. each initial point would belong given a distance threshold:
  185. >>> fcluster(Z, 0.9, criterion='distance')
  186. array([ 7, 8, 9, 10, 11, 12, 4, 5, 6, 1, 2, 3], dtype=int32)
  187. >>> fcluster(Z, 1, criterion='distance')
  188. array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
  189. >>> fcluster(Z, 2, criterion='distance')
  190. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  191. Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  192. plot of the dendrogram.
  193. """
  194. return linkage(y, method='single', metric='euclidean')
  195. @lazy_cython
  196. def complete(y):
  197. """
  198. Perform complete/max/farthest point linkage on a condensed distance matrix.
  199. Parameters
  200. ----------
  201. y : ndarray
  202. The upper triangular of the distance matrix. The result of
  203. ``pdist`` is returned in this form.
  204. Returns
  205. -------
  206. Z : ndarray
  207. A linkage matrix containing the hierarchical clustering. See
  208. the `linkage` function documentation for more information
  209. on its structure.
  210. See Also
  211. --------
  212. linkage : for advanced creation of hierarchical clusterings.
  213. scipy.spatial.distance.pdist : pairwise distance metrics
  214. Examples
  215. --------
  216. >>> from scipy.cluster.hierarchy import complete, fcluster
  217. >>> from scipy.spatial.distance import pdist
  218. First, we need a toy dataset to play with::
  219. x x x x
  220. x x
  221. x x
  222. x x x x
  223. >>> X = [[0, 0], [0, 1], [1, 0],
  224. ... [0, 4], [0, 3], [1, 4],
  225. ... [4, 0], [3, 0], [4, 1],
  226. ... [4, 4], [3, 4], [4, 3]]
  227. Then, we get a condensed distance matrix from this dataset:
  228. >>> y = pdist(X)
  229. Finally, we can perform the clustering:
  230. >>> Z = complete(y)
  231. >>> Z
  232. array([[ 0. , 1. , 1. , 2. ],
  233. [ 3. , 4. , 1. , 2. ],
  234. [ 6. , 7. , 1. , 2. ],
  235. [ 9. , 10. , 1. , 2. ],
  236. [ 2. , 12. , 1.41421356, 3. ],
  237. [ 5. , 13. , 1.41421356, 3. ],
  238. [ 8. , 14. , 1.41421356, 3. ],
  239. [11. , 15. , 1.41421356, 3. ],
  240. [16. , 17. , 4.12310563, 6. ],
  241. [18. , 19. , 4.12310563, 6. ],
  242. [20. , 21. , 5.65685425, 12. ]])
  243. The linkage matrix ``Z`` represents a dendrogram - see
  244. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  245. contents.
  246. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  247. each initial point would belong given a distance threshold:
  248. >>> fcluster(Z, 0.9, criterion='distance')
  249. array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
  250. >>> fcluster(Z, 1.5, criterion='distance')
  251. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  252. >>> fcluster(Z, 4.5, criterion='distance')
  253. array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)
  254. >>> fcluster(Z, 6, criterion='distance')
  255. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  256. Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  257. plot of the dendrogram.
  258. """
  259. return linkage(y, method='complete', metric='euclidean')
  260. @lazy_cython
  261. def average(y):
  262. """
  263. Perform average/UPGMA linkage on a condensed distance matrix.
  264. Parameters
  265. ----------
  266. y : ndarray
  267. The upper triangular of the distance matrix. The result of
  268. ``pdist`` is returned in this form.
  269. Returns
  270. -------
  271. Z : ndarray
  272. A linkage matrix containing the hierarchical clustering. See
  273. `linkage` for more information on its structure.
  274. See Also
  275. --------
  276. linkage : for advanced creation of hierarchical clusterings.
  277. scipy.spatial.distance.pdist : pairwise distance metrics
  278. Examples
  279. --------
  280. >>> from scipy.cluster.hierarchy import average, fcluster
  281. >>> from scipy.spatial.distance import pdist
  282. First, we need a toy dataset to play with::
  283. x x x x
  284. x x
  285. x x
  286. x x x x
  287. >>> X = [[0, 0], [0, 1], [1, 0],
  288. ... [0, 4], [0, 3], [1, 4],
  289. ... [4, 0], [3, 0], [4, 1],
  290. ... [4, 4], [3, 4], [4, 3]]
  291. Then, we get a condensed distance matrix from this dataset:
  292. >>> y = pdist(X)
  293. Finally, we can perform the clustering:
  294. >>> Z = average(y)
  295. >>> Z
  296. array([[ 0. , 1. , 1. , 2. ],
  297. [ 3. , 4. , 1. , 2. ],
  298. [ 6. , 7. , 1. , 2. ],
  299. [ 9. , 10. , 1. , 2. ],
  300. [ 2. , 12. , 1.20710678, 3. ],
  301. [ 5. , 13. , 1.20710678, 3. ],
  302. [ 8. , 14. , 1.20710678, 3. ],
  303. [11. , 15. , 1.20710678, 3. ],
  304. [16. , 17. , 3.39675184, 6. ],
  305. [18. , 19. , 3.39675184, 6. ],
  306. [20. , 21. , 4.09206523, 12. ]])
  307. The linkage matrix ``Z`` represents a dendrogram - see
  308. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  309. contents.
  310. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  311. each initial point would belong given a distance threshold:
  312. >>> fcluster(Z, 0.9, criterion='distance')
  313. array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
  314. >>> fcluster(Z, 1.5, criterion='distance')
  315. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  316. >>> fcluster(Z, 4, criterion='distance')
  317. array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)
  318. >>> fcluster(Z, 6, criterion='distance')
  319. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  320. Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  321. plot of the dendrogram.
  322. """
  323. return linkage(y, method='average', metric='euclidean')
  324. @lazy_cython
  325. def weighted(y):
  326. """
  327. Perform weighted/WPGMA linkage on the condensed distance matrix.
  328. See `linkage` for more information on the return
  329. structure and algorithm.
  330. Parameters
  331. ----------
  332. y : ndarray
  333. The upper triangular of the distance matrix. The result of
  334. ``pdist`` is returned in this form.
  335. Returns
  336. -------
  337. Z : ndarray
  338. A linkage matrix containing the hierarchical clustering. See
  339. `linkage` for more information on its structure.
  340. See Also
  341. --------
  342. linkage : for advanced creation of hierarchical clusterings.
  343. scipy.spatial.distance.pdist : pairwise distance metrics
  344. Examples
  345. --------
  346. >>> from scipy.cluster.hierarchy import weighted, fcluster
  347. >>> from scipy.spatial.distance import pdist
  348. First, we need a toy dataset to play with::
  349. x x x x
  350. x x
  351. x x
  352. x x x x
  353. >>> X = [[0, 0], [0, 1], [1, 0],
  354. ... [0, 4], [0, 3], [1, 4],
  355. ... [4, 0], [3, 0], [4, 1],
  356. ... [4, 4], [3, 4], [4, 3]]
  357. Then, we get a condensed distance matrix from this dataset:
  358. >>> y = pdist(X)
  359. Finally, we can perform the clustering:
  360. >>> Z = weighted(y)
  361. >>> Z
  362. array([[ 0. , 1. , 1. , 2. ],
  363. [ 6. , 7. , 1. , 2. ],
  364. [ 3. , 4. , 1. , 2. ],
  365. [ 9. , 11. , 1. , 2. ],
  366. [ 2. , 12. , 1.20710678, 3. ],
  367. [ 8. , 13. , 1.20710678, 3. ],
  368. [ 5. , 14. , 1.20710678, 3. ],
  369. [10. , 15. , 1.20710678, 3. ],
  370. [18. , 19. , 3.05595762, 6. ],
  371. [16. , 17. , 3.32379407, 6. ],
  372. [20. , 21. , 4.06357713, 12. ]])
  373. The linkage matrix ``Z`` represents a dendrogram - see
  374. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  375. contents.
  376. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  377. each initial point would belong given a distance threshold:
  378. >>> fcluster(Z, 0.9, criterion='distance')
  379. array([ 7, 8, 9, 1, 2, 3, 10, 11, 12, 4, 6, 5], dtype=int32)
  380. >>> fcluster(Z, 1.5, criterion='distance')
  381. array([3, 3, 3, 1, 1, 1, 4, 4, 4, 2, 2, 2], dtype=int32)
  382. >>> fcluster(Z, 4, criterion='distance')
  383. array([2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1], dtype=int32)
  384. >>> fcluster(Z, 6, criterion='distance')
  385. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  386. Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  387. plot of the dendrogram.
  388. """
  389. return linkage(y, method='weighted', metric='euclidean')
  390. @lazy_cython
  391. def centroid(y):
  392. """
  393. Perform centroid/UPGMC linkage.
  394. See `linkage` for more information on the input matrix,
  395. return structure, and algorithm.
  396. The following are common calling conventions:
  397. 1. ``Z = centroid(y)``
  398. Performs centroid/UPGMC linkage on the condensed distance
  399. matrix ``y``.
  400. 2. ``Z = centroid(X)``
  401. Performs centroid/UPGMC linkage on the observation matrix ``X``
  402. using Euclidean distance as the distance metric.
  403. Parameters
  404. ----------
  405. y : ndarray
  406. A condensed distance matrix. A condensed
  407. distance matrix is a flat array containing the upper
  408. triangular of the distance matrix. This is the form that
  409. ``pdist`` returns. Alternatively, a collection of
  410. m observation vectors in n dimensions may be passed as
  411. an m by n array.
  412. Returns
  413. -------
  414. Z : ndarray
  415. A linkage matrix containing the hierarchical clustering. See
  416. the `linkage` function documentation for more information
  417. on its structure.
  418. See Also
  419. --------
  420. linkage : for advanced creation of hierarchical clusterings.
  421. scipy.spatial.distance.pdist : pairwise distance metrics
  422. Examples
  423. --------
  424. >>> from scipy.cluster.hierarchy import centroid, fcluster
  425. >>> from scipy.spatial.distance import pdist
  426. First, we need a toy dataset to play with::
  427. x x x x
  428. x x
  429. x x
  430. x x x x
  431. >>> X = [[0, 0], [0, 1], [1, 0],
  432. ... [0, 4], [0, 3], [1, 4],
  433. ... [4, 0], [3, 0], [4, 1],
  434. ... [4, 4], [3, 4], [4, 3]]
  435. Then, we get a condensed distance matrix from this dataset:
  436. >>> y = pdist(X)
  437. Finally, we can perform the clustering:
  438. >>> Z = centroid(y)
  439. >>> Z
  440. array([[ 0. , 1. , 1. , 2. ],
  441. [ 3. , 4. , 1. , 2. ],
  442. [ 9. , 10. , 1. , 2. ],
  443. [ 6. , 7. , 1. , 2. ],
  444. [ 2. , 12. , 1.11803399, 3. ],
  445. [ 5. , 13. , 1.11803399, 3. ],
  446. [ 8. , 15. , 1.11803399, 3. ],
  447. [11. , 14. , 1.11803399, 3. ],
  448. [18. , 19. , 3.33333333, 6. ],
  449. [16. , 17. , 3.33333333, 6. ],
  450. [20. , 21. , 3.33333333, 12. ]]) # may vary
  451. The linkage matrix ``Z`` represents a dendrogram - see
  452. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  453. contents.
  454. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  455. each initial point would belong given a distance threshold:
  456. >>> fcluster(Z, 0.9, criterion='distance')
  457. array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32) # may vary
  458. >>> fcluster(Z, 1.1, criterion='distance')
  459. array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32) # may vary
  460. >>> fcluster(Z, 2, criterion='distance')
  461. array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32) # may vary
  462. >>> fcluster(Z, 4, criterion='distance')
  463. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  464. Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  465. plot of the dendrogram.
  466. """
  467. return linkage(y, method='centroid', metric='euclidean')
  468. @lazy_cython
  469. def median(y):
  470. """
  471. Perform median/WPGMC linkage.
  472. See `linkage` for more information on the return structure
  473. and algorithm.
  474. The following are common calling conventions:
  475. 1. ``Z = median(y)``
  476. Performs median/WPGMC linkage on the condensed distance matrix
  477. ``y``. See ``linkage`` for more information on the return
  478. structure and algorithm.
  479. 2. ``Z = median(X)``
  480. Performs median/WPGMC linkage on the observation matrix ``X``
  481. using Euclidean distance as the distance metric. See `linkage`
  482. for more information on the return structure and algorithm.
  483. Parameters
  484. ----------
  485. y : ndarray
  486. A condensed distance matrix. A condensed
  487. distance matrix is a flat array containing the upper
  488. triangular of the distance matrix. This is the form that
  489. ``pdist`` returns. Alternatively, a collection of
  490. m observation vectors in n dimensions may be passed as
  491. an m by n array.
  492. Returns
  493. -------
  494. Z : ndarray
  495. The hierarchical clustering encoded as a linkage matrix.
  496. See Also
  497. --------
  498. linkage : for advanced creation of hierarchical clusterings.
  499. scipy.spatial.distance.pdist : pairwise distance metrics
  500. Examples
  501. --------
  502. >>> from scipy.cluster.hierarchy import median, fcluster
  503. >>> from scipy.spatial.distance import pdist
  504. First, we need a toy dataset to play with::
  505. x x x x
  506. x x
  507. x x
  508. x x x x
  509. >>> X = [[0, 0], [0, 1], [1, 0],
  510. ... [0, 4], [0, 3], [1, 4],
  511. ... [4, 0], [3, 0], [4, 1],
  512. ... [4, 4], [3, 4], [4, 3]]
  513. Then, we get a condensed distance matrix from this dataset:
  514. >>> y = pdist(X)
  515. Finally, we can perform the clustering:
  516. >>> Z = median(y)
  517. >>> Z
  518. array([[ 0. , 1. , 1. , 2. ],
  519. [ 3. , 4. , 1. , 2. ],
  520. [ 9. , 10. , 1. , 2. ],
  521. [ 6. , 7. , 1. , 2. ],
  522. [ 2. , 12. , 1.11803399, 3. ],
  523. [ 5. , 13. , 1.11803399, 3. ],
  524. [ 8. , 15. , 1.11803399, 3. ],
  525. [11. , 14. , 1.11803399, 3. ],
  526. [18. , 19. , 3. , 6. ],
  527. [16. , 17. , 3.5 , 6. ],
  528. [20. , 21. , 3.25 , 12. ]])
  529. The linkage matrix ``Z`` represents a dendrogram - see
  530. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  531. contents.
  532. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  533. each initial point would belong given a distance threshold:
  534. >>> fcluster(Z, 0.9, criterion='distance')
  535. array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32)
  536. >>> fcluster(Z, 1.1, criterion='distance')
  537. array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32)
  538. >>> fcluster(Z, 2, criterion='distance')
  539. array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32)
  540. >>> fcluster(Z, 4, criterion='distance')
  541. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  542. Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  543. plot of the dendrogram.
  544. """
  545. return linkage(y, method='median', metric='euclidean')
  546. @lazy_cython
  547. def ward(y):
  548. """
  549. Perform Ward's linkage on a condensed distance matrix.
  550. See `linkage` for more information on the return structure
  551. and algorithm.
  552. The following are common calling conventions:
  553. 1. ``Z = ward(y)``
  554. Performs Ward's linkage on the condensed distance matrix ``y``.
  555. 2. ``Z = ward(X)``
  556. Performs Ward's linkage on the observation matrix ``X`` using
  557. Euclidean distance as the distance metric.
  558. Parameters
  559. ----------
  560. y : ndarray
  561. A condensed distance matrix. A condensed
  562. distance matrix is a flat array containing the upper
  563. triangular of the distance matrix. This is the form that
  564. ``pdist`` returns. Alternatively, a collection of
  565. m observation vectors in n dimensions may be passed as
  566. an m by n array.
  567. Returns
  568. -------
  569. Z : ndarray
  570. The hierarchical clustering encoded as a linkage matrix. See
  571. `linkage` for more information on the return structure and
  572. algorithm.
  573. See Also
  574. --------
  575. linkage : for advanced creation of hierarchical clusterings.
  576. scipy.spatial.distance.pdist : pairwise distance metrics
  577. Examples
  578. --------
  579. >>> from scipy.cluster.hierarchy import ward, fcluster
  580. >>> from scipy.spatial.distance import pdist
  581. First, we need a toy dataset to play with::
  582. x x x x
  583. x x
  584. x x
  585. x x x x
  586. >>> X = [[0, 0], [0, 1], [1, 0],
  587. ... [0, 4], [0, 3], [1, 4],
  588. ... [4, 0], [3, 0], [4, 1],
  589. ... [4, 4], [3, 4], [4, 3]]
  590. Then, we get a condensed distance matrix from this dataset:
  591. >>> y = pdist(X)
  592. Finally, we can perform the clustering:
  593. >>> Z = ward(y)
  594. >>> Z
  595. array([[ 0. , 1. , 1. , 2. ],
  596. [ 3. , 4. , 1. , 2. ],
  597. [ 6. , 7. , 1. , 2. ],
  598. [ 9. , 10. , 1. , 2. ],
  599. [ 2. , 12. , 1.29099445, 3. ],
  600. [ 5. , 13. , 1.29099445, 3. ],
  601. [ 8. , 14. , 1.29099445, 3. ],
  602. [11. , 15. , 1.29099445, 3. ],
  603. [16. , 17. , 5.77350269, 6. ],
  604. [18. , 19. , 5.77350269, 6. ],
  605. [20. , 21. , 8.16496581, 12. ]])
  606. The linkage matrix ``Z`` represents a dendrogram - see
  607. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  608. contents.
  609. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  610. each initial point would belong given a distance threshold:
  611. >>> fcluster(Z, 0.9, criterion='distance')
  612. array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
  613. >>> fcluster(Z, 1.1, criterion='distance')
  614. array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32)
  615. >>> fcluster(Z, 3, criterion='distance')
  616. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  617. >>> fcluster(Z, 9, criterion='distance')
  618. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  619. Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  620. plot of the dendrogram.
  621. """
  622. return linkage(y, method='ward', metric='euclidean')
  623. @lazy_cython
  624. def linkage(y, method='single', metric='euclidean', optimal_ordering=False):
  625. """
  626. Perform hierarchical/agglomerative clustering.
  627. The input y may be either a 1-D condensed distance matrix
  628. or a 2-D array of observation vectors.
  629. If y is a 1-D condensed distance matrix,
  630. then y must be a :math:`\\binom{n}{2}` sized
  631. vector, where n is the number of original observations paired
  632. in the distance matrix. The behavior of this function is very
  633. similar to the MATLAB linkage function.
  634. A :math:`(n-1)` by 4 matrix ``Z`` is returned. At the
  635. :math:`i`-th iteration, clusters with indices ``Z[i, 0]`` and
  636. ``Z[i, 1]`` are combined to form cluster :math:`n + i`. A
  637. cluster with an index less than :math:`n` corresponds to one of
  638. the :math:`n` original observations. The distance between
  639. clusters ``Z[i, 0]`` and ``Z[i, 1]`` is given by ``Z[i, 2]``. The
  640. fourth value ``Z[i, 3]`` represents the number of original
  641. observations in the newly formed cluster.
  642. The following linkage methods are used to compute the distance
  643. :math:`d(s, t)` between two clusters :math:`s` and
  644. :math:`t`. The algorithm begins with a forest of clusters that
  645. have yet to be used in the hierarchy being formed. When two
  646. clusters :math:`s` and :math:`t` from this forest are combined
  647. into a single cluster :math:`u`, :math:`s` and :math:`t` are
  648. removed from the forest, and :math:`u` is added to the
  649. forest. When only one cluster remains in the forest, the algorithm
  650. stops, and this cluster becomes the root.
  651. A distance matrix is maintained at each iteration. The ``d[i,j]``
  652. entry corresponds to the distance between cluster :math:`i` and
  653. :math:`j` in the original forest.
  654. At each iteration, the algorithm must update the distance matrix
  655. to reflect the distance of the newly formed cluster u with the
  656. remaining clusters in the forest.
  657. Suppose there are :math:`|u|` original observations
  658. :math:`u[0], \\ldots, u[|u|-1]` in cluster :math:`u` and
  659. :math:`|v|` original objects :math:`v[0], \\ldots, v[|v|-1]` in
  660. cluster :math:`v`. Recall, :math:`s` and :math:`t` are
  661. combined to form cluster :math:`u`. Let :math:`v` be any
  662. remaining cluster in the forest that is not :math:`u`.
  663. The following are methods for calculating the distance between the
  664. newly formed cluster :math:`u` and each :math:`v`.
  665. * method='single' assigns
  666. .. math::
  667. d(u,v) = \\min(dist(u[i],v[j]))
  668. for all points :math:`i` in cluster :math:`u` and
  669. :math:`j` in cluster :math:`v`. This is also known as the
  670. Nearest Point Algorithm.
  671. * method='complete' assigns
  672. .. math::
  673. d(u, v) = \\max(dist(u[i],v[j]))
  674. for all points :math:`i` in cluster u and :math:`j` in
  675. cluster :math:`v`. This is also known by the Farthest Point
  676. Algorithm or Voor Hees Algorithm.
  677. * method='average' assigns
  678. .. math::
  679. d(u,v) = \\sum_{ij} \\frac{d(u[i], v[j])}
  680. {(|u|*|v|)}
  681. for all points :math:`i` and :math:`j` where :math:`|u|`
  682. and :math:`|v|` are the cardinalities of clusters :math:`u`
  683. and :math:`v`, respectively. This is also called the UPGMA
  684. algorithm.
  685. * method='weighted' assigns
  686. .. math::
  687. d(u,v) = (dist(s,v) + dist(t,v))/2
  688. where cluster u was formed with cluster s and t and v
  689. is a remaining cluster in the forest (also called WPGMA).
  690. * method='centroid' assigns
  691. .. math::
  692. dist(s,t) = ||c_s-c_t||_2
  693. where :math:`c_s` and :math:`c_t` are the centroids of
  694. clusters :math:`s` and :math:`t`, respectively. When two
  695. clusters :math:`s` and :math:`t` are combined into a new
  696. cluster :math:`u`, the new centroid is computed over all the
  697. original objects in clusters :math:`s` and :math:`t`. The
  698. distance then becomes the Euclidean distance between the
  699. centroid of :math:`u` and the centroid of a remaining cluster
  700. :math:`v` in the forest. This is also known as the UPGMC
  701. algorithm.
  702. * method='median' assigns :math:`d(s,t)` like the ``centroid``
  703. method. When two clusters :math:`s` and :math:`t` are combined
  704. into a new cluster :math:`u`, the average of centroids s and t
  705. give the new centroid :math:`u`. This is also known as the
  706. WPGMC algorithm.
  707. * method='ward' uses the Ward variance minimization algorithm.
  708. The new entry :math:`d(u,v)` is computed as follows,
  709. .. math::
  710. d(u,v) = \\sqrt{\\frac{|v|+|s|}
  711. {T}d(v,s)^2
  712. + \\frac{|v|+|t|}
  713. {T}d(v,t)^2
  714. - \\frac{|v|}
  715. {T}d(s,t)^2}
  716. where :math:`u` is the newly joined cluster consisting of
  717. clusters :math:`s` and :math:`t`, :math:`v` is an unused
  718. cluster in the forest, :math:`T=|v|+|s|+|t|`, and
  719. :math:`|*|` is the cardinality of its argument. This is also
  720. known as the incremental algorithm.
  721. Warning: When the minimum distance pair in the forest is chosen, there
  722. may be two or more pairs with the same minimum distance. This
  723. implementation may choose a different minimum than the MATLAB
  724. version.
  725. Parameters
  726. ----------
  727. y : ndarray
  728. A condensed distance matrix. A condensed distance matrix
  729. is a flat array containing the upper triangular of the distance matrix.
  730. This is the form that ``pdist`` returns. Alternatively, a collection of
  731. :math:`m` observation vectors in :math:`n` dimensions may be passed as
  732. an :math:`m` by :math:`n` array. All elements of the condensed distance
  733. matrix must be finite, i.e., no NaNs or infs.
  734. method : str, optional
  735. The linkage algorithm to use. See the ``Linkage Methods`` section below
  736. for full descriptions.
  737. metric : str or function, optional
  738. The distance metric to use in the case that y is a collection of
  739. observation vectors; ignored otherwise. See the ``pdist``
  740. function for a list of valid distance metrics. A custom distance
  741. function can also be used.
  742. optimal_ordering : bool, optional
  743. If True, the linkage matrix will be reordered so that the distance
  744. between successive leaves is minimal. This results in a more intuitive
  745. tree structure when the data are visualized. defaults to False, because
  746. this algorithm can be slow, particularly on large datasets [2]_. See
  747. also the `optimal_leaf_ordering` function.
  748. .. versionadded:: 1.0.0
  749. Returns
  750. -------
  751. Z : ndarray
  752. The hierarchical clustering encoded as a linkage matrix.
  753. Notes
  754. -----
  755. 1. For method 'single', an optimized algorithm based on minimum spanning
  756. tree is implemented. It has time complexity :math:`O(n^2)`.
  757. For methods 'complete', 'average', 'weighted' and 'ward', an algorithm
  758. called nearest-neighbors chain is implemented. It also has time
  759. complexity :math:`O(n^2)`.
  760. For other methods, a naive algorithm is implemented with :math:`O(n^3)`
  761. time complexity.
  762. All algorithms use :math:`O(n^2)` memory.
  763. Refer to [1]_ for details about the algorithms.
  764. 2. Methods 'centroid', 'median', and 'ward' are correctly defined only if
  765. Euclidean pairwise metric is used. If `y` is passed as precomputed
  766. pairwise distances, then it is the user's responsibility to assure that
  767. these distances are in fact Euclidean, otherwise the produced result
  768. will be incorrect.
  769. See Also
  770. --------
  771. scipy.spatial.distance.pdist : pairwise distance metrics
  772. References
  773. ----------
  774. .. [1] Daniel Mullner, "Modern hierarchical, agglomerative clustering
  775. algorithms", :arXiv:`1109.2378v1`.
  776. .. [2] Ziv Bar-Joseph, David K. Gifford, Tommi S. Jaakkola, "Fast optimal
  777. leaf ordering for hierarchical clustering", 2001. Bioinformatics
  778. :doi:`10.1093/bioinformatics/17.suppl_1.S22`
  779. Examples
  780. --------
  781. >>> from scipy.cluster.hierarchy import dendrogram, linkage
  782. >>> from matplotlib import pyplot as plt
  783. >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
  784. >>> Z = linkage(X, 'ward')
  785. >>> fig = plt.figure(figsize=(25, 10))
  786. >>> dn = dendrogram(Z)
  787. >>> Z = linkage(X, 'single')
  788. >>> fig = plt.figure(figsize=(25, 10))
  789. >>> dn = dendrogram(Z)
  790. >>> plt.show()
  791. """
  792. xp = array_namespace(y)
  793. y = _asarray(y, order='C', dtype=xp.float64, xp=xp)
  794. lazy = is_lazy_array(y)
  795. if method not in _LINKAGE_METHODS:
  796. raise ValueError(f"Invalid method: {method}")
  797. if method in _EUCLIDEAN_METHODS and metric != 'euclidean' and y.ndim == 2:
  798. msg = f"`method={method}` requires the distance metric to be Euclidean"
  799. raise ValueError(msg)
  800. if y.ndim == 1:
  801. distance.is_valid_y(y, throw=True, name='y')
  802. elif y.ndim == 2:
  803. if (not lazy and y.shape[0] == y.shape[1]
  804. and xp.all(xpx.isclose(xp.linalg.diagonal(y), 0))
  805. and xp.all(y >= 0) and xp.all(xpx.isclose(y, y.T))):
  806. warnings.warn('The symmetric non-negative hollow observation '
  807. 'matrix looks suspiciously like an uncondensed '
  808. 'distance matrix',
  809. ClusterWarning, stacklevel=2)
  810. y = distance.pdist(y, metric)
  811. else:
  812. raise ValueError("`y` must be 1 or 2 dimensional.")
  813. if not lazy and not xp.all(xp.isfinite(y)):
  814. raise ValueError("The condensed distance matrix must contain only "
  815. "finite values.")
  816. n = distance.num_obs_y(y)
  817. method_code = _LINKAGE_METHODS[method]
  818. def cy_linkage(y, validate):
  819. if validate and not np.all(np.isfinite(y)):
  820. raise ValueError("The condensed distance matrix must contain only "
  821. "finite values.")
  822. if method == 'single':
  823. return _hierarchy.mst_single_linkage(y, n)
  824. elif method in ('complete', 'average', 'weighted', 'ward'):
  825. return _hierarchy.nn_chain(y, n, method_code)
  826. else:
  827. return _hierarchy.fast_linkage(y, n, method_code)
  828. result = xpx.lazy_apply(cy_linkage, y, validate=lazy,
  829. shape=(n - 1, 4), dtype=xp.float64,
  830. as_numpy=True, xp=xp)
  831. if optimal_ordering:
  832. return optimal_leaf_ordering(result, y)
  833. else:
  834. return result
  835. class ClusterNode:
  836. """
  837. A tree node class for representing a cluster.
  838. Leaf nodes correspond to original observations, while non-leaf nodes
  839. correspond to non-singleton clusters.
  840. The `to_tree` function converts a matrix returned by the linkage
  841. function into an easy-to-use tree representation.
  842. All parameter names are also attributes.
  843. Parameters
  844. ----------
  845. id : int
  846. The node id.
  847. left : ClusterNode instance, optional
  848. The left child tree node.
  849. right : ClusterNode instance, optional
  850. The right child tree node.
  851. dist : float, optional
  852. Distance for this cluster in the linkage matrix.
  853. count : int, optional
  854. The number of samples in this cluster.
  855. See Also
  856. --------
  857. to_tree : for converting a linkage matrix ``Z`` into a tree object.
  858. """
  859. def __init__(self, id, left=None, right=None, dist=0.0, count=1):
  860. if id < 0:
  861. raise ValueError('The id must be non-negative.')
  862. if dist < 0:
  863. raise ValueError('The distance must be non-negative.')
  864. if (left is None and right is not None) or \
  865. (left is not None and right is None):
  866. raise ValueError('Only full or proper binary trees are permitted.'
  867. ' This node has one child.')
  868. if count < 1:
  869. raise ValueError('A cluster must contain at least one original '
  870. 'observation.')
  871. self.id = id
  872. self.left = left
  873. self.right = right
  874. self.dist = dist
  875. if self.left is None:
  876. self.count = count
  877. else:
  878. self.count = left.count + right.count
  879. def __lt__(self, node):
  880. if not isinstance(node, ClusterNode):
  881. raise ValueError("Can't compare ClusterNode "
  882. f"to type {type(node)}")
  883. return self.dist < node.dist
  884. def __gt__(self, node):
  885. if not isinstance(node, ClusterNode):
  886. raise ValueError("Can't compare ClusterNode "
  887. f"to type {type(node)}")
  888. return self.dist > node.dist
  889. def __eq__(self, node):
  890. if not isinstance(node, ClusterNode):
  891. raise ValueError("Can't compare ClusterNode "
  892. f"to type {type(node)}")
  893. return self.dist == node.dist
  894. def get_id(self):
  895. """
  896. The identifier of the target node.
  897. For ``0 <= i < n``, `i` corresponds to original observation i.
  898. For ``n <= i < 2n-1``, `i` corresponds to non-singleton cluster formed
  899. at iteration ``i-n``.
  900. Returns
  901. -------
  902. id : int
  903. The identifier of the target node.
  904. """
  905. return self.id
  906. def get_count(self):
  907. """
  908. The number of leaf nodes (original observations) belonging to
  909. the cluster node nd. If the target node is a leaf, 1 is
  910. returned.
  911. Returns
  912. -------
  913. get_count : int
  914. The number of leaf nodes below the target node.
  915. """
  916. return self.count
  917. def get_left(self):
  918. """
  919. Return a reference to the left child tree object.
  920. Returns
  921. -------
  922. left : ClusterNode
  923. The left child of the target node. If the node is a leaf,
  924. None is returned.
  925. """
  926. return self.left
  927. def get_right(self):
  928. """
  929. Return a reference to the right child tree object.
  930. Returns
  931. -------
  932. right : ClusterNode
  933. The left child of the target node. If the node is a leaf,
  934. None is returned.
  935. """
  936. return self.right
  937. def is_leaf(self):
  938. """
  939. Return True if the target node is a leaf.
  940. Returns
  941. -------
  942. leafness : bool
  943. True if the target node is a leaf node.
  944. """
  945. return self.left is None
  946. def pre_order(self, func=(lambda x: x.id)):
  947. """
  948. Perform pre-order traversal without recursive function calls.
  949. When a leaf node is first encountered, ``func`` is called with
  950. the leaf node as its argument, and its result is appended to
  951. the list.
  952. For example, the statement::
  953. ids = root.pre_order(lambda x: x.id)
  954. returns a list of the node ids corresponding to the leaf nodes
  955. of the tree as they appear from left to right.
  956. Parameters
  957. ----------
  958. func : function
  959. Applied to each leaf ClusterNode object in the pre-order traversal.
  960. Given the ``i``-th leaf node in the pre-order traversal ``n[i]``,
  961. the result of ``func(n[i])`` is stored in ``L[i]``. If not
  962. provided, the index of the original observation to which the node
  963. corresponds is used.
  964. Returns
  965. -------
  966. L : list
  967. The pre-order traversal.
  968. """
  969. # Do a preorder traversal, caching the result. To avoid having to do
  970. # recursion, we'll store the previous index we've visited in a vector.
  971. n = self.count
  972. curNode = [None] * (2 * n)
  973. lvisited = set()
  974. rvisited = set()
  975. curNode[0] = self
  976. k = 0
  977. preorder = []
  978. while k >= 0:
  979. nd = curNode[k]
  980. ndid = nd.id
  981. if nd.is_leaf():
  982. preorder.append(func(nd))
  983. k = k - 1
  984. else:
  985. if ndid not in lvisited:
  986. curNode[k + 1] = nd.left
  987. lvisited.add(ndid)
  988. k = k + 1
  989. elif ndid not in rvisited:
  990. curNode[k + 1] = nd.right
  991. rvisited.add(ndid)
  992. k = k + 1
  993. # If we've visited the left and right of this non-leaf
  994. # node already, go up in the tree.
  995. else:
  996. k = k - 1
  997. return preorder
  998. _cnode_bare = ClusterNode(0)
  999. _cnode_type = type(ClusterNode)
  1000. def _order_cluster_tree(Z):
  1001. """
  1002. Return clustering nodes in bottom-up order by distance.
  1003. Parameters
  1004. ----------
  1005. Z : scipy.cluster.linkage array
  1006. The linkage matrix.
  1007. Returns
  1008. -------
  1009. nodes : list
  1010. A list of ClusterNode objects.
  1011. """
  1012. q = deque()
  1013. tree = to_tree(Z)
  1014. q.append(tree)
  1015. nodes = []
  1016. while q:
  1017. node = q.popleft()
  1018. if not node.is_leaf():
  1019. bisect.insort_left(nodes, node)
  1020. q.append(node.get_right())
  1021. q.append(node.get_left())
  1022. return nodes
  1023. @xp_capabilities(np_only=True, reason="non-standard indexing")
  1024. def cut_tree(Z, n_clusters=None, height=None):
  1025. """
  1026. Given a linkage matrix Z, return the cut tree.
  1027. Parameters
  1028. ----------
  1029. Z : scipy.cluster.linkage array
  1030. The linkage matrix.
  1031. n_clusters : array_like, optional
  1032. Number of clusters in the tree at the cut point.
  1033. height : array_like, optional
  1034. The height at which to cut the tree. Only possible for ultrametric
  1035. trees.
  1036. Returns
  1037. -------
  1038. cutree : array
  1039. An array indicating group membership at each agglomeration step. I.e.,
  1040. for a full cut tree, in the first column each data point is in its own
  1041. cluster. At the next step, two nodes are merged. Finally, all
  1042. singleton and non-singleton clusters are in one group. If `n_clusters`
  1043. or `height` are given, the columns correspond to the columns of
  1044. `n_clusters` or `height`.
  1045. Examples
  1046. --------
  1047. >>> from scipy import cluster
  1048. >>> import numpy as np
  1049. >>> from numpy.random import default_rng
  1050. >>> rng = default_rng()
  1051. >>> X = rng.random((50, 4))
  1052. >>> Z = cluster.hierarchy.ward(X)
  1053. >>> cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])
  1054. >>> cutree[:10]
  1055. array([[0, 0],
  1056. [1, 1],
  1057. [2, 2],
  1058. [3, 3],
  1059. [3, 4],
  1060. [2, 2],
  1061. [0, 0],
  1062. [1, 5],
  1063. [3, 6],
  1064. [4, 7]]) # random
  1065. """
  1066. xp = array_namespace(Z)
  1067. nobs = num_obs_linkage(Z)
  1068. nodes = _order_cluster_tree(Z)
  1069. if height is not None and n_clusters is not None:
  1070. raise ValueError("At least one of either height or n_clusters "
  1071. "must be None")
  1072. elif height is None and n_clusters is None: # return the full cut tree
  1073. cols_idx = xp.arange(nobs)
  1074. elif height is not None:
  1075. height = xp.asarray(height)
  1076. heights = xp.asarray([x.dist for x in nodes])
  1077. cols_idx = xp.searchsorted(heights, height)
  1078. else:
  1079. n_clusters = xp.asarray(n_clusters)
  1080. cols_idx = nobs - xp.searchsorted(xp.arange(nobs), n_clusters)
  1081. try:
  1082. n_cols = len(cols_idx)
  1083. except TypeError: # scalar
  1084. n_cols = 1
  1085. cols_idx = xp.asarray([cols_idx])
  1086. groups = xp.zeros((n_cols, nobs), dtype=xp.int64)
  1087. last_group = xp.arange(nobs)
  1088. if 0 in cols_idx:
  1089. groups[0] = last_group
  1090. for i, node in enumerate(nodes):
  1091. idx = node.pre_order()
  1092. this_group = xp_copy(last_group, xp=xp)
  1093. # TODO ARRAY_API complex indexing not supported
  1094. this_group[idx] = xp.min(last_group[idx])
  1095. this_group[this_group > xp.max(last_group[idx])] -= 1
  1096. if i + 1 in cols_idx:
  1097. groups[np.nonzero(i + 1 == cols_idx)[0]] = this_group
  1098. last_group = this_group
  1099. return groups.T
  1100. @xp_capabilities(jax_jit=False, allow_dask_compute=True)
  1101. def to_tree(Z, rd=False):
  1102. """
  1103. Convert a linkage matrix into an easy-to-use tree object.
  1104. The reference to the root `ClusterNode` object is returned (by default).
  1105. Each `ClusterNode` object has a ``left``, ``right``, ``dist``, ``id``,
  1106. and ``count`` attribute. The left and right attributes point to
  1107. ClusterNode objects that were combined to generate the cluster.
  1108. If both are None then the `ClusterNode` object is a leaf node, its count
  1109. must be 1, and its distance is meaningless but set to 0.
  1110. *Note: This function is provided for the convenience of the library
  1111. user. ClusterNodes are not used as input to any of the functions in this
  1112. library.*
  1113. Parameters
  1114. ----------
  1115. Z : ndarray
  1116. The linkage matrix in proper form (see the `linkage`
  1117. function documentation).
  1118. rd : bool, optional
  1119. When False (default), a reference to the root `ClusterNode` object is
  1120. returned. Otherwise, a tuple ``(r, d)`` is returned. ``r`` is a
  1121. reference to the root node while ``d`` is a list of `ClusterNode`
  1122. objects - one per original entry in the linkage matrix plus entries
  1123. for all clustering steps. If a cluster id is
  1124. less than the number of samples ``n`` in the data that the linkage
  1125. matrix describes, then it corresponds to a singleton cluster (leaf
  1126. node).
  1127. See `linkage` for more information on the assignment of cluster ids
  1128. to clusters.
  1129. Returns
  1130. -------
  1131. tree : ClusterNode or tuple (ClusterNode, list of ClusterNode)
  1132. If ``rd`` is False, a `ClusterNode`.
  1133. If ``rd`` is True, a list of length ``2*n - 1``, with ``n`` the number
  1134. of samples. See the description of `rd` above for more details.
  1135. See Also
  1136. --------
  1137. linkage, is_valid_linkage, ClusterNode
  1138. Examples
  1139. --------
  1140. >>> import numpy as np
  1141. >>> from scipy.cluster import hierarchy
  1142. >>> rng = np.random.default_rng()
  1143. >>> x = rng.random((5, 2))
  1144. >>> Z = hierarchy.linkage(x)
  1145. >>> hierarchy.to_tree(Z)
  1146. <scipy.cluster.hierarchy.ClusterNode object at ...
  1147. >>> rootnode, nodelist = hierarchy.to_tree(Z, rd=True)
  1148. >>> rootnode
  1149. <scipy.cluster.hierarchy.ClusterNode object at ...
  1150. >>> len(nodelist)
  1151. 9
  1152. """
  1153. xp = array_namespace(Z)
  1154. Z = _asarray(Z, order='C', xp=xp)
  1155. _is_valid_linkage(Z, throw=True, name='Z', materialize=True, xp=xp)
  1156. # Number of original objects is equal to the number of rows plus 1.
  1157. n = Z.shape[0] + 1
  1158. # Create a list full of None's to store the node objects
  1159. d = [None] * (n * 2 - 1)
  1160. # Create the nodes corresponding to the n original objects.
  1161. for i in range(0, n):
  1162. d[i] = ClusterNode(i)
  1163. nd = None
  1164. for i in range(Z.shape[0]):
  1165. row = Z[i, :]
  1166. fi = int_floor(row[0], xp)
  1167. fj = int_floor(row[1], xp)
  1168. if fi > i + n:
  1169. raise ValueError('Corrupt matrix Z. Index to derivative cluster '
  1170. f'is used before it is formed. See row {fi}, '
  1171. 'column 0')
  1172. if fj > i + n:
  1173. raise ValueError('Corrupt matrix Z. Index to derivative cluster '
  1174. f'is used before it is formed. See row {fj}, '
  1175. 'column 1')
  1176. nd = ClusterNode(i + n, d[fi], d[fj], row[2])
  1177. # ^ id ^ left ^ right ^ dist
  1178. if row[3] != nd.count:
  1179. raise ValueError(f'Corrupt matrix Z. The count Z[{i},3] is '
  1180. 'incorrect.')
  1181. d[n + i] = nd
  1182. if rd:
  1183. return (nd, d)
  1184. else:
  1185. return nd
  1186. @lazy_cython
  1187. def optimal_leaf_ordering(Z, y, metric='euclidean'):
  1188. """
  1189. Given a linkage matrix Z and distance, reorder the cut tree.
  1190. Parameters
  1191. ----------
  1192. Z : ndarray
  1193. The hierarchical clustering encoded as a linkage matrix. See
  1194. `linkage` for more information on the return structure and
  1195. algorithm.
  1196. y : ndarray
  1197. The condensed distance matrix from which Z was generated.
  1198. Alternatively, a collection of m observation vectors in n
  1199. dimensions may be passed as an m by n array.
  1200. metric : str or function, optional
  1201. The distance metric to use in the case that y is a collection of
  1202. observation vectors; ignored otherwise. See the ``pdist``
  1203. function for a list of valid distance metrics. A custom distance
  1204. function can also be used.
  1205. Returns
  1206. -------
  1207. Z_ordered : ndarray
  1208. A copy of the linkage matrix Z, reordered to minimize the distance
  1209. between adjacent leaves.
  1210. Examples
  1211. --------
  1212. >>> import numpy as np
  1213. >>> from scipy.cluster import hierarchy
  1214. >>> rng = np.random.default_rng()
  1215. >>> X = rng.standard_normal((10, 10))
  1216. >>> Z = hierarchy.ward(X)
  1217. >>> hierarchy.leaves_list(Z)
  1218. array([0, 3, 1, 9, 2, 5, 7, 4, 6, 8], dtype=int32)
  1219. >>> hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X))
  1220. array([3, 0, 2, 5, 7, 4, 8, 6, 9, 1], dtype=int32)
  1221. """
  1222. xp = array_namespace(Z, y)
  1223. Z = _asarray(Z, order='C', xp=xp)
  1224. y = _asarray(y, order='C', dtype=xp.float64, xp=xp)
  1225. lazy = is_lazy_array(Z)
  1226. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  1227. if y.ndim == 1:
  1228. distance.is_valid_y(y, throw=True, name='y')
  1229. elif y.ndim == 2:
  1230. if (not lazy and y.shape[0] == y.shape[1]
  1231. and xp.all(xpx.isclose(xp.linalg.diagonal(y), 0))
  1232. and xp.all(y >= 0) and xp.all(xpx.isclose(y, y.T))):
  1233. warnings.warn('The symmetric non-negative hollow observation '
  1234. 'matrix looks suspiciously like an uncondensed '
  1235. 'distance matrix',
  1236. ClusterWarning, stacklevel=2)
  1237. y = distance.pdist(y, metric)
  1238. else:
  1239. raise ValueError("`y` must be 1 or 2 dimensional.")
  1240. if not lazy and not xp.all(xp.isfinite(y)):
  1241. raise ValueError("The condensed distance matrix must contain only "
  1242. "finite values.")
  1243. # The function name is prominently visible on the user-facing Dask dashboard;
  1244. # make sure it is meaningful.
  1245. def cy_optimal_leaf_ordering(Z, y, validate):
  1246. if validate:
  1247. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  1248. if not np.all(np.isfinite(y)):
  1249. raise ValueError("The condensed distance matrix must contain only "
  1250. "finite values.")
  1251. return _optimal_leaf_ordering.optimal_leaf_ordering(Z, y)
  1252. return xpx.lazy_apply(cy_optimal_leaf_ordering, Z, y, validate=lazy,
  1253. shape=Z.shape, dtype=Z.dtype, as_numpy=True, xp=xp)
  1254. @lazy_cython
  1255. def cophenet(Z, Y=None):
  1256. """
  1257. Calculate the cophenetic distances between each observation in
  1258. the hierarchical clustering defined by the linkage ``Z``.
  1259. Suppose ``p`` and ``q`` are original observations in
  1260. disjoint clusters ``s`` and ``t``, respectively and
  1261. ``s`` and ``t`` are joined by a direct parent cluster
  1262. ``u``. The cophenetic distance between observations
  1263. ``i`` and ``j`` is simply the distance between
  1264. clusters ``s`` and ``t``.
  1265. Parameters
  1266. ----------
  1267. Z : ndarray
  1268. The hierarchical clustering encoded as an array
  1269. (see `linkage` function).
  1270. Y : ndarray (optional)
  1271. Calculates the cophenetic correlation coefficient ``c`` of a
  1272. hierarchical clustering defined by the linkage matrix `Z`
  1273. of a set of :math:`n` observations in :math:`m`
  1274. dimensions. `Y` is the condensed distance matrix from which
  1275. `Z` was generated.
  1276. Returns
  1277. -------
  1278. c : ndarray
  1279. The cophentic correlation distance (if ``Y`` is passed).
  1280. d : ndarray
  1281. The cophenetic distance matrix in condensed form. The
  1282. :math:`ij` th entry is the cophenetic distance between
  1283. original observations :math:`i` and :math:`j`.
  1284. See Also
  1285. --------
  1286. linkage :
  1287. for a description of what a linkage matrix is.
  1288. scipy.spatial.distance.squareform :
  1289. transforming condensed matrices into square ones.
  1290. Examples
  1291. --------
  1292. >>> from scipy.cluster.hierarchy import single, cophenet
  1293. >>> from scipy.spatial.distance import pdist, squareform
  1294. Given a dataset ``X`` and a linkage matrix ``Z``, the cophenetic distance
  1295. between two points of ``X`` is the distance between the largest two
  1296. distinct clusters that each of the points:
  1297. >>> X = [[0, 0], [0, 1], [1, 0],
  1298. ... [0, 4], [0, 3], [1, 4],
  1299. ... [4, 0], [3, 0], [4, 1],
  1300. ... [4, 4], [3, 4], [4, 3]]
  1301. ``X`` corresponds to this dataset ::
  1302. x x x x
  1303. x x
  1304. x x
  1305. x x x x
  1306. >>> Z = single(pdist(X))
  1307. >>> Z
  1308. array([[ 0., 1., 1., 2.],
  1309. [ 2., 12., 1., 3.],
  1310. [ 3., 4., 1., 2.],
  1311. [ 5., 14., 1., 3.],
  1312. [ 6., 7., 1., 2.],
  1313. [ 8., 16., 1., 3.],
  1314. [ 9., 10., 1., 2.],
  1315. [11., 18., 1., 3.],
  1316. [13., 15., 2., 6.],
  1317. [17., 20., 2., 9.],
  1318. [19., 21., 2., 12.]])
  1319. >>> cophenet(Z)
  1320. array([1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2.,
  1321. 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2.,
  1322. 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
  1323. 1., 1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1.])
  1324. The output of the `scipy.cluster.hierarchy.cophenet` method is
  1325. represented in condensed form. We can use
  1326. `scipy.spatial.distance.squareform` to see the output as a
  1327. regular matrix (where each element ``ij`` denotes the cophenetic distance
  1328. between each ``i``, ``j`` pair of points in ``X``):
  1329. >>> squareform(cophenet(Z))
  1330. array([[0., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
  1331. [1., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
  1332. [1., 1., 0., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
  1333. [2., 2., 2., 0., 1., 1., 2., 2., 2., 2., 2., 2.],
  1334. [2., 2., 2., 1., 0., 1., 2., 2., 2., 2., 2., 2.],
  1335. [2., 2., 2., 1., 1., 0., 2., 2., 2., 2., 2., 2.],
  1336. [2., 2., 2., 2., 2., 2., 0., 1., 1., 2., 2., 2.],
  1337. [2., 2., 2., 2., 2., 2., 1., 0., 1., 2., 2., 2.],
  1338. [2., 2., 2., 2., 2., 2., 1., 1., 0., 2., 2., 2.],
  1339. [2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 1., 1.],
  1340. [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 0., 1.],
  1341. [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 0.]])
  1342. In this example, the cophenetic distance between points on ``X`` that are
  1343. very close (i.e., in the same corner) is 1. For other pairs of points is 2,
  1344. because the points will be located in clusters at different
  1345. corners - thus, the distance between these clusters will be larger.
  1346. """
  1347. xp = array_namespace(Z, Y)
  1348. # Ensure float64 C-contiguous array. Cython code doesn't deal with striding.
  1349. Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
  1350. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  1351. def cy_cophenet(Z, validate):
  1352. if validate:
  1353. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  1354. n = Z.shape[0] + 1
  1355. zz = np.zeros((n * (n-1)) // 2, dtype=np.float64)
  1356. _hierarchy.cophenetic_distances(Z, zz, n)
  1357. return zz
  1358. n = Z.shape[0] + 1
  1359. zz = xpx.lazy_apply(cy_cophenet, Z, validate=is_lazy_array(Z),
  1360. shape=((n * (n-1)) // 2, ), dtype=xp.float64,
  1361. as_numpy=True, xp=xp)
  1362. if Y is None:
  1363. return zz
  1364. Y = _asarray(Y, order='C', xp=xp)
  1365. distance.is_valid_y(Y, throw=True, name='Y')
  1366. z = xp.mean(zz)
  1367. y = xp.mean(Y)
  1368. Yy = Y - y
  1369. Zz = zz - z
  1370. numerator = (Yy * Zz)
  1371. denomA = Yy**2
  1372. denomB = Zz**2
  1373. c = xp.sum(numerator) / xp.sqrt(xp.sum(denomA) * xp.sum(denomB))
  1374. return (c, zz)
  1375. @lazy_cython
  1376. def inconsistent(Z, d=2):
  1377. r"""
  1378. Calculate inconsistency statistics on a linkage matrix.
  1379. Parameters
  1380. ----------
  1381. Z : ndarray
  1382. The :math:`(n-1)` by 4 matrix encoding the linkage (hierarchical
  1383. clustering). See `linkage` documentation for more information on its
  1384. form.
  1385. d : int, optional
  1386. The number of links up to `d` levels below each non-singleton cluster.
  1387. Returns
  1388. -------
  1389. R : ndarray
  1390. A :math:`(n-1)` by 4 matrix where the ``i``'th row contains the link
  1391. statistics for the non-singleton cluster ``i``. The link statistics are
  1392. computed over the link heights for links :math:`d` levels below the
  1393. cluster ``i``. ``R[i,0]`` and ``R[i,1]`` are the mean and standard
  1394. deviation of the link heights, respectively; ``R[i,2]`` is the number
  1395. of links included in the calculation; and ``R[i,3]`` is the
  1396. inconsistency coefficient,
  1397. .. math:: \frac{\mathtt{Z[i,2]} - \mathtt{R[i,0]}} {R[i,1]}
  1398. Notes
  1399. -----
  1400. This function behaves similarly to the MATLAB(TM) ``inconsistent``
  1401. function.
  1402. Examples
  1403. --------
  1404. >>> from scipy.cluster.hierarchy import inconsistent, linkage
  1405. >>> from matplotlib import pyplot as plt
  1406. >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
  1407. >>> Z = linkage(X, 'ward')
  1408. >>> print(Z)
  1409. [[ 5. 6. 0. 2. ]
  1410. [ 2. 7. 0. 2. ]
  1411. [ 0. 4. 1. 2. ]
  1412. [ 1. 8. 1.15470054 3. ]
  1413. [ 9. 10. 2.12132034 4. ]
  1414. [ 3. 12. 4.11096096 5. ]
  1415. [11. 13. 14.07183949 8. ]]
  1416. >>> inconsistent(Z)
  1417. array([[ 0. , 0. , 1. , 0. ],
  1418. [ 0. , 0. , 1. , 0. ],
  1419. [ 1. , 0. , 1. , 0. ],
  1420. [ 0.57735027, 0.81649658, 2. , 0.70710678],
  1421. [ 1.04044011, 1.06123822, 3. , 1.01850858],
  1422. [ 3.11614065, 1.40688837, 2. , 0.70710678],
  1423. [ 6.44583366, 6.76770586, 3. , 1.12682288]])
  1424. """
  1425. xp = array_namespace(Z)
  1426. Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
  1427. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  1428. if d != np.floor(d) or d < 0:
  1429. raise ValueError('The second argument d must be a nonnegative '
  1430. 'integer value.')
  1431. def cy_inconsistent(Z, d, validate):
  1432. if validate:
  1433. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  1434. R = np.zeros((Z.shape[0], 4), dtype=np.float64)
  1435. n = Z.shape[0] + 1
  1436. _hierarchy.inconsistent(Z, R, n, d)
  1437. return R
  1438. return xpx.lazy_apply(cy_inconsistent, Z, d=int(d), validate=is_lazy_array(Z),
  1439. shape=(Z.shape[0], 4), dtype=xp.float64,
  1440. as_numpy=True, xp=xp)
  1441. @lazy_cython
  1442. def from_mlab_linkage(Z):
  1443. """
  1444. Convert a linkage matrix generated by MATLAB(TM) to a new
  1445. linkage matrix compatible with this module.
  1446. The conversion does two things:
  1447. * the indices are converted from ``1..N`` to ``0..(N-1)`` form,
  1448. and
  1449. * a fourth column ``Z[:,3]`` is added where ``Z[i,3]`` represents the
  1450. number of original observations (leaves) in the non-singleton
  1451. cluster ``i``.
  1452. This function is useful when loading in linkages from legacy data
  1453. files generated by MATLAB.
  1454. Parameters
  1455. ----------
  1456. Z : ndarray
  1457. A linkage matrix generated by MATLAB(TM).
  1458. Returns
  1459. -------
  1460. ZS : ndarray
  1461. A linkage matrix compatible with ``scipy.cluster.hierarchy``.
  1462. See Also
  1463. --------
  1464. linkage : for a description of what a linkage matrix is.
  1465. to_mlab_linkage : transform from SciPy to MATLAB format.
  1466. Examples
  1467. --------
  1468. >>> import numpy as np
  1469. >>> from scipy.cluster.hierarchy import ward, from_mlab_linkage
  1470. Given a linkage matrix in MATLAB format ``mZ``, we can use
  1471. `scipy.cluster.hierarchy.from_mlab_linkage` to import
  1472. it into SciPy format:
  1473. >>> mZ = np.array([[1, 2, 1], [4, 5, 1], [7, 8, 1],
  1474. ... [10, 11, 1], [3, 13, 1.29099445],
  1475. ... [6, 14, 1.29099445],
  1476. ... [9, 15, 1.29099445],
  1477. ... [12, 16, 1.29099445],
  1478. ... [17, 18, 5.77350269],
  1479. ... [19, 20, 5.77350269],
  1480. ... [21, 22, 8.16496581]])
  1481. >>> Z = from_mlab_linkage(mZ)
  1482. >>> Z
  1483. array([[ 0. , 1. , 1. , 2. ],
  1484. [ 3. , 4. , 1. , 2. ],
  1485. [ 6. , 7. , 1. , 2. ],
  1486. [ 9. , 10. , 1. , 2. ],
  1487. [ 2. , 12. , 1.29099445, 3. ],
  1488. [ 5. , 13. , 1.29099445, 3. ],
  1489. [ 8. , 14. , 1.29099445, 3. ],
  1490. [ 11. , 15. , 1.29099445, 3. ],
  1491. [ 16. , 17. , 5.77350269, 6. ],
  1492. [ 18. , 19. , 5.77350269, 6. ],
  1493. [ 20. , 21. , 8.16496581, 12. ]])
  1494. As expected, the linkage matrix ``Z`` returned includes an
  1495. additional column counting the number of original samples in
  1496. each cluster. Also, all cluster indices are reduced by 1
  1497. (MATLAB format uses 1-indexing, whereas SciPy uses 0-indexing).
  1498. """
  1499. xp = array_namespace(Z)
  1500. Z = _asarray(Z, dtype=xp.float64, order='C', xp=xp)
  1501. # If it's empty, return it.
  1502. if Z.shape in ((), (0, )):
  1503. return xp_copy(Z, xp=xp)
  1504. if Z.ndim != 2:
  1505. raise ValueError("The linkage array must be rectangular.")
  1506. # If it contains no rows, return it.
  1507. n = Z.shape[0]
  1508. if n == 0:
  1509. return xp_copy(Z, xp=xp)
  1510. lazy = is_lazy_array(Z)
  1511. if not lazy and xp.min(Z[:, :2]) != 1.0 and xp.max(Z[:, :2]) != 2 * n:
  1512. raise ValueError('The format of the indices is not 1..N')
  1513. res = xp.empty((Z.shape[0], Z.shape[1] + 1), dtype=Z.dtype)
  1514. res = xpx.at(res)[:, :2].set(Z[:, :2] - 1.0)
  1515. res = xpx.at(res)[:, 2:-1].set(Z[:, 2:])
  1516. def cy_from_mlab_linkage(Zpart, validate):
  1517. n = Zpart.shape[0]
  1518. if validate and np.min(Zpart[:, :2]) != 0.0 and np.max(Zpart[:, :2]) != 2 * n:
  1519. raise ValueError('The format of the indices is not 1..N')
  1520. if not Zpart.flags.writeable:
  1521. Zpart = Zpart.copy() # xp=jax.numpy
  1522. CS = np.zeros((n,))
  1523. _hierarchy.calculate_cluster_sizes(Zpart, CS, n + 1)
  1524. return CS
  1525. CS = xpx.lazy_apply(cy_from_mlab_linkage, res[:, :-1], validate=lazy,
  1526. shape=(res.shape[0],), dtype=xp.float64,
  1527. as_numpy=True, xp=xp)
  1528. return xpx.at(res)[:, -1].set(CS)
  1529. @xp_capabilities()
  1530. def to_mlab_linkage(Z):
  1531. """
  1532. Convert a linkage matrix to a MATLAB(TM) compatible one.
  1533. Converts a linkage matrix ``Z`` generated by the linkage function
  1534. of this module to a MATLAB(TM) compatible one. The return linkage
  1535. matrix has the last column removed and the cluster indices are
  1536. converted to ``1..N`` indexing.
  1537. Parameters
  1538. ----------
  1539. Z : ndarray
  1540. A linkage matrix generated by ``scipy.cluster.hierarchy``.
  1541. Returns
  1542. -------
  1543. to_mlab_linkage : ndarray
  1544. A linkage matrix compatible with MATLAB(TM)'s hierarchical
  1545. clustering functions.
  1546. The return linkage matrix has the last column removed
  1547. and the cluster indices are converted to ``1..N`` indexing.
  1548. See Also
  1549. --------
  1550. linkage : for a description of what a linkage matrix is.
  1551. from_mlab_linkage : transform from Matlab to SciPy format.
  1552. Examples
  1553. --------
  1554. >>> from scipy.cluster.hierarchy import ward, to_mlab_linkage
  1555. >>> from scipy.spatial.distance import pdist
  1556. >>> X = [[0, 0], [0, 1], [1, 0],
  1557. ... [0, 4], [0, 3], [1, 4],
  1558. ... [4, 0], [3, 0], [4, 1],
  1559. ... [4, 4], [3, 4], [4, 3]]
  1560. >>> Z = ward(pdist(X))
  1561. >>> Z
  1562. array([[ 0. , 1. , 1. , 2. ],
  1563. [ 3. , 4. , 1. , 2. ],
  1564. [ 6. , 7. , 1. , 2. ],
  1565. [ 9. , 10. , 1. , 2. ],
  1566. [ 2. , 12. , 1.29099445, 3. ],
  1567. [ 5. , 13. , 1.29099445, 3. ],
  1568. [ 8. , 14. , 1.29099445, 3. ],
  1569. [11. , 15. , 1.29099445, 3. ],
  1570. [16. , 17. , 5.77350269, 6. ],
  1571. [18. , 19. , 5.77350269, 6. ],
  1572. [20. , 21. , 8.16496581, 12. ]])
  1573. After a linkage matrix ``Z`` has been created, we can use
  1574. `scipy.cluster.hierarchy.to_mlab_linkage` to convert it
  1575. into MATLAB format:
  1576. >>> mZ = to_mlab_linkage(Z)
  1577. >>> mZ
  1578. array([[ 1. , 2. , 1. ],
  1579. [ 4. , 5. , 1. ],
  1580. [ 7. , 8. , 1. ],
  1581. [ 10. , 11. , 1. ],
  1582. [ 3. , 13. , 1.29099445],
  1583. [ 6. , 14. , 1.29099445],
  1584. [ 9. , 15. , 1.29099445],
  1585. [ 12. , 16. , 1.29099445],
  1586. [ 17. , 18. , 5.77350269],
  1587. [ 19. , 20. , 5.77350269],
  1588. [ 21. , 22. , 8.16496581]])
  1589. The new linkage matrix ``mZ`` uses 1-indexing for all the
  1590. clusters (instead of 0-indexing). Also, the last column of
  1591. the original linkage matrix has been dropped.
  1592. """
  1593. xp = array_namespace(Z)
  1594. Z = _asarray(Z, dtype=xp.float64, xp=xp)
  1595. if Z.shape in ((), (0, )):
  1596. return xp_copy(Z, xp=xp)
  1597. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  1598. return xp.concat((Z[:, :2] + 1.0, Z[:, 2:3]), axis=1)
  1599. @xp_capabilities()
  1600. def is_monotonic(Z):
  1601. """
  1602. Return True if the linkage passed is monotonic.
  1603. The linkage is monotonic if for every cluster :math:`s` and :math:`t`
  1604. joined, the distance between them is no less than the distance
  1605. between any previously joined clusters.
  1606. Parameters
  1607. ----------
  1608. Z : ndarray
  1609. The linkage matrix to check for monotonicity.
  1610. Returns
  1611. -------
  1612. b : bool
  1613. A boolean indicating whether the linkage is monotonic.
  1614. See Also
  1615. --------
  1616. linkage : for a description of what a linkage matrix is.
  1617. Examples
  1618. --------
  1619. >>> from scipy.cluster.hierarchy import median, ward, is_monotonic
  1620. >>> from scipy.spatial.distance import pdist
  1621. By definition, some hierarchical clustering algorithms - such as
  1622. `scipy.cluster.hierarchy.ward` - produce monotonic assignments of
  1623. samples to clusters; however, this is not always true for other
  1624. hierarchical methods - e.g. `scipy.cluster.hierarchy.median`.
  1625. Given a linkage matrix ``Z`` (as the result of a hierarchical clustering
  1626. method) we can test programmatically whether it has the monotonicity
  1627. property or not, using `scipy.cluster.hierarchy.is_monotonic`:
  1628. >>> X = [[0, 0], [0, 1], [1, 0],
  1629. ... [0, 4], [0, 3], [1, 4],
  1630. ... [4, 0], [3, 0], [4, 1],
  1631. ... [4, 4], [3, 4], [4, 3]]
  1632. >>> Z = ward(pdist(X))
  1633. >>> Z
  1634. array([[ 0. , 1. , 1. , 2. ],
  1635. [ 3. , 4. , 1. , 2. ],
  1636. [ 6. , 7. , 1. , 2. ],
  1637. [ 9. , 10. , 1. , 2. ],
  1638. [ 2. , 12. , 1.29099445, 3. ],
  1639. [ 5. , 13. , 1.29099445, 3. ],
  1640. [ 8. , 14. , 1.29099445, 3. ],
  1641. [11. , 15. , 1.29099445, 3. ],
  1642. [16. , 17. , 5.77350269, 6. ],
  1643. [18. , 19. , 5.77350269, 6. ],
  1644. [20. , 21. , 8.16496581, 12. ]])
  1645. >>> is_monotonic(Z)
  1646. True
  1647. >>> Z = median(pdist(X))
  1648. >>> Z
  1649. array([[ 0. , 1. , 1. , 2. ],
  1650. [ 3. , 4. , 1. , 2. ],
  1651. [ 9. , 10. , 1. , 2. ],
  1652. [ 6. , 7. , 1. , 2. ],
  1653. [ 2. , 12. , 1.11803399, 3. ],
  1654. [ 5. , 13. , 1.11803399, 3. ],
  1655. [ 8. , 15. , 1.11803399, 3. ],
  1656. [11. , 14. , 1.11803399, 3. ],
  1657. [18. , 19. , 3. , 6. ],
  1658. [16. , 17. , 3.5 , 6. ],
  1659. [20. , 21. , 3.25 , 12. ]])
  1660. >>> is_monotonic(Z)
  1661. False
  1662. Note that this method is equivalent to just verifying that the distances
  1663. in the third column of the linkage matrix appear in a monotonically
  1664. increasing order.
  1665. """
  1666. xp = array_namespace(Z)
  1667. Z = _asarray(Z, xp=xp)
  1668. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  1669. # We expect the i'th value to be greater than its successor.
  1670. return xp.all(Z[1:, 2] >= Z[:-1, 2])
  1671. @xp_capabilities(warnings=[("dask.array", "see notes"), ("jax.numpy", "see notes")])
  1672. def is_valid_im(R, warning=False, throw=False, name=None):
  1673. """Return True if the inconsistency matrix passed is valid.
  1674. It must be a :math:`n` by 4 array of doubles. The standard
  1675. deviations ``R[:,1]`` must be nonnegative. The link counts
  1676. ``R[:,2]`` must be positive and no greater than :math:`n-1`.
  1677. Parameters
  1678. ----------
  1679. R : ndarray
  1680. The inconsistency matrix to check for validity.
  1681. warning : bool, optional
  1682. When True, issues a Python warning if the linkage
  1683. matrix passed is invalid.
  1684. throw : bool, optional
  1685. When True, throws a Python exception if the linkage
  1686. matrix passed is invalid.
  1687. name : str, optional
  1688. This string refers to the variable name of the invalid
  1689. linkage matrix.
  1690. Returns
  1691. -------
  1692. b : bool
  1693. True if the inconsistency matrix is valid; False otherwise.
  1694. Notes
  1695. -----
  1696. *Array API support (experimental):* If the input is a lazy Array (e.g. Dask
  1697. or JAX), the return value may be a 0-dimensional bool Array. When warning=True
  1698. or throw=True, calling this function materializes the array.
  1699. See Also
  1700. --------
  1701. linkage : for a description of what a linkage matrix is.
  1702. inconsistent : for the creation of a inconsistency matrix.
  1703. Examples
  1704. --------
  1705. >>> from scipy.cluster.hierarchy import ward, inconsistent, is_valid_im
  1706. >>> from scipy.spatial.distance import pdist
  1707. Given a data set ``X``, we can apply a clustering method to obtain a
  1708. linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
  1709. be also used to obtain the inconsistency matrix ``R`` associated to
  1710. this clustering process:
  1711. >>> X = [[0, 0], [0, 1], [1, 0],
  1712. ... [0, 4], [0, 3], [1, 4],
  1713. ... [4, 0], [3, 0], [4, 1],
  1714. ... [4, 4], [3, 4], [4, 3]]
  1715. >>> Z = ward(pdist(X))
  1716. >>> R = inconsistent(Z)
  1717. >>> Z
  1718. array([[ 0. , 1. , 1. , 2. ],
  1719. [ 3. , 4. , 1. , 2. ],
  1720. [ 6. , 7. , 1. , 2. ],
  1721. [ 9. , 10. , 1. , 2. ],
  1722. [ 2. , 12. , 1.29099445, 3. ],
  1723. [ 5. , 13. , 1.29099445, 3. ],
  1724. [ 8. , 14. , 1.29099445, 3. ],
  1725. [11. , 15. , 1.29099445, 3. ],
  1726. [16. , 17. , 5.77350269, 6. ],
  1727. [18. , 19. , 5.77350269, 6. ],
  1728. [20. , 21. , 8.16496581, 12. ]])
  1729. >>> R
  1730. array([[1. , 0. , 1. , 0. ],
  1731. [1. , 0. , 1. , 0. ],
  1732. [1. , 0. , 1. , 0. ],
  1733. [1. , 0. , 1. , 0. ],
  1734. [1.14549722, 0.20576415, 2. , 0.70710678],
  1735. [1.14549722, 0.20576415, 2. , 0.70710678],
  1736. [1.14549722, 0.20576415, 2. , 0.70710678],
  1737. [1.14549722, 0.20576415, 2. , 0.70710678],
  1738. [2.78516386, 2.58797734, 3. , 1.15470054],
  1739. [2.78516386, 2.58797734, 3. , 1.15470054],
  1740. [6.57065706, 1.38071187, 3. , 1.15470054]])
  1741. Now we can use `scipy.cluster.hierarchy.is_valid_im` to verify that
  1742. ``R`` is correct:
  1743. >>> is_valid_im(R)
  1744. True
  1745. However, if ``R`` is wrongly constructed (e.g., one of the standard
  1746. deviations is set to a negative value), then the check will fail:
  1747. >>> R[-1,1] = R[-1,1] * -1
  1748. >>> is_valid_im(R)
  1749. False
  1750. """
  1751. xp = array_namespace(R)
  1752. R = _asarray(R, xp=xp)
  1753. return _is_valid_im(R, warning=warning, throw=throw, name=name,
  1754. materialize=True, xp=xp)
  1755. def _is_valid_im(R, warning=False, throw=False, name=None, materialize=False, *, xp):
  1756. """Variant of `is_valid_im` to be called internally by other scipy functions,
  1757. which by default does not materialize lazy input arrays (Dask, JAX, etc.) when
  1758. warning=True or throw=True.
  1759. """
  1760. name_str = f"{name!r} " if name else ''
  1761. try:
  1762. if R.dtype != xp.float64:
  1763. raise TypeError(f'Inconsistency matrix {name_str}must contain doubles '
  1764. '(double).')
  1765. if len(R.shape) != 2:
  1766. raise ValueError(f'Inconsistency matrix {name_str}must have shape=2 (i.e. '
  1767. 'be two-dimensional).')
  1768. if R.shape[1] != 4:
  1769. raise ValueError(f'Inconsistency matrix {name_str}'
  1770. 'must have 4 columns.')
  1771. if R.shape[0] < 1:
  1772. raise ValueError(f'Inconsistency matrix {name_str}'
  1773. 'must have at least one row.')
  1774. except (TypeError, ValueError) as e:
  1775. if throw:
  1776. raise
  1777. if warning:
  1778. _warning(str(e))
  1779. return False
  1780. return _lazy_valid_checks(
  1781. (xp.any(R[:, 0] < 0),
  1782. f'Inconsistency matrix {name_str} contains negative link height means.'),
  1783. (xp.any(R[:, 1] < 0),
  1784. f'Inconsistency matrix {name_str} contains negative link height standard '
  1785. 'deviations.'),
  1786. (xp.any(R[:, 2] < 0),
  1787. f'Inconsistency matrix {name_str} contains negative link counts.'),
  1788. throw=throw, warning=warning, materialize=materialize, xp=xp
  1789. )
  1790. @xp_capabilities(warnings=[("dask.array", "see notes"), ("jax.numpy", "see notes")])
  1791. def is_valid_linkage(Z, warning=False, throw=False, name=None):
  1792. """
  1793. Check the validity of a linkage matrix.
  1794. A linkage matrix is valid if it is a 2-D array (type double)
  1795. with :math:`n` rows and 4 columns. The first two columns must contain
  1796. indices between 0 and :math:`2n-1`. For a given row ``i``, the following
  1797. two expressions have to hold:
  1798. .. math::
  1799. 0 \\leq \\mathtt{Z[i,0]} \\leq i+n-1
  1800. 0 \\leq Z[i,1] \\leq i+n-1
  1801. I.e., a cluster cannot join another cluster unless the cluster being joined
  1802. has been generated.
  1803. The fourth column of `Z` represents the number of original observations
  1804. in a cluster, so a valid ``Z[i, 3]`` value may not exceed the number of
  1805. original observations.
  1806. Parameters
  1807. ----------
  1808. Z : array_like
  1809. Linkage matrix.
  1810. warning : bool, optional
  1811. When True, issues a Python warning if the linkage
  1812. matrix passed is invalid.
  1813. throw : bool, optional
  1814. When True, throws a Python exception if the linkage
  1815. matrix passed is invalid.
  1816. name : str, optional
  1817. This string refers to the variable name of the invalid
  1818. linkage matrix.
  1819. Returns
  1820. -------
  1821. b : bool
  1822. True if the inconsistency matrix is valid; False otherwise.
  1823. Notes
  1824. -----
  1825. *Array API support (experimental):* If the input is a lazy Array (e.g. Dask
  1826. or JAX), the return value may be a 0-dimensional bool Array. When warning=True
  1827. or throw=True, calling this function materializes the array.
  1828. See Also
  1829. --------
  1830. linkage: for a description of what a linkage matrix is.
  1831. Examples
  1832. --------
  1833. >>> from scipy.cluster.hierarchy import ward, is_valid_linkage
  1834. >>> from scipy.spatial.distance import pdist
  1835. All linkage matrices generated by the clustering methods in this module
  1836. will be valid (i.e., they will have the appropriate dimensions and the two
  1837. required expressions will hold for all the rows).
  1838. We can check this using `scipy.cluster.hierarchy.is_valid_linkage`:
  1839. >>> X = [[0, 0], [0, 1], [1, 0],
  1840. ... [0, 4], [0, 3], [1, 4],
  1841. ... [4, 0], [3, 0], [4, 1],
  1842. ... [4, 4], [3, 4], [4, 3]]
  1843. >>> Z = ward(pdist(X))
  1844. >>> Z
  1845. array([[ 0. , 1. , 1. , 2. ],
  1846. [ 3. , 4. , 1. , 2. ],
  1847. [ 6. , 7. , 1. , 2. ],
  1848. [ 9. , 10. , 1. , 2. ],
  1849. [ 2. , 12. , 1.29099445, 3. ],
  1850. [ 5. , 13. , 1.29099445, 3. ],
  1851. [ 8. , 14. , 1.29099445, 3. ],
  1852. [11. , 15. , 1.29099445, 3. ],
  1853. [16. , 17. , 5.77350269, 6. ],
  1854. [18. , 19. , 5.77350269, 6. ],
  1855. [20. , 21. , 8.16496581, 12. ]])
  1856. >>> is_valid_linkage(Z)
  1857. True
  1858. However, if we create a linkage matrix in a wrong way - or if we modify
  1859. a valid one in a way that any of the required expressions don't hold
  1860. anymore, then the check will fail:
  1861. >>> Z[3][1] = 20 # the cluster number 20 is not defined at this point
  1862. >>> is_valid_linkage(Z)
  1863. False
  1864. """
  1865. xp = array_namespace(Z)
  1866. Z = _asarray(Z, xp=xp)
  1867. return _is_valid_linkage(Z, warning=warning, throw=throw,
  1868. name=name, materialize=True, xp=xp)
  1869. def _is_valid_linkage(Z, warning=False, throw=False, name=None,
  1870. materialize=False, *, xp):
  1871. """Variant of `is_valid_linkage` to be called internally by other scipy functions,
  1872. which by default does not materialize lazy input arrays (Dask, JAX, etc.) when
  1873. warning=True or throw=True.
  1874. """
  1875. name_str = f"{name!r} " if name else ''
  1876. try:
  1877. if Z.dtype != xp.float64:
  1878. raise TypeError(f'Linkage matrix {name_str}must contain doubles.')
  1879. if len(Z.shape) != 2:
  1880. raise ValueError(f'Linkage matrix {name_str}must have shape=2 (i.e. be'
  1881. ' two-dimensional).')
  1882. if Z.shape[1] != 4:
  1883. raise ValueError(f'Linkage matrix {name_str}must have 4 columns.')
  1884. if Z.shape[0] == 0:
  1885. raise ValueError('Linkage must be computed on at least two '
  1886. 'observations.')
  1887. except (TypeError, ValueError) as e:
  1888. if throw:
  1889. raise
  1890. if warning:
  1891. _warning(str(e))
  1892. return False
  1893. n = Z.shape[0]
  1894. if n < 2:
  1895. return True
  1896. return _lazy_valid_checks(
  1897. (xp.any(Z[:, :2] < 0),
  1898. f'Linkage {name_str}contains negative indices.'),
  1899. (xp.any(Z[:, 2] < 0),
  1900. f'Linkage {name_str}contains negative distances.'),
  1901. (xp.any(Z[:, 3] < 0),
  1902. f'Linkage {name_str}contains negative counts.'),
  1903. (xp.any(Z[:, 3] > n + 1),
  1904. f'Linkage {name_str}contains excessive observations in a cluster'),
  1905. (xp.any(xp.max(Z[:, :2], axis=1) >= xp.arange(n + 1, 2 * n + 1, dtype=Z.dtype)),
  1906. f'Linkage {name_str}uses non-singleton cluster before it is formed.'),
  1907. (xpx.nunique(Z[:, :2]) < n * 2,
  1908. f'Linkage {name_str}uses the same cluster more than once.'),
  1909. throw=throw, warning=warning, materialize=materialize, xp=xp
  1910. )
  1911. def _lazy_valid_checks(*args, throw=False, warning=False, materialize=False, xp):
  1912. """Validate a set of conditions on the contents of possibly lazy arrays.
  1913. Parameters
  1914. ----------
  1915. args : tuples of (Array, str)
  1916. The first element of each tuple must be a 0-dimensional Array
  1917. that evaluates to bool; the second element must be the message to convey
  1918. if the first element evaluates to True.
  1919. throw: bool
  1920. Set to True to `raise ValueError(args[i][1])` if `args[i][0]` is True.
  1921. warning: bool
  1922. Set to True to issue a warning with message `args[i][1]` if `args[i][0]`
  1923. is True.
  1924. materialize: bool
  1925. Set to True to force materialization of lazy arrays when throw=True or
  1926. warning=True. If the inputs are lazy and materialize=False, ignore the
  1927. `throw` and `warning` flags.
  1928. xp: module
  1929. Array API namespace
  1930. Returns
  1931. -------
  1932. If xp is an eager backend (e.g. numpy) and all conditions are False, return True.
  1933. If throw is True, raise. Otherwise, return False.
  1934. If xp is a lazy backend (e.g. Dask or JAX), return a 0-dimensional bool Array.
  1935. """
  1936. conds = xp.concat([xp.reshape(cond, (1, )) for cond, _ in args])
  1937. lazy = is_lazy_array(conds)
  1938. if not throw and not warning or (lazy and not materialize):
  1939. out = ~xp.any(conds)
  1940. return out if lazy else bool(out)
  1941. if is_dask(xp):
  1942. # Only materialize the graph once, instead of once per check
  1943. conds = conds.compute()
  1944. # Don't call np.asarray(conds), as it would be blocked by the device transfer
  1945. # guard on CuPy and PyTorch and the densification guard on Sparse, whereas
  1946. # bool() will not.
  1947. conds = [bool(cond) for cond in conds]
  1948. for cond, (_, msg) in zip(conds, args):
  1949. if throw and cond:
  1950. raise ValueError(msg)
  1951. elif warning and cond:
  1952. warnings.warn(msg, ClusterWarning, stacklevel=3)
  1953. return not any(conds)
  1954. @xp_capabilities()
  1955. def num_obs_linkage(Z):
  1956. """
  1957. Return the number of original observations of the linkage matrix passed.
  1958. Parameters
  1959. ----------
  1960. Z : ndarray
  1961. The linkage matrix on which to perform the operation.
  1962. Returns
  1963. -------
  1964. n : int
  1965. The number of original observations in the linkage.
  1966. Examples
  1967. --------
  1968. >>> from scipy.cluster.hierarchy import ward, num_obs_linkage
  1969. >>> from scipy.spatial.distance import pdist
  1970. >>> X = [[0, 0], [0, 1], [1, 0],
  1971. ... [0, 4], [0, 3], [1, 4],
  1972. ... [4, 0], [3, 0], [4, 1],
  1973. ... [4, 4], [3, 4], [4, 3]]
  1974. >>> Z = ward(pdist(X))
  1975. ``Z`` is a linkage matrix obtained after using the Ward clustering method
  1976. with ``X``, a dataset with 12 data points.
  1977. >>> num_obs_linkage(Z)
  1978. 12
  1979. """
  1980. xp = array_namespace(Z)
  1981. Z = _asarray(Z, xp=xp)
  1982. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  1983. return Z.shape[0] + 1
  1984. @xp_capabilities()
  1985. def correspond(Z, Y):
  1986. """
  1987. Check for correspondence between linkage and condensed distance matrices.
  1988. They must have the same number of original observations for
  1989. the check to succeed.
  1990. This function is useful as a sanity check in algorithms that make
  1991. extensive use of linkage and distance matrices that must
  1992. correspond to the same set of original observations.
  1993. Parameters
  1994. ----------
  1995. Z : array_like
  1996. The linkage matrix to check for correspondence.
  1997. Y : array_like
  1998. The condensed distance matrix to check for correspondence.
  1999. Returns
  2000. -------
  2001. b : bool
  2002. A boolean indicating whether the linkage matrix and distance
  2003. matrix could possibly correspond to one another.
  2004. See Also
  2005. --------
  2006. linkage : for a description of what a linkage matrix is.
  2007. Examples
  2008. --------
  2009. >>> from scipy.cluster.hierarchy import ward, correspond
  2010. >>> from scipy.spatial.distance import pdist
  2011. This method can be used to check if a given linkage matrix ``Z`` has been
  2012. obtained from the application of a cluster method over a dataset ``X``:
  2013. >>> X = [[0, 0], [0, 1], [1, 0],
  2014. ... [0, 4], [0, 3], [1, 4],
  2015. ... [4, 0], [3, 0], [4, 1],
  2016. ... [4, 4], [3, 4], [4, 3]]
  2017. >>> X_condensed = pdist(X)
  2018. >>> Z = ward(X_condensed)
  2019. Here, we can compare ``Z`` and ``X`` (in condensed form):
  2020. >>> correspond(Z, X_condensed)
  2021. True
  2022. """
  2023. xp = array_namespace(Z, Y)
  2024. Z = _asarray(Z, xp=xp)
  2025. Y = _asarray(Y, xp=xp)
  2026. _is_valid_linkage(Z, throw=True, xp=xp)
  2027. distance.is_valid_y(Y, throw=True)
  2028. return distance.num_obs_y(Y) == num_obs_linkage(Z)
  2029. @xp_capabilities(cpu_only=True, reason="Cython code",
  2030. jax_jit=False, allow_dask_compute=True)
  2031. def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None):
  2032. """
  2033. Form flat clusters from the hierarchical clustering defined by
  2034. the given linkage matrix.
  2035. Parameters
  2036. ----------
  2037. Z : ndarray
  2038. The hierarchical clustering encoded with the matrix returned
  2039. by the `linkage` function.
  2040. t : scalar
  2041. For criteria 'inconsistent', 'distance' or 'monocrit',
  2042. this is the threshold to apply when forming flat clusters.
  2043. For 'maxclust' or 'maxclust_monocrit' criteria,
  2044. this would be max number of clusters requested.
  2045. criterion : str, optional
  2046. The criterion to use in forming flat clusters. This can
  2047. be any of the following values:
  2048. ``inconsistent`` :
  2049. If a cluster node and all its
  2050. descendants have an inconsistent value less than or equal
  2051. to `t`, then all its leaf descendants belong to the
  2052. same flat cluster. When no non-singleton cluster meets
  2053. this criterion, every node is assigned to its own
  2054. cluster. (Default)
  2055. ``distance`` :
  2056. Forms flat clusters so that the original
  2057. observations in each flat cluster have no greater a
  2058. cophenetic distance than `t`.
  2059. ``maxclust`` :
  2060. Finds a minimum threshold ``r`` so that
  2061. the cophenetic distance between any two original
  2062. observations in the same flat cluster is no more than
  2063. ``r`` and no more than `t` flat clusters are formed.
  2064. ``monocrit`` :
  2065. Forms a flat cluster from a cluster node c
  2066. with index i when ``monocrit[j] <= t``.
  2067. For example, to threshold on the maximum mean distance
  2068. as computed in the inconsistency matrix R with a
  2069. threshold of 0.8 do::
  2070. MR = maxRstat(Z, R, 3)
  2071. fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR)
  2072. ``maxclust_monocrit`` :
  2073. Forms a flat cluster from a
  2074. non-singleton cluster node ``c`` when ``monocrit[i] <=
  2075. r`` for all cluster indices ``i`` below and including
  2076. ``c``. ``r`` is minimized such that no more than ``t``
  2077. flat clusters are formed. monocrit must be
  2078. monotonic. For example, to minimize the threshold t on
  2079. maximum inconsistency values so that no more than 3 flat
  2080. clusters are formed, do::
  2081. MI = maxinconsts(Z, R)
  2082. fcluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI)
  2083. depth : int, optional
  2084. The maximum depth to perform the inconsistency calculation.
  2085. It has no meaning for the other criteria. Default is 2.
  2086. R : ndarray, optional
  2087. The inconsistency matrix to use for the ``'inconsistent'``
  2088. criterion. This matrix is computed if not provided.
  2089. monocrit : ndarray, optional
  2090. An array of length n-1. `monocrit[i]` is the
  2091. statistics upon which non-singleton i is thresholded. The
  2092. monocrit vector must be monotonic, i.e., given a node c with
  2093. index i, for all node indices j corresponding to nodes
  2094. below c, ``monocrit[i] >= monocrit[j]``.
  2095. Returns
  2096. -------
  2097. fcluster : ndarray
  2098. An array of length ``n``. ``T[i]`` is the flat cluster number to
  2099. which original observation ``i`` belongs.
  2100. See Also
  2101. --------
  2102. linkage : for information about hierarchical clustering methods work.
  2103. Examples
  2104. --------
  2105. >>> from scipy.cluster.hierarchy import ward, fcluster
  2106. >>> from scipy.spatial.distance import pdist
  2107. All cluster linkage methods - e.g., `scipy.cluster.hierarchy.ward`
  2108. generate a linkage matrix ``Z`` as their output:
  2109. >>> X = [[0, 0], [0, 1], [1, 0],
  2110. ... [0, 4], [0, 3], [1, 4],
  2111. ... [4, 0], [3, 0], [4, 1],
  2112. ... [4, 4], [3, 4], [4, 3]]
  2113. >>> Z = ward(pdist(X))
  2114. >>> Z
  2115. array([[ 0. , 1. , 1. , 2. ],
  2116. [ 3. , 4. , 1. , 2. ],
  2117. [ 6. , 7. , 1. , 2. ],
  2118. [ 9. , 10. , 1. , 2. ],
  2119. [ 2. , 12. , 1.29099445, 3. ],
  2120. [ 5. , 13. , 1.29099445, 3. ],
  2121. [ 8. , 14. , 1.29099445, 3. ],
  2122. [11. , 15. , 1.29099445, 3. ],
  2123. [16. , 17. , 5.77350269, 6. ],
  2124. [18. , 19. , 5.77350269, 6. ],
  2125. [20. , 21. , 8.16496581, 12. ]])
  2126. This matrix represents a dendrogram, where the first and second elements
  2127. are the two clusters merged at each step, the third element is the
  2128. distance between these clusters, and the fourth element is the size of
  2129. the new cluster - the number of original data points included.
  2130. `scipy.cluster.hierarchy.fcluster` can be used to flatten the
  2131. dendrogram, obtaining as a result an assignation of the original data
  2132. points to single clusters.
  2133. This assignation mostly depends on a distance threshold ``t`` - the maximum
  2134. inter-cluster distance allowed:
  2135. >>> fcluster(Z, t=0.9, criterion='distance')
  2136. array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
  2137. >>> fcluster(Z, t=1.1, criterion='distance')
  2138. array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32)
  2139. >>> fcluster(Z, t=3, criterion='distance')
  2140. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  2141. >>> fcluster(Z, t=9, criterion='distance')
  2142. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  2143. In the first case, the threshold ``t`` is too small to allow any two
  2144. samples in the data to form a cluster, so 12 different clusters are
  2145. returned.
  2146. In the second case, the threshold is large enough to allow the first
  2147. 4 points to be merged with their nearest neighbors. So, here, only 8
  2148. clusters are returned.
  2149. The third case, with a much higher threshold, allows for up to 8 data
  2150. points to be connected - so 4 clusters are returned here.
  2151. Lastly, the threshold of the fourth case is large enough to allow for
  2152. all data points to be merged together - so a single cluster is returned.
  2153. """
  2154. xp = array_namespace(Z)
  2155. Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
  2156. _is_valid_linkage(Z, throw=True, name='Z', materialize=True, xp=xp)
  2157. n = Z.shape[0] + 1
  2158. T = np.zeros((n,), dtype='i')
  2159. if monocrit is not None:
  2160. monocrit = np.asarray(monocrit, order='C', dtype=np.float64)
  2161. Z = np.asarray(Z)
  2162. monocrit = np.asarray(monocrit)
  2163. if criterion == 'inconsistent':
  2164. if R is None:
  2165. R = inconsistent(Z, depth)
  2166. else:
  2167. R = _asarray(R, order='C', dtype=xp.float64, xp=xp)
  2168. _is_valid_im(R, throw=True, name='R', materialize=True, xp=xp)
  2169. # Since the C code does not support striding using strides.
  2170. # The dimensions are used instead.
  2171. R = np.asarray(R)
  2172. _hierarchy.cluster_in(Z, R, T, float(t), int(n))
  2173. elif criterion == 'distance':
  2174. _hierarchy.cluster_dist(Z, T, float(t), int(n))
  2175. elif criterion == 'maxclust':
  2176. _hierarchy.cluster_maxclust_dist(Z, T, int(n), t)
  2177. elif criterion == 'monocrit':
  2178. _hierarchy.cluster_monocrit(Z, monocrit, T, float(t), int(n))
  2179. elif criterion == 'maxclust_monocrit':
  2180. _hierarchy.cluster_maxclust_monocrit(Z, monocrit, T, int(n), int(t))
  2181. else:
  2182. raise ValueError(f'Invalid cluster formation criterion: {str(criterion)}')
  2183. return xp.asarray(T)
  2184. @xp_capabilities(cpu_only=True, reason="Cython code",
  2185. jax_jit=False, allow_dask_compute=True)
  2186. def fclusterdata(X, t, criterion='inconsistent',
  2187. metric='euclidean', depth=2, method='single', R=None):
  2188. """
  2189. Cluster observation data using a given metric.
  2190. Clusters the original observations in the n-by-m data
  2191. matrix X (n observations in m dimensions), using the euclidean
  2192. distance metric to calculate distances between original observations,
  2193. performs hierarchical clustering using the single linkage algorithm,
  2194. and forms flat clusters using the inconsistency method with `t` as the
  2195. cut-off threshold.
  2196. A 1-D array ``T`` of length ``n`` is returned. ``T[i]`` is
  2197. the index of the flat cluster to which the original observation ``i``
  2198. belongs.
  2199. Parameters
  2200. ----------
  2201. X : (N, M) ndarray
  2202. N by M data matrix with N observations in M dimensions.
  2203. t : scalar
  2204. For criteria 'inconsistent', 'distance' or 'monocrit',
  2205. this is the threshold to apply when forming flat clusters.
  2206. For 'maxclust' or 'maxclust_monocrit' criteria,
  2207. this would be max number of clusters requested.
  2208. criterion : str, optional
  2209. Specifies the criterion for forming flat clusters. Valid
  2210. values are 'inconsistent' (default), 'distance', or 'maxclust'
  2211. cluster formation algorithms. See `fcluster` for descriptions.
  2212. metric : str or function, optional
  2213. The distance metric for calculating pairwise distances. See
  2214. ``distance.pdist`` for descriptions and linkage to verify
  2215. compatibility with the linkage method.
  2216. depth : int, optional
  2217. The maximum depth for the inconsistency calculation. See
  2218. `inconsistent` for more information.
  2219. method : str, optional
  2220. The linkage method to use (single, complete, average,
  2221. weighted, median centroid, ward). See `linkage` for more
  2222. information. Default is "single".
  2223. R : ndarray, optional
  2224. The inconsistency matrix. It will be computed if necessary
  2225. if it is not passed.
  2226. Returns
  2227. -------
  2228. fclusterdata : ndarray
  2229. A vector of length n. T[i] is the flat cluster number to
  2230. which original observation i belongs.
  2231. See Also
  2232. --------
  2233. scipy.spatial.distance.pdist : pairwise distance metrics
  2234. Notes
  2235. -----
  2236. This function is similar to the MATLAB function ``clusterdata``.
  2237. Examples
  2238. --------
  2239. >>> from scipy.cluster.hierarchy import fclusterdata
  2240. This is a convenience method that abstracts all the steps to perform in a
  2241. typical SciPy's hierarchical clustering workflow.
  2242. * Transform the input data into a condensed matrix with
  2243. `scipy.spatial.distance.pdist`.
  2244. * Apply a clustering method.
  2245. * Obtain flat clusters at a user defined distance threshold ``t`` using
  2246. `scipy.cluster.hierarchy.fcluster`.
  2247. >>> X = [[0, 0], [0, 1], [1, 0],
  2248. ... [0, 4], [0, 3], [1, 4],
  2249. ... [4, 0], [3, 0], [4, 1],
  2250. ... [4, 4], [3, 4], [4, 3]]
  2251. >>> fclusterdata(X, t=1)
  2252. array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
  2253. The output here (for the dataset ``X``, distance threshold ``t``, and the
  2254. default settings) is four clusters with three data points each.
  2255. """
  2256. xp = array_namespace(X)
  2257. X = _asarray(X, order='C', dtype=xp.float64, xp=xp)
  2258. if X.ndim != 2:
  2259. raise TypeError('The observation matrix X must be an n by m array.')
  2260. Y = distance.pdist(X, metric=metric)
  2261. Z = linkage(Y, method=method)
  2262. if R is None:
  2263. R = inconsistent(Z, d=depth)
  2264. else:
  2265. R = _asarray(R, order='C', xp=xp)
  2266. T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)
  2267. return T
  2268. @lazy_cython
  2269. def leaves_list(Z):
  2270. """
  2271. Return a list of leaf node ids.
  2272. The return corresponds to the observation vector index as it appears
  2273. in the tree from left to right. Z is a linkage matrix.
  2274. Parameters
  2275. ----------
  2276. Z : ndarray
  2277. The hierarchical clustering encoded as a matrix. `Z` is
  2278. a linkage matrix. See `linkage` for more information.
  2279. Returns
  2280. -------
  2281. leaves_list : ndarray
  2282. The list of leaf node ids.
  2283. See Also
  2284. --------
  2285. dendrogram : for information about dendrogram structure.
  2286. Examples
  2287. --------
  2288. >>> from scipy.cluster.hierarchy import ward, dendrogram, leaves_list
  2289. >>> from scipy.spatial.distance import pdist
  2290. >>> from matplotlib import pyplot as plt
  2291. >>> X = [[0, 0], [0, 1], [1, 0],
  2292. ... [0, 4], [0, 3], [1, 4],
  2293. ... [4, 0], [3, 0], [4, 1],
  2294. ... [4, 4], [3, 4], [4, 3]]
  2295. >>> Z = ward(pdist(X))
  2296. The linkage matrix ``Z`` represents a dendrogram, that is, a tree that
  2297. encodes the structure of the clustering performed.
  2298. `scipy.cluster.hierarchy.leaves_list` shows the mapping between
  2299. indices in the ``X`` dataset and leaves in the dendrogram:
  2300. >>> leaves_list(Z)
  2301. array([ 2, 0, 1, 5, 3, 4, 8, 6, 7, 11, 9, 10], dtype=int32)
  2302. >>> fig = plt.figure(figsize=(25, 10))
  2303. >>> dn = dendrogram(Z)
  2304. >>> plt.show()
  2305. """
  2306. xp = array_namespace(Z)
  2307. Z = _asarray(Z, order='C', xp=xp)
  2308. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  2309. def cy_leaves_list(Z, validate):
  2310. if validate:
  2311. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  2312. n = Z.shape[0] + 1
  2313. ML = np.zeros((n,), dtype=np.int32)
  2314. _hierarchy.prelist(Z, ML, n)
  2315. return ML
  2316. n = Z.shape[0] + 1
  2317. return xpx.lazy_apply(cy_leaves_list, Z, validate=is_lazy_array(Z),
  2318. shape=(n, ), dtype=xp.int32,
  2319. as_numpy=True, xp=xp)
  2320. # Maps number of leaves to text size.
  2321. #
  2322. # p <= 20, size="12"
  2323. # 20 < p <= 30, size="10"
  2324. # 30 < p <= 50, size="8"
  2325. # 50 < p <= np.inf, size="6"
  2326. _dtextsizes = {20: 12, 30: 10, 50: 8, 85: 6, np.inf: 5}
  2327. _drotation = {20: 0, 40: 45, np.inf: 90}
  2328. _dtextsortedkeys = list(_dtextsizes.keys())
  2329. _dtextsortedkeys.sort()
  2330. _drotationsortedkeys = list(_drotation.keys())
  2331. _drotationsortedkeys.sort()
  2332. def _remove_dups(L):
  2333. """
  2334. Remove duplicates AND preserve the original order of the elements.
  2335. The set class is not guaranteed to do this.
  2336. """
  2337. seen_before = set()
  2338. L2 = []
  2339. for i in L:
  2340. if i not in seen_before:
  2341. seen_before.add(i)
  2342. L2.append(i)
  2343. return L2
  2344. def _get_tick_text_size(p):
  2345. for k in _dtextsortedkeys:
  2346. if p <= k:
  2347. return _dtextsizes[k]
  2348. def _get_tick_rotation(p):
  2349. for k in _drotationsortedkeys:
  2350. if p <= k:
  2351. return _drotation[k]
  2352. def _plot_dendrogram(icoords, dcoords, ivl, p, n, mh, orientation,
  2353. no_labels, color_list, leaf_font_size=None,
  2354. leaf_rotation=None, contraction_marks=None,
  2355. ax=None, above_threshold_color='C0'):
  2356. # Import matplotlib here so that it's not imported unless dendrograms
  2357. # are plotted. Raise an informative error if importing fails.
  2358. try:
  2359. # if an axis is provided, don't use pylab at all
  2360. if ax is None:
  2361. import matplotlib.pylab
  2362. import matplotlib.patches
  2363. import matplotlib.collections
  2364. except ImportError as e:
  2365. raise ImportError("You must install the matplotlib library to plot "
  2366. "the dendrogram. Use no_plot=True to calculate the "
  2367. "dendrogram without plotting.") from e
  2368. if ax is None:
  2369. ax = matplotlib.pylab.gca()
  2370. # if we're using pylab, we want to trigger a draw at the end
  2371. trigger_redraw = True
  2372. else:
  2373. trigger_redraw = False
  2374. # Independent variable plot width
  2375. ivw = len(ivl) * 10
  2376. # Dependent variable plot height
  2377. dvw = mh + mh * 0.05
  2378. iv_ticks = np.arange(5, len(ivl) * 10 + 5, 10)
  2379. if orientation in ('top', 'bottom'):
  2380. if orientation == 'top':
  2381. ax.set_ylim([0, dvw])
  2382. ax.set_xlim([0, ivw])
  2383. else:
  2384. ax.set_ylim([dvw, 0])
  2385. ax.set_xlim([0, ivw])
  2386. xlines = icoords
  2387. ylines = dcoords
  2388. if no_labels:
  2389. ax.set_xticks([])
  2390. ax.set_xticklabels([])
  2391. else:
  2392. ax.set_xticks(iv_ticks)
  2393. if orientation == 'top':
  2394. ax.xaxis.set_ticks_position('bottom')
  2395. else:
  2396. ax.xaxis.set_ticks_position('top')
  2397. # Make the tick marks invisible because they cover up the links
  2398. for line in ax.get_xticklines():
  2399. line.set_visible(False)
  2400. leaf_rot = (float(_get_tick_rotation(len(ivl)))
  2401. if (leaf_rotation is None) else leaf_rotation)
  2402. leaf_font = (float(_get_tick_text_size(len(ivl)))
  2403. if (leaf_font_size is None) else leaf_font_size)
  2404. ax.set_xticklabels(ivl, rotation=leaf_rot, size=leaf_font)
  2405. elif orientation in ('left', 'right'):
  2406. if orientation == 'left':
  2407. ax.set_xlim([dvw, 0])
  2408. ax.set_ylim([0, ivw])
  2409. else:
  2410. ax.set_xlim([0, dvw])
  2411. ax.set_ylim([0, ivw])
  2412. xlines = dcoords
  2413. ylines = icoords
  2414. if no_labels:
  2415. ax.set_yticks([])
  2416. ax.set_yticklabels([])
  2417. else:
  2418. ax.set_yticks(iv_ticks)
  2419. if orientation == 'left':
  2420. ax.yaxis.set_ticks_position('right')
  2421. else:
  2422. ax.yaxis.set_ticks_position('left')
  2423. # Make the tick marks invisible because they cover up the links
  2424. for line in ax.get_yticklines():
  2425. line.set_visible(False)
  2426. leaf_font = (float(_get_tick_text_size(len(ivl)))
  2427. if (leaf_font_size is None) else leaf_font_size)
  2428. if leaf_rotation is not None:
  2429. ax.set_yticklabels(ivl, rotation=leaf_rotation, size=leaf_font)
  2430. else:
  2431. ax.set_yticklabels(ivl, size=leaf_font)
  2432. # Let's use collections instead. This way there is a separate legend item
  2433. # for each tree grouping, rather than stupidly one for each line segment.
  2434. colors_used = _remove_dups(color_list)
  2435. color_to_lines = {}
  2436. for color in colors_used:
  2437. color_to_lines[color] = []
  2438. for (xline, yline, color) in zip(xlines, ylines, color_list):
  2439. color_to_lines[color].append(list(zip(xline, yline)))
  2440. colors_to_collections = {}
  2441. # Construct the collections.
  2442. for color in colors_used:
  2443. coll = matplotlib.collections.LineCollection(color_to_lines[color],
  2444. colors=(color,))
  2445. colors_to_collections[color] = coll
  2446. # Add all the groupings below the color threshold.
  2447. for color in colors_used:
  2448. if color != above_threshold_color:
  2449. ax.add_collection(colors_to_collections[color])
  2450. # If there's a grouping of links above the color threshold, it goes last.
  2451. if above_threshold_color in colors_to_collections:
  2452. ax.add_collection(colors_to_collections[above_threshold_color])
  2453. if contraction_marks is not None:
  2454. Ellipse = matplotlib.patches.Ellipse
  2455. for (x, y) in contraction_marks:
  2456. if orientation in ('left', 'right'):
  2457. e = Ellipse((y, x), width=dvw / 100, height=1.0)
  2458. else:
  2459. e = Ellipse((x, y), width=1.0, height=dvw / 100)
  2460. ax.add_artist(e)
  2461. e.set_clip_box(ax.bbox)
  2462. e.set_alpha(0.5)
  2463. e.set_facecolor('k')
  2464. if trigger_redraw:
  2465. matplotlib.pylab.draw_if_interactive()
  2466. # C0 is used for above threshold color
  2467. _link_line_colors_default = ('C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9')
  2468. _link_line_colors = list(_link_line_colors_default)
  2469. @xp_capabilities(out_of_scope=True)
  2470. def set_link_color_palette(palette):
  2471. """
  2472. Set list of matplotlib color codes for use by dendrogram.
  2473. Note that this palette is global (i.e., setting it once changes the colors
  2474. for all subsequent calls to `dendrogram`) and that it affects only the
  2475. the colors below ``color_threshold``.
  2476. Note that `dendrogram` also accepts a custom coloring function through its
  2477. ``link_color_func`` keyword, which is more flexible and non-global.
  2478. Parameters
  2479. ----------
  2480. palette : list of str or None
  2481. A list of matplotlib color codes. The order of the color codes is the
  2482. order in which the colors are cycled through when color thresholding in
  2483. the dendrogram.
  2484. If ``None``, resets the palette to its default (which are matplotlib
  2485. default colors C1 to C9).
  2486. Returns
  2487. -------
  2488. None
  2489. See Also
  2490. --------
  2491. dendrogram
  2492. Notes
  2493. -----
  2494. Ability to reset the palette with ``None`` added in SciPy 0.17.0.
  2495. Thread safety: using this function in a multi-threaded fashion may
  2496. result in `dendrogram` producing plots with unexpected colors.
  2497. Examples
  2498. --------
  2499. >>> import numpy as np
  2500. >>> from scipy.cluster import hierarchy
  2501. >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
  2502. ... 400., 754., 564., 138., 219., 869., 669.])
  2503. >>> Z = hierarchy.linkage(ytdist, 'single')
  2504. >>> dn = hierarchy.dendrogram(Z, no_plot=True)
  2505. >>> dn['color_list']
  2506. ['C1', 'C0', 'C0', 'C0', 'C0']
  2507. >>> hierarchy.set_link_color_palette(['c', 'm', 'y', 'k'])
  2508. >>> dn = hierarchy.dendrogram(Z, no_plot=True, above_threshold_color='b')
  2509. >>> dn['color_list']
  2510. ['c', 'b', 'b', 'b', 'b']
  2511. >>> dn = hierarchy.dendrogram(Z, no_plot=True, color_threshold=267,
  2512. ... above_threshold_color='k')
  2513. >>> dn['color_list']
  2514. ['c', 'm', 'm', 'k', 'k']
  2515. Now, reset the color palette to its default:
  2516. >>> hierarchy.set_link_color_palette(None)
  2517. """
  2518. if palette is None:
  2519. # reset to its default
  2520. palette = _link_line_colors_default
  2521. elif not isinstance(palette, list | tuple):
  2522. raise TypeError("palette must be a list or tuple")
  2523. _ptypes = [isinstance(p, str) for p in palette]
  2524. if False in _ptypes:
  2525. raise TypeError("all palette list elements must be color strings")
  2526. global _link_line_colors
  2527. _link_line_colors = palette
  2528. @xp_capabilities(cpu_only=True, jax_jit=False, allow_dask_compute=True)
  2529. def dendrogram(Z, p=30, truncate_mode=None, color_threshold=None,
  2530. get_leaves=True, orientation='top', labels=None,
  2531. count_sort=False, distance_sort=False, show_leaf_counts=True,
  2532. no_plot=False, no_labels=False, leaf_font_size=None,
  2533. leaf_rotation=None, leaf_label_func=None,
  2534. show_contracted=False, link_color_func=None, ax=None,
  2535. above_threshold_color='C0'):
  2536. """
  2537. Plot the hierarchical clustering as a dendrogram.
  2538. The dendrogram illustrates how each cluster is
  2539. composed by drawing a U-shaped link between a non-singleton
  2540. cluster and its children. The top of the U-link indicates a
  2541. cluster merge. The two legs of the U-link indicate which clusters
  2542. were merged. The length of the two legs of the U-link represents
  2543. the distance between the child clusters. It is also the
  2544. cophenetic distance between original observations in the two
  2545. children clusters.
  2546. Parameters
  2547. ----------
  2548. Z : ndarray
  2549. The linkage matrix encoding the hierarchical clustering to
  2550. render as a dendrogram. See the ``linkage`` function for more
  2551. information on the format of ``Z``.
  2552. p : int, optional
  2553. The ``p`` parameter for ``truncate_mode``.
  2554. truncate_mode : str, optional
  2555. The dendrogram can be hard to read when the original
  2556. observation matrix from which the linkage is derived is
  2557. large. Truncation is used to condense the dendrogram. There
  2558. are several modes:
  2559. ``None``
  2560. No truncation is performed (default).
  2561. Note: ``'none'`` is an alias for ``None`` that's kept for
  2562. backward compatibility.
  2563. ``'lastp'``
  2564. The last ``p`` non-singleton clusters formed in the linkage are the
  2565. only non-leaf nodes in the linkage; they correspond to rows
  2566. ``Z[n-p-2:end]`` in ``Z``. All other non-singleton clusters are
  2567. contracted into leaf nodes.
  2568. ``'level'``
  2569. No more than ``p`` levels of the dendrogram tree are displayed.
  2570. A "level" includes all nodes with ``p`` merges from the final merge.
  2571. Note: ``'mtica'`` is an alias for ``'level'`` that's kept for
  2572. backward compatibility.
  2573. color_threshold : double, optional
  2574. For brevity, let :math:`t` be the ``color_threshold``.
  2575. Colors all the descendent links below a cluster node
  2576. :math:`k` the same color if :math:`k` is the first node below
  2577. the cut threshold :math:`t`. All links connecting nodes with
  2578. distances greater than or equal to the threshold are colored
  2579. with de default matplotlib color ``'C0'``. If :math:`t` is less
  2580. than or equal to zero, all nodes are colored ``'C0'``.
  2581. If ``color_threshold`` is None or 'default',
  2582. corresponding with MATLAB(TM) behavior, the threshold is set to
  2583. ``0.7*max(Z[:,2])``.
  2584. get_leaves : bool, optional
  2585. Includes a list ``R['leaves']=H`` in the result
  2586. dictionary. For each :math:`i`, ``H[i] == j``, cluster node
  2587. ``j`` appears in position ``i`` in the left-to-right traversal
  2588. of the leaves, where :math:`j < 2n-1` and :math:`i < n`.
  2589. orientation : str, optional
  2590. The direction to plot the dendrogram, which can be any
  2591. of the following strings:
  2592. ``'top'``
  2593. Plots the root at the top, and plot descendent links going downwards.
  2594. (default).
  2595. ``'bottom'``
  2596. Plots the root at the bottom, and plot descendent links going
  2597. upwards.
  2598. ``'left'``
  2599. Plots the root at the left, and plot descendent links going right.
  2600. ``'right'``
  2601. Plots the root at the right, and plot descendent links going left.
  2602. labels : ndarray, optional
  2603. By default, ``labels`` is None so the index of the original observation
  2604. is used to label the leaf nodes. Otherwise, this is an :math:`n`-sized
  2605. sequence, with ``n == Z.shape[0] + 1``. The ``labels[i]`` value is the
  2606. text to put under the :math:`i` th leaf node only if it corresponds to
  2607. an original observation and not a non-singleton cluster.
  2608. count_sort : str or bool, optional
  2609. For each node n, the order (visually, from left-to-right) n's
  2610. two descendent links are plotted is determined by this
  2611. parameter, which can be any of the following values:
  2612. ``False``
  2613. Nothing is done.
  2614. ``'ascending'`` or ``True``
  2615. The child with the minimum number of original objects in its cluster
  2616. is plotted first.
  2617. ``'descending'``
  2618. The child with the maximum number of original objects in its cluster
  2619. is plotted first.
  2620. Note, ``distance_sort`` and ``count_sort`` cannot both be True.
  2621. distance_sort : str or bool, optional
  2622. For each node n, the order (visually, from left-to-right) n's
  2623. two descendent links are plotted is determined by this
  2624. parameter, which can be any of the following values:
  2625. ``False``
  2626. Nothing is done.
  2627. ``'ascending'`` or ``True``
  2628. The child with the minimum distance between its direct descendents is
  2629. plotted first.
  2630. ``'descending'``
  2631. The child with the maximum distance between its direct descendents is
  2632. plotted first.
  2633. Note ``distance_sort`` and ``count_sort`` cannot both be True.
  2634. show_leaf_counts : bool, optional
  2635. When True, leaf nodes representing :math:`k>1` original
  2636. observation are labeled with the number of observations they
  2637. contain in parentheses.
  2638. no_plot : bool, optional
  2639. When True, the final rendering is not performed. This is
  2640. useful if only the data structures computed for the rendering
  2641. are needed or if matplotlib is not available.
  2642. no_labels : bool, optional
  2643. When True, no labels appear next to the leaf nodes in the
  2644. rendering of the dendrogram.
  2645. leaf_rotation : double, optional
  2646. Specifies the angle (in degrees) to rotate the leaf
  2647. labels. When unspecified, the rotation is based on the number of
  2648. nodes in the dendrogram (default is 0).
  2649. leaf_font_size : int, optional
  2650. Specifies the font size (in points) of the leaf labels. When
  2651. unspecified, the size based on the number of nodes in the
  2652. dendrogram.
  2653. leaf_label_func : lambda or function, optional
  2654. When ``leaf_label_func`` is a callable function, for each
  2655. leaf with cluster index :math:`k < 2n-1`. The function
  2656. is expected to return a string with the label for the
  2657. leaf.
  2658. Indices :math:`k < n` correspond to original observations
  2659. while indices :math:`k \\geq n` correspond to non-singleton
  2660. clusters.
  2661. For example, to label singletons with their node id and
  2662. non-singletons with their id, count, and inconsistency
  2663. coefficient, simply do::
  2664. # First define the leaf label function.
  2665. def llf(id):
  2666. if id < n:
  2667. return str(id)
  2668. else:
  2669. return '[%d %d %1.2f]' % (id, count, R[n-id,3])
  2670. # The text for the leaf nodes is going to be big so force
  2671. # a rotation of 90 degrees.
  2672. dendrogram(Z, leaf_label_func=llf, leaf_rotation=90)
  2673. # leaf_label_func can also be used together with ``truncate_mode``,
  2674. # in which case you will get your leaves labeled after truncation:
  2675. dendrogram(Z, leaf_label_func=llf, leaf_rotation=90,
  2676. truncate_mode='level', p=2)
  2677. show_contracted : bool, optional
  2678. When True the heights of non-singleton nodes contracted
  2679. into a leaf node are plotted as crosses along the link
  2680. connecting that leaf node. This really is only useful when
  2681. truncation is used (see ``truncate_mode`` parameter).
  2682. link_color_func : callable, optional
  2683. If given, `link_color_function` is called with each non-singleton id
  2684. corresponding to each U-shaped link it will paint. The function is
  2685. expected to return the color to paint the link, encoded as a matplotlib
  2686. color string code. For example::
  2687. dendrogram(Z, link_color_func=lambda k: colors[k])
  2688. colors the direct links below each untruncated non-singleton node
  2689. ``k`` using ``colors[k]``.
  2690. ax : matplotlib Axes instance, optional
  2691. If None and `no_plot` is not True, the dendrogram will be plotted
  2692. on the current axes. Otherwise if `no_plot` is not True the
  2693. dendrogram will be plotted on the given ``Axes`` instance. This can be
  2694. useful if the dendrogram is part of a more complex figure.
  2695. above_threshold_color : str, optional
  2696. This matplotlib color string sets the color of the links above the
  2697. color_threshold. The default is ``'C0'``.
  2698. Returns
  2699. -------
  2700. R : dict
  2701. A dictionary of data structures computed to render the
  2702. dendrogram. Its has the following keys:
  2703. ``'color_list'``
  2704. A list of color names. The k'th element represents the color of the
  2705. k'th link.
  2706. ``'icoord'`` and ``'dcoord'``
  2707. Each of them is a list of lists. Let ``icoord = [I1, I2, ..., Ip]``
  2708. where ``Ik = [xk1, xk2, xk3, xk4]`` and ``dcoord = [D1, D2, ..., Dp]``
  2709. where ``Dk = [yk1, yk2, yk3, yk4]``, then the k'th link painted is
  2710. ``(xk1, yk1)`` - ``(xk2, yk2)`` - ``(xk3, yk3)`` - ``(xk4, yk4)``.
  2711. ``'ivl'``
  2712. A list of labels corresponding to the leaf nodes.
  2713. ``'leaves'``
  2714. For each i, ``H[i] == j``, cluster node ``j`` appears in position
  2715. ``i`` in the left-to-right traversal of the leaves, where
  2716. :math:`j < 2n-1` and :math:`i < n`. If ``j`` is less than ``n``, the
  2717. ``i``-th leaf node corresponds to an original observation.
  2718. Otherwise, it corresponds to a non-singleton cluster.
  2719. ``'leaves_color_list'``
  2720. A list of color names. The k'th element represents the color of the
  2721. k'th leaf.
  2722. See Also
  2723. --------
  2724. linkage, set_link_color_palette
  2725. Notes
  2726. -----
  2727. It is expected that the distances in ``Z[:,2]`` be monotonic, otherwise
  2728. crossings appear in the dendrogram.
  2729. Examples
  2730. --------
  2731. >>> import numpy as np
  2732. >>> from scipy.cluster import hierarchy
  2733. >>> import matplotlib.pyplot as plt
  2734. A very basic example:
  2735. >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
  2736. ... 400., 754., 564., 138., 219., 869., 669.])
  2737. >>> Z = hierarchy.linkage(ytdist, 'single')
  2738. >>> plt.figure()
  2739. >>> dn = hierarchy.dendrogram(Z)
  2740. Now, plot in given axes, improve the color scheme and use both vertical and
  2741. horizontal orientations:
  2742. >>> hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
  2743. >>> fig, axes = plt.subplots(1, 2, figsize=(8, 3))
  2744. >>> dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y',
  2745. ... orientation='top')
  2746. >>> dn2 = hierarchy.dendrogram(Z, ax=axes[1],
  2747. ... above_threshold_color='#bcbddc',
  2748. ... orientation='right')
  2749. >>> hierarchy.set_link_color_palette(None) # reset to default after use
  2750. >>> plt.show()
  2751. """
  2752. # This feature was thought about but never implemented (still useful?):
  2753. #
  2754. # ... = dendrogram(..., leaves_order=None)
  2755. #
  2756. # Plots the leaves in the order specified by a vector of
  2757. # original observation indices. If the vector contains duplicates
  2758. # or results in a crossing, an exception will be thrown. Passing
  2759. # None orders leaf nodes based on the order they appear in the
  2760. # pre-order traversal.
  2761. xp = array_namespace(Z)
  2762. Z = _asarray(Z, order='C', xp=xp)
  2763. if orientation not in ["top", "left", "bottom", "right"]:
  2764. raise ValueError("orientation must be one of 'top', 'left', "
  2765. "'bottom', or 'right'")
  2766. if labels is not None:
  2767. try:
  2768. len_labels = len(labels)
  2769. except (TypeError, AttributeError):
  2770. len_labels = labels.shape[0]
  2771. if Z.shape[0] + 1 != len_labels:
  2772. raise ValueError("Dimensions of Z and labels must be consistent.")
  2773. _is_valid_linkage(Z, throw=True, name='Z', materialize=True, xp=xp)
  2774. Zs = Z.shape
  2775. n = Zs[0] + 1
  2776. if isinstance(p, int | float):
  2777. p = int(p)
  2778. else:
  2779. raise TypeError('The second argument must be a number')
  2780. if truncate_mode not in ('lastp', 'mtica', 'level', 'none', None):
  2781. # 'mtica' is kept working for backwards compat.
  2782. raise ValueError('Invalid truncation mode.')
  2783. if truncate_mode == 'lastp':
  2784. if p > n or p == 0:
  2785. p = n
  2786. if truncate_mode == 'mtica':
  2787. # 'mtica' is an alias
  2788. truncate_mode = 'level'
  2789. if truncate_mode == 'level':
  2790. if p <= 0:
  2791. p = np.inf
  2792. if get_leaves:
  2793. lvs = []
  2794. else:
  2795. lvs = None
  2796. icoord_list = []
  2797. dcoord_list = []
  2798. color_list = []
  2799. current_color = [0]
  2800. currently_below_threshold = [False]
  2801. ivl = [] # list of leaves
  2802. if color_threshold is None or (isinstance(color_threshold, str) and
  2803. color_threshold == 'default'):
  2804. color_threshold = xp.max(Z[:, 2]) * 0.7
  2805. R = {'icoord': icoord_list, 'dcoord': dcoord_list, 'ivl': ivl,
  2806. 'leaves': lvs, 'color_list': color_list}
  2807. # Empty list will be filled in _dendrogram_calculate_info
  2808. contraction_marks = [] if show_contracted else None
  2809. _dendrogram_calculate_info(
  2810. Z=Z, p=p,
  2811. truncate_mode=truncate_mode,
  2812. color_threshold=color_threshold,
  2813. get_leaves=get_leaves,
  2814. orientation=orientation,
  2815. labels=labels,
  2816. count_sort=count_sort,
  2817. distance_sort=distance_sort,
  2818. show_leaf_counts=show_leaf_counts,
  2819. i=2*n - 2,
  2820. iv=0.0,
  2821. ivl=ivl,
  2822. n=n,
  2823. icoord_list=icoord_list,
  2824. dcoord_list=dcoord_list,
  2825. lvs=lvs,
  2826. current_color=current_color,
  2827. color_list=color_list,
  2828. currently_below_threshold=currently_below_threshold,
  2829. leaf_label_func=leaf_label_func,
  2830. contraction_marks=contraction_marks,
  2831. link_color_func=link_color_func,
  2832. above_threshold_color=above_threshold_color)
  2833. if not no_plot:
  2834. mh = xp.max(Z[:, 2])
  2835. _plot_dendrogram(icoord_list, dcoord_list, ivl, p, n, mh, orientation,
  2836. no_labels, color_list,
  2837. leaf_font_size=leaf_font_size,
  2838. leaf_rotation=leaf_rotation,
  2839. contraction_marks=contraction_marks,
  2840. ax=ax,
  2841. above_threshold_color=above_threshold_color)
  2842. R["leaves_color_list"] = _get_leaves_color_list(R)
  2843. return R
  2844. def _get_leaves_color_list(R):
  2845. leaves_color_list = [None] * len(R['leaves'])
  2846. for link_x, link_y, link_color in zip(R['icoord'],
  2847. R['dcoord'],
  2848. R['color_list']):
  2849. for (xi, yi) in zip(link_x, link_y):
  2850. if yi == 0.0 and (xi % 5 == 0 and xi % 2 == 1):
  2851. # if yi is 0.0 and xi is divisible by 5 and odd,
  2852. # the point is a leaf
  2853. # xi of leaves are 5, 15, 25, 35, ... (see `iv_ticks`)
  2854. # index of leaves are 0, 1, 2, 3, ... as below
  2855. leaf_index = (int(xi) - 5) // 10
  2856. # each leaf has a same color of its link.
  2857. leaves_color_list[leaf_index] = link_color
  2858. return leaves_color_list
  2859. def _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
  2860. i, labels):
  2861. # If the leaf id structure is not None and is a list then the caller
  2862. # to dendrogram has indicated that cluster id's corresponding to the
  2863. # leaf nodes should be recorded.
  2864. if lvs is not None:
  2865. lvs.append(int(i))
  2866. # If leaf node labels are to be displayed...
  2867. if ivl is not None:
  2868. # If a leaf_label_func has been provided, the label comes from the
  2869. # string returned from the leaf_label_func, which is a function
  2870. # passed to dendrogram.
  2871. if leaf_label_func:
  2872. ivl.append(leaf_label_func(int(i)))
  2873. else:
  2874. # Otherwise, if the dendrogram caller has passed a labels list
  2875. # for the leaf nodes, use it.
  2876. if labels is not None:
  2877. ivl.append(labels[int(i - n)])
  2878. else:
  2879. # Otherwise, use the id as the label for the leaf.x
  2880. ivl.append(str(int(i)))
  2881. def _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
  2882. i, labels, show_leaf_counts):
  2883. # If the leaf id structure is not None and is a list then the caller
  2884. # to dendrogram has indicated that cluster id's corresponding to the
  2885. # leaf nodes should be recorded.
  2886. if lvs is not None:
  2887. lvs.append(int(i))
  2888. if ivl is not None:
  2889. if leaf_label_func:
  2890. ivl.append(leaf_label_func(int(i)))
  2891. else:
  2892. if show_leaf_counts:
  2893. ivl.append("(" + str(np.asarray(Z[i - n, 3], dtype=np.int64)) + ")")
  2894. else:
  2895. ivl.append("")
  2896. def _append_contraction_marks(Z, iv, i, n, contraction_marks, xp):
  2897. _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 0], xp),
  2898. n, contraction_marks, xp)
  2899. _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 1], xp),
  2900. n, contraction_marks, xp)
  2901. def _append_contraction_marks_sub(Z, iv, i, n, contraction_marks, xp):
  2902. if i >= n:
  2903. contraction_marks.append((iv, Z[i - n, 2]))
  2904. _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 0], xp),
  2905. n, contraction_marks, xp)
  2906. _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 1], xp),
  2907. n, contraction_marks, xp)
  2908. def _dendrogram_calculate_info(Z, p, truncate_mode,
  2909. color_threshold=np.inf, get_leaves=True,
  2910. orientation='top', labels=None,
  2911. count_sort=False, distance_sort=False,
  2912. show_leaf_counts=False, i=-1, iv=0.0,
  2913. ivl=None, n=0, icoord_list=None, dcoord_list=None,
  2914. lvs=None, mhr=False,
  2915. current_color=None, color_list=None,
  2916. currently_below_threshold=None,
  2917. leaf_label_func=None, level=0,
  2918. contraction_marks=None,
  2919. link_color_func=None,
  2920. above_threshold_color='C0'):
  2921. """
  2922. Calculate the endpoints of the links as well as the labels for the
  2923. the dendrogram rooted at the node with index i. iv is the independent
  2924. variable value to plot the left-most leaf node below the root node i
  2925. (if orientation='top', this would be the left-most x value where the
  2926. plotting of this root node i and its descendents should begin).
  2927. ivl is a list to store the labels of the leaf nodes. The leaf_label_func
  2928. is called whenever ivl != None, labels == None, and
  2929. leaf_label_func != None. When ivl != None and labels != None, the
  2930. labels list is used only for labeling the leaf nodes. When
  2931. ivl == None, no labels are generated for leaf nodes.
  2932. When get_leaves==True, a list of leaves is built as they are visited
  2933. in the dendrogram.
  2934. Returns a tuple with l being the independent variable coordinate that
  2935. corresponds to the midpoint of cluster to the left of cluster i if
  2936. i is non-singleton, otherwise the independent coordinate of the leaf
  2937. node if i is a leaf node.
  2938. Returns
  2939. -------
  2940. A tuple (left, w, h, md), where:
  2941. * left is the independent variable coordinate of the center of the
  2942. the U of the subtree
  2943. * w is the amount of space used for the subtree (in independent
  2944. variable units)
  2945. * h is the height of the subtree in dependent variable units
  2946. * md is the ``max(Z[*,2]``) for all nodes ``*`` below and including
  2947. the target node.
  2948. """
  2949. xp = array_namespace(Z)
  2950. if n == 0:
  2951. raise ValueError("Invalid singleton cluster count n.")
  2952. if i == -1:
  2953. raise ValueError("Invalid root cluster index i.")
  2954. if truncate_mode == 'lastp':
  2955. # If the node is a leaf node but corresponds to a non-singleton
  2956. # cluster, its label is either the empty string or the number of
  2957. # original observations belonging to cluster i.
  2958. if 2*n - p > i >= n:
  2959. d = Z[i - n, 2]
  2960. _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
  2961. leaf_label_func, i, labels,
  2962. show_leaf_counts)
  2963. if contraction_marks is not None:
  2964. _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks, xp)
  2965. return (iv + 5.0, 10.0, 0.0, d)
  2966. elif i < n:
  2967. _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
  2968. leaf_label_func, i, labels)
  2969. return (iv + 5.0, 10.0, 0.0, 0.0)
  2970. elif truncate_mode == 'level':
  2971. if i > n and level > p:
  2972. d = Z[i - n, 2]
  2973. _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
  2974. leaf_label_func, i, labels,
  2975. show_leaf_counts)
  2976. if contraction_marks is not None:
  2977. _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks, xp)
  2978. return (iv + 5.0, 10.0, 0.0, d)
  2979. elif i < n:
  2980. _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
  2981. leaf_label_func, i, labels)
  2982. return (iv + 5.0, 10.0, 0.0, 0.0)
  2983. # Otherwise, only truncate if we have a leaf node.
  2984. #
  2985. # Only place leaves if they correspond to original observations.
  2986. if i < n:
  2987. _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
  2988. leaf_label_func, i, labels)
  2989. return (iv + 5.0, 10.0, 0.0, 0.0)
  2990. # !!! Otherwise, we don't have a leaf node, so work on plotting a
  2991. # non-leaf node.
  2992. # Actual indices of a and b
  2993. aa = int_floor(Z[i - n, 0], xp)
  2994. ab = int_floor(Z[i - n, 1], xp)
  2995. if aa >= n:
  2996. # The number of singletons below cluster a
  2997. na = Z[aa - n, 3]
  2998. # The distance between a's two direct children.
  2999. da = Z[aa - n, 2]
  3000. else:
  3001. na = 1
  3002. da = 0.0
  3003. if ab >= n:
  3004. nb = Z[ab - n, 3]
  3005. db = Z[ab - n, 2]
  3006. else:
  3007. nb = 1
  3008. db = 0.0
  3009. if count_sort == 'ascending' or count_sort is True:
  3010. # If a has a count greater than b, it and its descendents should
  3011. # be drawn to the right. Otherwise, to the left.
  3012. if na > nb:
  3013. # The cluster index to draw to the left (ua) will be ab
  3014. # and the one to draw to the right (ub) will be aa
  3015. ua = ab
  3016. ub = aa
  3017. else:
  3018. ua = aa
  3019. ub = ab
  3020. elif count_sort == 'descending':
  3021. # If a has a count less than or equal to b, it and its
  3022. # descendents should be drawn to the left. Otherwise, to
  3023. # the right.
  3024. if na > nb:
  3025. ua = aa
  3026. ub = ab
  3027. else:
  3028. ua = ab
  3029. ub = aa
  3030. elif distance_sort == 'ascending' or distance_sort is True:
  3031. # If a has a distance greater than b, it and its descendents should
  3032. # be drawn to the right. Otherwise, to the left.
  3033. if da > db:
  3034. ua = ab
  3035. ub = aa
  3036. else:
  3037. ua = aa
  3038. ub = ab
  3039. elif distance_sort == 'descending':
  3040. # If a has a distance less than or equal to b, it and its
  3041. # descendents should be drawn to the left. Otherwise, to
  3042. # the right.
  3043. if da > db:
  3044. ua = aa
  3045. ub = ab
  3046. else:
  3047. ua = ab
  3048. ub = aa
  3049. else:
  3050. ua = aa
  3051. ub = ab
  3052. # Updated iv variable and the amount of space used.
  3053. (uiva, uwa, uah, uamd) = \
  3054. _dendrogram_calculate_info(
  3055. Z=Z, p=p,
  3056. truncate_mode=truncate_mode,
  3057. color_threshold=color_threshold,
  3058. get_leaves=get_leaves,
  3059. orientation=orientation,
  3060. labels=labels,
  3061. count_sort=count_sort,
  3062. distance_sort=distance_sort,
  3063. show_leaf_counts=show_leaf_counts,
  3064. i=ua, iv=iv, ivl=ivl, n=n,
  3065. icoord_list=icoord_list,
  3066. dcoord_list=dcoord_list, lvs=lvs,
  3067. current_color=current_color,
  3068. color_list=color_list,
  3069. currently_below_threshold=currently_below_threshold,
  3070. leaf_label_func=leaf_label_func,
  3071. level=level + 1, contraction_marks=contraction_marks,
  3072. link_color_func=link_color_func,
  3073. above_threshold_color=above_threshold_color)
  3074. h = Z[i - n, 2]
  3075. if h >= color_threshold or color_threshold <= 0:
  3076. c = above_threshold_color
  3077. if currently_below_threshold[0]:
  3078. current_color[0] = (current_color[0] + 1) % len(_link_line_colors)
  3079. currently_below_threshold[0] = False
  3080. else:
  3081. currently_below_threshold[0] = True
  3082. c = _link_line_colors[current_color[0]]
  3083. (uivb, uwb, ubh, ubmd) = \
  3084. _dendrogram_calculate_info(
  3085. Z=Z, p=p,
  3086. truncate_mode=truncate_mode,
  3087. color_threshold=color_threshold,
  3088. get_leaves=get_leaves,
  3089. orientation=orientation,
  3090. labels=labels,
  3091. count_sort=count_sort,
  3092. distance_sort=distance_sort,
  3093. show_leaf_counts=show_leaf_counts,
  3094. i=ub, iv=iv + uwa, ivl=ivl, n=n,
  3095. icoord_list=icoord_list,
  3096. dcoord_list=dcoord_list, lvs=lvs,
  3097. current_color=current_color,
  3098. color_list=color_list,
  3099. currently_below_threshold=currently_below_threshold,
  3100. leaf_label_func=leaf_label_func,
  3101. level=level + 1, contraction_marks=contraction_marks,
  3102. link_color_func=link_color_func,
  3103. above_threshold_color=above_threshold_color)
  3104. max_dist = max(uamd, ubmd, h)
  3105. icoord_list.append([uiva, uiva, uivb, uivb])
  3106. dcoord_list.append([uah, h, h, ubh])
  3107. if link_color_func is not None:
  3108. v = link_color_func(int(i))
  3109. if not isinstance(v, str):
  3110. raise TypeError("link_color_func must return a matplotlib "
  3111. "color string!")
  3112. color_list.append(v)
  3113. else:
  3114. color_list.append(c)
  3115. return (((uiva + uivb) / 2), uwa + uwb, h, max_dist)
  3116. @xp_capabilities()
  3117. def is_isomorphic(T1, T2):
  3118. """
  3119. Determine if two different cluster assignments are equivalent.
  3120. Parameters
  3121. ----------
  3122. T1 : array_like
  3123. An assignment of singleton cluster ids to flat cluster ids.
  3124. T2 : array_like
  3125. An assignment of singleton cluster ids to flat cluster ids.
  3126. Returns
  3127. -------
  3128. b : bool
  3129. Whether the flat cluster assignments `T1` and `T2` are
  3130. equivalent.
  3131. See Also
  3132. --------
  3133. linkage : for a description of what a linkage matrix is.
  3134. fcluster : for the creation of flat cluster assignments.
  3135. Examples
  3136. --------
  3137. >>> from scipy.cluster.hierarchy import fcluster, is_isomorphic
  3138. >>> from scipy.cluster.hierarchy import single, complete
  3139. >>> from scipy.spatial.distance import pdist
  3140. Two flat cluster assignments can be isomorphic if they represent the same
  3141. cluster assignment, with different labels.
  3142. For example, we can use the `scipy.cluster.hierarchy.single` method
  3143. and flatten the output to four clusters:
  3144. >>> X = [[0, 0], [0, 1], [1, 0],
  3145. ... [0, 4], [0, 3], [1, 4],
  3146. ... [4, 0], [3, 0], [4, 1],
  3147. ... [4, 4], [3, 4], [4, 3]]
  3148. >>> Z = single(pdist(X))
  3149. >>> T = fcluster(Z, 1, criterion='distance')
  3150. >>> T
  3151. array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
  3152. We can then do the same using the
  3153. `scipy.cluster.hierarchy.complete`: method:
  3154. >>> Z = complete(pdist(X))
  3155. >>> T_ = fcluster(Z, 1.5, criterion='distance')
  3156. >>> T_
  3157. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  3158. As we can see, in both cases we obtain four clusters and all the data
  3159. points are distributed in the same way - the only thing that changes
  3160. are the flat cluster labels (3 => 1, 4 =>2, 2 =>3 and 4 =>1), so both
  3161. cluster assignments are isomorphic:
  3162. >>> is_isomorphic(T, T_)
  3163. True
  3164. """
  3165. xp = array_namespace(T1, T2)
  3166. T1 = _asarray(T1, xp=xp)
  3167. T2 = _asarray(T2, xp=xp)
  3168. if T1.ndim != 1:
  3169. raise ValueError('T1 must be one-dimensional.')
  3170. if T2.ndim != 1:
  3171. raise ValueError('T2 must be one-dimensional.')
  3172. if T1.shape != T2.shape:
  3173. raise ValueError('T1 and T2 must have the same number of elements.')
  3174. # For each pair of (i, j) indices, test that
  3175. # T1[i] == T1[j] <--> T2[i] == T2[j]
  3176. # In other words, if the same symbol appears multiple times in T1,
  3177. # then a potentially different symbol must also be repeated in the same
  3178. # positions in T2, and vice versa.
  3179. # O(n*log(n)) algorithm.
  3180. # It is also possible to write a O(n) algorithm on top of unique_all(),
  3181. # but in practice it's much slower even for large clusters.
  3182. idx = xp.argsort(T1)
  3183. T1 = xp.take(T1, idx)
  3184. T2 = xp.take(T2, idx)
  3185. changes1 = T1 != xp.roll(T1, -1)
  3186. changes2 = T2 != xp.roll(T2, -1)
  3187. return xp.all(changes1 == changes2)
  3188. @lazy_cython
  3189. def maxdists(Z):
  3190. """
  3191. Return the maximum distance between any non-singleton cluster.
  3192. Parameters
  3193. ----------
  3194. Z : ndarray
  3195. The hierarchical clustering encoded as a matrix. See
  3196. ``linkage`` for more information.
  3197. Returns
  3198. -------
  3199. maxdists : ndarray
  3200. A ``(n-1)`` sized numpy array of doubles; ``MD[i]`` represents
  3201. the maximum distance between any cluster (including
  3202. singletons) below and including the node with index i. More
  3203. specifically, ``MD[i] = Z[Q(i)-n, 2].max()`` where ``Q(i)`` is the
  3204. set of all node indices below and including node i.
  3205. See Also
  3206. --------
  3207. linkage : for a description of what a linkage matrix is.
  3208. is_monotonic : for testing for monotonicity of a linkage matrix.
  3209. Examples
  3210. --------
  3211. >>> from scipy.cluster.hierarchy import median, maxdists
  3212. >>> from scipy.spatial.distance import pdist
  3213. Given a linkage matrix ``Z``, `scipy.cluster.hierarchy.maxdists`
  3214. computes for each new cluster generated (i.e., for each row of the linkage
  3215. matrix) what is the maximum distance between any two child clusters.
  3216. Due to the nature of hierarchical clustering, in many cases this is going
  3217. to be just the distance between the two child clusters that were merged
  3218. to form the current one - that is, Z[:,2].
  3219. However, for non-monotonic cluster assignments such as
  3220. `scipy.cluster.hierarchy.median` clustering this is not always the
  3221. case: There may be cluster formations were the distance between the two
  3222. clusters merged is smaller than the distance between their children.
  3223. We can see this in an example:
  3224. >>> X = [[0, 0], [0, 1], [1, 0],
  3225. ... [0, 4], [0, 3], [1, 4],
  3226. ... [4, 0], [3, 0], [4, 1],
  3227. ... [4, 4], [3, 4], [4, 3]]
  3228. >>> Z = median(pdist(X))
  3229. >>> Z
  3230. array([[ 0. , 1. , 1. , 2. ],
  3231. [ 3. , 4. , 1. , 2. ],
  3232. [ 9. , 10. , 1. , 2. ],
  3233. [ 6. , 7. , 1. , 2. ],
  3234. [ 2. , 12. , 1.11803399, 3. ],
  3235. [ 5. , 13. , 1.11803399, 3. ],
  3236. [ 8. , 15. , 1.11803399, 3. ],
  3237. [11. , 14. , 1.11803399, 3. ],
  3238. [18. , 19. , 3. , 6. ],
  3239. [16. , 17. , 3.5 , 6. ],
  3240. [20. , 21. , 3.25 , 12. ]])
  3241. >>> maxdists(Z)
  3242. array([1. , 1. , 1. , 1. , 1.11803399,
  3243. 1.11803399, 1.11803399, 1.11803399, 3. , 3.5 ,
  3244. 3.5 ])
  3245. Note that while the distance between the two clusters merged when creating the
  3246. last cluster is 3.25, there are two children (clusters 16 and 17) whose distance
  3247. is larger (3.5). Thus, `scipy.cluster.hierarchy.maxdists` returns 3.5 in
  3248. this case.
  3249. """
  3250. xp = array_namespace(Z)
  3251. Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
  3252. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  3253. def cy_maxdists(Z, validate):
  3254. if validate:
  3255. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  3256. MD = np.zeros((Z.shape[0],))
  3257. _hierarchy.get_max_dist_for_each_cluster(Z, MD, Z.shape[0] + 1)
  3258. return MD
  3259. return xpx.lazy_apply(cy_maxdists, Z, validate=is_lazy_array(Z),
  3260. shape=(Z.shape[0], ), dtype=xp.float64,
  3261. as_numpy=True, xp=xp)
  3262. @lazy_cython
  3263. def maxinconsts(Z, R):
  3264. """
  3265. Return the maximum inconsistency coefficient for each
  3266. non-singleton cluster and its children.
  3267. Parameters
  3268. ----------
  3269. Z : ndarray
  3270. The hierarchical clustering encoded as a matrix. See
  3271. `linkage` for more information.
  3272. R : ndarray
  3273. The inconsistency matrix.
  3274. Returns
  3275. -------
  3276. MI : ndarray
  3277. A monotonic ``(n-1)``-sized numpy array of doubles.
  3278. See Also
  3279. --------
  3280. linkage : for a description of what a linkage matrix is.
  3281. inconsistent : for the creation of a inconsistency matrix.
  3282. Examples
  3283. --------
  3284. >>> from scipy.cluster.hierarchy import median, inconsistent, maxinconsts
  3285. >>> from scipy.spatial.distance import pdist
  3286. Given a data set ``X``, we can apply a clustering method to obtain a
  3287. linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
  3288. be also used to obtain the inconsistency matrix ``R`` associated to
  3289. this clustering process:
  3290. >>> X = [[0, 0], [0, 1], [1, 0],
  3291. ... [0, 4], [0, 3], [1, 4],
  3292. ... [4, 0], [3, 0], [4, 1],
  3293. ... [4, 4], [3, 4], [4, 3]]
  3294. >>> Z = median(pdist(X))
  3295. >>> R = inconsistent(Z)
  3296. >>> Z
  3297. array([[ 0. , 1. , 1. , 2. ],
  3298. [ 3. , 4. , 1. , 2. ],
  3299. [ 9. , 10. , 1. , 2. ],
  3300. [ 6. , 7. , 1. , 2. ],
  3301. [ 2. , 12. , 1.11803399, 3. ],
  3302. [ 5. , 13. , 1.11803399, 3. ],
  3303. [ 8. , 15. , 1.11803399, 3. ],
  3304. [11. , 14. , 1.11803399, 3. ],
  3305. [18. , 19. , 3. , 6. ],
  3306. [16. , 17. , 3.5 , 6. ],
  3307. [20. , 21. , 3.25 , 12. ]])
  3308. >>> R
  3309. array([[1. , 0. , 1. , 0. ],
  3310. [1. , 0. , 1. , 0. ],
  3311. [1. , 0. , 1. , 0. ],
  3312. [1. , 0. , 1. , 0. ],
  3313. [1.05901699, 0.08346263, 2. , 0.70710678],
  3314. [1.05901699, 0.08346263, 2. , 0.70710678],
  3315. [1.05901699, 0.08346263, 2. , 0.70710678],
  3316. [1.05901699, 0.08346263, 2. , 0.70710678],
  3317. [1.74535599, 1.08655358, 3. , 1.15470054],
  3318. [1.91202266, 1.37522872, 3. , 1.15470054],
  3319. [3.25 , 0.25 , 3. , 0. ]])
  3320. Here, `scipy.cluster.hierarchy.maxinconsts` can be used to compute
  3321. the maximum value of the inconsistency statistic (the last column of
  3322. ``R``) for each non-singleton cluster and its children:
  3323. >>> maxinconsts(Z, R)
  3324. array([0. , 0. , 0. , 0. , 0.70710678,
  3325. 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
  3326. 1.15470054])
  3327. """
  3328. xp = array_namespace(Z, R)
  3329. Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
  3330. R = _asarray(R, order='C', dtype=xp.float64, xp=xp)
  3331. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  3332. _is_valid_im(R, throw=True, name='R', xp=xp)
  3333. if Z.shape[0] != R.shape[0]:
  3334. raise ValueError("The inconsistency matrix and linkage matrix each "
  3335. "have a different number of rows.")
  3336. def cy_maxinconsts(Z, R, validate):
  3337. if validate:
  3338. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  3339. _is_valid_im(R, throw=True, name='R', xp=np)
  3340. n = Z.shape[0] + 1
  3341. MI = np.zeros((n - 1,))
  3342. _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MI, n, 3)
  3343. return MI
  3344. return xpx.lazy_apply(cy_maxinconsts, Z, R, validate=is_lazy_array(Z),
  3345. shape=(Z.shape[0], ), dtype=xp.float64,
  3346. as_numpy=True, xp=xp)
  3347. @lazy_cython
  3348. def maxRstat(Z, R, i):
  3349. """
  3350. Return the maximum statistic for each non-singleton cluster and its
  3351. children.
  3352. Parameters
  3353. ----------
  3354. Z : array_like
  3355. The hierarchical clustering encoded as a matrix. See `linkage` for more
  3356. information.
  3357. R : array_like
  3358. The inconsistency matrix.
  3359. i : int
  3360. The column of `R` to use as the statistic.
  3361. Returns
  3362. -------
  3363. MR : ndarray
  3364. Calculates the maximum statistic for the i'th column of the
  3365. inconsistency matrix `R` for each non-singleton cluster
  3366. node. ``MR[j]`` is the maximum over ``R[Q(j)-n, i]``, where
  3367. ``Q(j)`` the set of all node ids corresponding to nodes below
  3368. and including ``j``.
  3369. See Also
  3370. --------
  3371. linkage : for a description of what a linkage matrix is.
  3372. inconsistent : for the creation of a inconsistency matrix.
  3373. Examples
  3374. --------
  3375. >>> from scipy.cluster.hierarchy import median, inconsistent, maxRstat
  3376. >>> from scipy.spatial.distance import pdist
  3377. Given a data set ``X``, we can apply a clustering method to obtain a
  3378. linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
  3379. be also used to obtain the inconsistency matrix ``R`` associated to
  3380. this clustering process:
  3381. >>> X = [[0, 0], [0, 1], [1, 0],
  3382. ... [0, 4], [0, 3], [1, 4],
  3383. ... [4, 0], [3, 0], [4, 1],
  3384. ... [4, 4], [3, 4], [4, 3]]
  3385. >>> Z = median(pdist(X))
  3386. >>> R = inconsistent(Z)
  3387. >>> R
  3388. array([[1. , 0. , 1. , 0. ],
  3389. [1. , 0. , 1. , 0. ],
  3390. [1. , 0. , 1. , 0. ],
  3391. [1. , 0. , 1. , 0. ],
  3392. [1.05901699, 0.08346263, 2. , 0.70710678],
  3393. [1.05901699, 0.08346263, 2. , 0.70710678],
  3394. [1.05901699, 0.08346263, 2. , 0.70710678],
  3395. [1.05901699, 0.08346263, 2. , 0.70710678],
  3396. [1.74535599, 1.08655358, 3. , 1.15470054],
  3397. [1.91202266, 1.37522872, 3. , 1.15470054],
  3398. [3.25 , 0.25 , 3. , 0. ]])
  3399. `scipy.cluster.hierarchy.maxRstat` can be used to compute
  3400. the maximum value of each column of ``R``, for each non-singleton
  3401. cluster and its children:
  3402. >>> maxRstat(Z, R, 0)
  3403. array([1. , 1. , 1. , 1. , 1.05901699,
  3404. 1.05901699, 1.05901699, 1.05901699, 1.74535599, 1.91202266,
  3405. 3.25 ])
  3406. >>> maxRstat(Z, R, 1)
  3407. array([0. , 0. , 0. , 0. , 0.08346263,
  3408. 0.08346263, 0.08346263, 0.08346263, 1.08655358, 1.37522872,
  3409. 1.37522872])
  3410. >>> maxRstat(Z, R, 3)
  3411. array([0. , 0. , 0. , 0. , 0.70710678,
  3412. 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
  3413. 1.15470054])
  3414. """
  3415. xp = array_namespace(Z, R)
  3416. Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
  3417. R = _asarray(R, order='C', dtype=xp.float64, xp=xp)
  3418. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  3419. _is_valid_im(R, throw=True, name='R', xp=xp)
  3420. if not isinstance(i, int):
  3421. raise TypeError('The third argument must be an integer.')
  3422. if i < 0 or i > 3:
  3423. raise ValueError('i must be an integer between 0 and 3 inclusive.')
  3424. if Z.shape[0] != R.shape[0]:
  3425. raise ValueError("The inconsistency matrix and linkage matrix each "
  3426. "have a different number of rows.")
  3427. def cy_maxRstat(Z, R, i, validate):
  3428. if validate:
  3429. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  3430. _is_valid_im(R, throw=True, name='R', xp=np)
  3431. MR = np.zeros((Z.shape[0],))
  3432. n = Z.shape[0] + 1
  3433. _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MR, n, i)
  3434. return MR
  3435. return xpx.lazy_apply(cy_maxRstat, Z, R, i=i, validate=is_lazy_array(Z),
  3436. shape=(Z.shape[0], ), dtype=xp.float64,
  3437. as_numpy=True, xp=xp)
  3438. # Data-dependent output shape makes it impossible to use jax.jit
  3439. @xp_capabilities(cpu_only=True, reason="Cython code", jax_jit=False,
  3440. warnings=[("dask.array", "merges chunks")])
  3441. def leaders(Z, T):
  3442. """
  3443. Return the root nodes in a hierarchical clustering.
  3444. Returns the root nodes in a hierarchical clustering corresponding
  3445. to a cut defined by a flat cluster assignment vector ``T``. See
  3446. the ``fcluster`` function for more information on the format of ``T``.
  3447. For each flat cluster :math:`j` of the :math:`k` flat clusters
  3448. represented in the n-sized flat cluster assignment vector ``T``,
  3449. this function finds the lowest cluster node :math:`i` in the linkage
  3450. tree Z, such that:
  3451. * leaf descendants belong only to flat cluster j
  3452. (i.e., ``T[p]==j`` for all :math:`p` in :math:`S(i)`, where
  3453. :math:`S(i)` is the set of leaf ids of descendant leaf nodes
  3454. with cluster node :math:`i`)
  3455. * there does not exist a leaf that is not a descendant with
  3456. :math:`i` that also belongs to cluster :math:`j`
  3457. (i.e., ``T[q]!=j`` for all :math:`q` not in :math:`S(i)`). If
  3458. this condition is violated, ``T`` is not a valid cluster
  3459. assignment vector, and an exception will be thrown.
  3460. Parameters
  3461. ----------
  3462. Z : ndarray
  3463. The hierarchical clustering encoded as a matrix. See
  3464. `linkage` for more information.
  3465. T : ndarray
  3466. The flat cluster assignment vector.
  3467. Returns
  3468. -------
  3469. L : ndarray
  3470. The leader linkage node id's stored as a k-element 1-D array,
  3471. where ``k`` is the number of flat clusters found in ``T``.
  3472. ``L[j]=i`` is the linkage cluster node id that is the
  3473. leader of flat cluster with id M[j]. If ``i < n``, ``i``
  3474. corresponds to an original observation, otherwise it
  3475. corresponds to a non-singleton cluster.
  3476. M : ndarray
  3477. The leader linkage node id's stored as a k-element 1-D array, where
  3478. ``k`` is the number of flat clusters found in ``T``. This allows the
  3479. set of flat cluster ids to be any arbitrary set of ``k`` integers.
  3480. For example: if ``L[3]=2`` and ``M[3]=8``, the flat cluster with
  3481. id 8's leader is linkage node 2.
  3482. See Also
  3483. --------
  3484. fcluster : for the creation of flat cluster assignments.
  3485. Examples
  3486. --------
  3487. >>> from scipy.cluster.hierarchy import ward, fcluster, leaders
  3488. >>> from scipy.spatial.distance import pdist
  3489. Given a linkage matrix ``Z`` - obtained after apply a clustering method
  3490. to a dataset ``X`` - and a flat cluster assignment array ``T``:
  3491. >>> X = [[0, 0], [0, 1], [1, 0],
  3492. ... [0, 4], [0, 3], [1, 4],
  3493. ... [4, 0], [3, 0], [4, 1],
  3494. ... [4, 4], [3, 4], [4, 3]]
  3495. >>> Z = ward(pdist(X))
  3496. >>> Z
  3497. array([[ 0. , 1. , 1. , 2. ],
  3498. [ 3. , 4. , 1. , 2. ],
  3499. [ 6. , 7. , 1. , 2. ],
  3500. [ 9. , 10. , 1. , 2. ],
  3501. [ 2. , 12. , 1.29099445, 3. ],
  3502. [ 5. , 13. , 1.29099445, 3. ],
  3503. [ 8. , 14. , 1.29099445, 3. ],
  3504. [11. , 15. , 1.29099445, 3. ],
  3505. [16. , 17. , 5.77350269, 6. ],
  3506. [18. , 19. , 5.77350269, 6. ],
  3507. [20. , 21. , 8.16496581, 12. ]])
  3508. >>> T = fcluster(Z, 3, criterion='distance')
  3509. >>> T
  3510. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  3511. `scipy.cluster.hierarchy.leaders` returns the indices of the nodes
  3512. in the dendrogram that are the leaders of each flat cluster:
  3513. >>> L, M = leaders(Z, T)
  3514. >>> L
  3515. array([16, 17, 18, 19], dtype=int32)
  3516. (remember that indices 0-11 point to the 12 data points in ``X``,
  3517. whereas indices 12-22 point to the 11 rows of ``Z``)
  3518. `scipy.cluster.hierarchy.leaders` also returns the indices of
  3519. the flat clusters in ``T``:
  3520. >>> M
  3521. array([1, 2, 3, 4], dtype=int32)
  3522. Notes
  3523. -----
  3524. *Array API support (experimental):* This function returns arrays
  3525. with data-dependent shape. In JAX, at the moment of writing this makes it
  3526. impossible to execute it inside `@jax.jit`.
  3527. """
  3528. xp = array_namespace(Z, T)
  3529. Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
  3530. T = _asarray(T, order='C', xp=xp)
  3531. _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
  3532. if T.dtype != xp.int32:
  3533. raise TypeError('T must be a 1-D array of dtype int32.')
  3534. if T.shape[0] != Z.shape[0] + 1:
  3535. raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.')
  3536. n_obs = Z.shape[0] + 1
  3537. def cy_leaders(Z, T, validate):
  3538. if validate:
  3539. _is_valid_linkage(Z, throw=True, name='Z', xp=np)
  3540. n_clusters = int(xpx.nunique(T))
  3541. L = np.zeros(n_clusters, dtype=np.int32)
  3542. M = np.zeros(n_clusters, dtype=np.int32)
  3543. s = _hierarchy.leaders(Z, T, L, M, n_clusters, n_obs)
  3544. if s >= 0:
  3545. raise ValueError('T is not a valid assignment vector. Error found '
  3546. f'when examining linkage node {s} (< 2n-1).')
  3547. return L, M
  3548. return xpx.lazy_apply(cy_leaders, Z, T, validate=is_lazy_array(Z),
  3549. shape=((None,), (None, )), dtype=(xp.int32, xp.int32),
  3550. as_numpy=True, xp=xp)