| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338 |
- """
- Hierarchical clustering (:mod:`scipy.cluster.hierarchy`)
- ========================================================
- .. currentmodule:: scipy.cluster.hierarchy
- These functions cut hierarchical clusterings into flat clusterings
- or find the roots of the forest formed by a cut by providing the flat
- cluster ids of each observation.
- .. autosummary::
- :toctree: generated/
- fcluster
- fclusterdata
- leaders
- These are routines for agglomerative clustering.
- .. autosummary::
- :toctree: generated/
- linkage
- single
- complete
- average
- weighted
- centroid
- median
- ward
- These routines compute statistics on hierarchies.
- .. autosummary::
- :toctree: generated/
- cophenet
- from_mlab_linkage
- inconsistent
- maxinconsts
- maxdists
- maxRstat
- to_mlab_linkage
- Routines for visualizing flat clusters.
- .. autosummary::
- :toctree: generated/
- dendrogram
- These are data structures and routines for representing hierarchies as
- tree objects.
- .. autosummary::
- :toctree: generated/
- ClusterNode
- leaves_list
- to_tree
- cut_tree
- optimal_leaf_ordering
- These are predicates for checking the validity of linkage and
- inconsistency matrices as well as for checking isomorphism of two
- flat cluster assignments.
- .. autosummary::
- :toctree: generated/
- is_valid_im
- is_valid_linkage
- is_isomorphic
- is_monotonic
- correspond
- num_obs_linkage
- Utility routines for plotting:
- .. autosummary::
- :toctree: generated/
- set_link_color_palette
- Utility classes:
- .. autosummary::
- :toctree: generated/
- DisjointSet -- data structure for incremental connectivity queries
- """
- # Copyright (C) Damian Eads, 2007-2008. New BSD License.
- # hierarchy.py (derived from cluster.py, http://scipy-cluster.googlecode.com)
- #
- # Author: Damian Eads
- # Date: September 22, 2007
- #
- # Copyright (c) 2007, 2008, Damian Eads
- #
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions
- # are met:
- # - Redistributions of source code must retain the above
- # copyright notice, this list of conditions and the
- # following disclaimer.
- # - Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer
- # in the documentation and/or other materials provided with the
- # distribution.
- # - Neither the name of the author nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- import warnings
- import bisect
- from collections import deque
- import numpy as np
- from . import _hierarchy, _optimal_leaf_ordering
- import scipy.spatial.distance as distance
- from scipy._lib._array_api import (_asarray, array_namespace, is_dask,
- is_lazy_array, xp_capabilities, xp_copy)
- from scipy._lib._disjoint_set import DisjointSet
- import scipy._lib.array_api_extra as xpx
- _LINKAGE_METHODS = {'single': 0, 'complete': 1, 'average': 2, 'centroid': 3,
- 'median': 4, 'ward': 5, 'weighted': 6}
- _EUCLIDEAN_METHODS = ('centroid', 'median', 'ward')
- __all__ = ['ClusterNode', 'DisjointSet', 'average', 'centroid', 'complete',
- 'cophenet', 'correspond', 'cut_tree', 'dendrogram', 'fcluster',
- 'fclusterdata', 'from_mlab_linkage', 'inconsistent',
- 'is_isomorphic', 'is_monotonic', 'is_valid_im', 'is_valid_linkage',
- 'leaders', 'leaves_list', 'linkage', 'maxRstat', 'maxdists',
- 'maxinconsts', 'median', 'num_obs_linkage', 'optimal_leaf_ordering',
- 'set_link_color_palette', 'single', 'to_mlab_linkage', 'to_tree',
- 'ward', 'weighted']
- class ClusterWarning(UserWarning):
- pass
- def _warning(s):
- warnings.warn(f'scipy.cluster: {s}', ClusterWarning, stacklevel=3)
- def int_floor(arr, xp):
- # array_api_strict is strict about not allowing `int()` on a float array.
- # That's typically not needed, here it is - so explicitly convert
- return int(xp.asarray(arr, dtype=xp.int64))
- lazy_cython = xp_capabilities(
- cpu_only=True, reason="Cython code",
- warnings=[("dask.array", "merges chunks")])
- @lazy_cython
- def single(y):
- """
- Perform single/min/nearest linkage on the condensed distance matrix ``y``.
- Parameters
- ----------
- y : ndarray
- The upper triangular of the distance matrix. The result of
- ``pdist`` is returned in this form.
- Returns
- -------
- Z : ndarray
- The linkage matrix.
- See Also
- --------
- linkage : for advanced creation of hierarchical clusterings.
- scipy.spatial.distance.pdist : pairwise distance metrics
- Examples
- --------
- >>> from scipy.cluster.hierarchy import single, fcluster
- >>> from scipy.spatial.distance import pdist
- First, we need a toy dataset to play with::
- x x x x
- x x
- x x
- x x x x
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- Then, we get a condensed distance matrix from this dataset:
- >>> y = pdist(X)
- Finally, we can perform the clustering:
- >>> Z = single(y)
- >>> Z
- array([[ 0., 1., 1., 2.],
- [ 2., 12., 1., 3.],
- [ 3., 4., 1., 2.],
- [ 5., 14., 1., 3.],
- [ 6., 7., 1., 2.],
- [ 8., 16., 1., 3.],
- [ 9., 10., 1., 2.],
- [11., 18., 1., 3.],
- [13., 15., 2., 6.],
- [17., 20., 2., 9.],
- [19., 21., 2., 12.]])
- The linkage matrix ``Z`` represents a dendrogram - see
- `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
- contents.
- We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
- each initial point would belong given a distance threshold:
- >>> fcluster(Z, 0.9, criterion='distance')
- array([ 7, 8, 9, 10, 11, 12, 4, 5, 6, 1, 2, 3], dtype=int32)
- >>> fcluster(Z, 1, criterion='distance')
- array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
- >>> fcluster(Z, 2, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
- plot of the dendrogram.
- """
- return linkage(y, method='single', metric='euclidean')
- @lazy_cython
- def complete(y):
- """
- Perform complete/max/farthest point linkage on a condensed distance matrix.
- Parameters
- ----------
- y : ndarray
- The upper triangular of the distance matrix. The result of
- ``pdist`` is returned in this form.
- Returns
- -------
- Z : ndarray
- A linkage matrix containing the hierarchical clustering. See
- the `linkage` function documentation for more information
- on its structure.
- See Also
- --------
- linkage : for advanced creation of hierarchical clusterings.
- scipy.spatial.distance.pdist : pairwise distance metrics
- Examples
- --------
- >>> from scipy.cluster.hierarchy import complete, fcluster
- >>> from scipy.spatial.distance import pdist
- First, we need a toy dataset to play with::
- x x x x
- x x
- x x
- x x x x
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- Then, we get a condensed distance matrix from this dataset:
- >>> y = pdist(X)
- Finally, we can perform the clustering:
- >>> Z = complete(y)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.41421356, 3. ],
- [ 5. , 13. , 1.41421356, 3. ],
- [ 8. , 14. , 1.41421356, 3. ],
- [11. , 15. , 1.41421356, 3. ],
- [16. , 17. , 4.12310563, 6. ],
- [18. , 19. , 4.12310563, 6. ],
- [20. , 21. , 5.65685425, 12. ]])
- The linkage matrix ``Z`` represents a dendrogram - see
- `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
- contents.
- We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
- each initial point would belong given a distance threshold:
- >>> fcluster(Z, 0.9, criterion='distance')
- array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
- >>> fcluster(Z, 1.5, criterion='distance')
- array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
- >>> fcluster(Z, 4.5, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)
- >>> fcluster(Z, 6, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
- plot of the dendrogram.
- """
- return linkage(y, method='complete', metric='euclidean')
- @lazy_cython
- def average(y):
- """
- Perform average/UPGMA linkage on a condensed distance matrix.
- Parameters
- ----------
- y : ndarray
- The upper triangular of the distance matrix. The result of
- ``pdist`` is returned in this form.
- Returns
- -------
- Z : ndarray
- A linkage matrix containing the hierarchical clustering. See
- `linkage` for more information on its structure.
- See Also
- --------
- linkage : for advanced creation of hierarchical clusterings.
- scipy.spatial.distance.pdist : pairwise distance metrics
- Examples
- --------
- >>> from scipy.cluster.hierarchy import average, fcluster
- >>> from scipy.spatial.distance import pdist
- First, we need a toy dataset to play with::
- x x x x
- x x
- x x
- x x x x
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- Then, we get a condensed distance matrix from this dataset:
- >>> y = pdist(X)
- Finally, we can perform the clustering:
- >>> Z = average(y)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.20710678, 3. ],
- [ 5. , 13. , 1.20710678, 3. ],
- [ 8. , 14. , 1.20710678, 3. ],
- [11. , 15. , 1.20710678, 3. ],
- [16. , 17. , 3.39675184, 6. ],
- [18. , 19. , 3.39675184, 6. ],
- [20. , 21. , 4.09206523, 12. ]])
- The linkage matrix ``Z`` represents a dendrogram - see
- `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
- contents.
- We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
- each initial point would belong given a distance threshold:
- >>> fcluster(Z, 0.9, criterion='distance')
- array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
- >>> fcluster(Z, 1.5, criterion='distance')
- array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
- >>> fcluster(Z, 4, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)
- >>> fcluster(Z, 6, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
- plot of the dendrogram.
- """
- return linkage(y, method='average', metric='euclidean')
- @lazy_cython
- def weighted(y):
- """
- Perform weighted/WPGMA linkage on the condensed distance matrix.
- See `linkage` for more information on the return
- structure and algorithm.
- Parameters
- ----------
- y : ndarray
- The upper triangular of the distance matrix. The result of
- ``pdist`` is returned in this form.
- Returns
- -------
- Z : ndarray
- A linkage matrix containing the hierarchical clustering. See
- `linkage` for more information on its structure.
- See Also
- --------
- linkage : for advanced creation of hierarchical clusterings.
- scipy.spatial.distance.pdist : pairwise distance metrics
- Examples
- --------
- >>> from scipy.cluster.hierarchy import weighted, fcluster
- >>> from scipy.spatial.distance import pdist
- First, we need a toy dataset to play with::
- x x x x
- x x
- x x
- x x x x
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- Then, we get a condensed distance matrix from this dataset:
- >>> y = pdist(X)
- Finally, we can perform the clustering:
- >>> Z = weighted(y)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 9. , 11. , 1. , 2. ],
- [ 2. , 12. , 1.20710678, 3. ],
- [ 8. , 13. , 1.20710678, 3. ],
- [ 5. , 14. , 1.20710678, 3. ],
- [10. , 15. , 1.20710678, 3. ],
- [18. , 19. , 3.05595762, 6. ],
- [16. , 17. , 3.32379407, 6. ],
- [20. , 21. , 4.06357713, 12. ]])
- The linkage matrix ``Z`` represents a dendrogram - see
- `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
- contents.
- We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
- each initial point would belong given a distance threshold:
- >>> fcluster(Z, 0.9, criterion='distance')
- array([ 7, 8, 9, 1, 2, 3, 10, 11, 12, 4, 6, 5], dtype=int32)
- >>> fcluster(Z, 1.5, criterion='distance')
- array([3, 3, 3, 1, 1, 1, 4, 4, 4, 2, 2, 2], dtype=int32)
- >>> fcluster(Z, 4, criterion='distance')
- array([2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1], dtype=int32)
- >>> fcluster(Z, 6, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
- plot of the dendrogram.
- """
- return linkage(y, method='weighted', metric='euclidean')
- @lazy_cython
- def centroid(y):
- """
- Perform centroid/UPGMC linkage.
- See `linkage` for more information on the input matrix,
- return structure, and algorithm.
- The following are common calling conventions:
- 1. ``Z = centroid(y)``
- Performs centroid/UPGMC linkage on the condensed distance
- matrix ``y``.
- 2. ``Z = centroid(X)``
- Performs centroid/UPGMC linkage on the observation matrix ``X``
- using Euclidean distance as the distance metric.
- Parameters
- ----------
- y : ndarray
- A condensed distance matrix. A condensed
- distance matrix is a flat array containing the upper
- triangular of the distance matrix. This is the form that
- ``pdist`` returns. Alternatively, a collection of
- m observation vectors in n dimensions may be passed as
- an m by n array.
- Returns
- -------
- Z : ndarray
- A linkage matrix containing the hierarchical clustering. See
- the `linkage` function documentation for more information
- on its structure.
- See Also
- --------
- linkage : for advanced creation of hierarchical clusterings.
- scipy.spatial.distance.pdist : pairwise distance metrics
- Examples
- --------
- >>> from scipy.cluster.hierarchy import centroid, fcluster
- >>> from scipy.spatial.distance import pdist
- First, we need a toy dataset to play with::
- x x x x
- x x
- x x
- x x x x
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- Then, we get a condensed distance matrix from this dataset:
- >>> y = pdist(X)
- Finally, we can perform the clustering:
- >>> Z = centroid(y)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 2. , 12. , 1.11803399, 3. ],
- [ 5. , 13. , 1.11803399, 3. ],
- [ 8. , 15. , 1.11803399, 3. ],
- [11. , 14. , 1.11803399, 3. ],
- [18. , 19. , 3.33333333, 6. ],
- [16. , 17. , 3.33333333, 6. ],
- [20. , 21. , 3.33333333, 12. ]]) # may vary
- The linkage matrix ``Z`` represents a dendrogram - see
- `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
- contents.
- We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
- each initial point would belong given a distance threshold:
- >>> fcluster(Z, 0.9, criterion='distance')
- array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32) # may vary
- >>> fcluster(Z, 1.1, criterion='distance')
- array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32) # may vary
- >>> fcluster(Z, 2, criterion='distance')
- array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32) # may vary
- >>> fcluster(Z, 4, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
- plot of the dendrogram.
- """
- return linkage(y, method='centroid', metric='euclidean')
- @lazy_cython
- def median(y):
- """
- Perform median/WPGMC linkage.
- See `linkage` for more information on the return structure
- and algorithm.
- The following are common calling conventions:
- 1. ``Z = median(y)``
- Performs median/WPGMC linkage on the condensed distance matrix
- ``y``. See ``linkage`` for more information on the return
- structure and algorithm.
- 2. ``Z = median(X)``
- Performs median/WPGMC linkage on the observation matrix ``X``
- using Euclidean distance as the distance metric. See `linkage`
- for more information on the return structure and algorithm.
- Parameters
- ----------
- y : ndarray
- A condensed distance matrix. A condensed
- distance matrix is a flat array containing the upper
- triangular of the distance matrix. This is the form that
- ``pdist`` returns. Alternatively, a collection of
- m observation vectors in n dimensions may be passed as
- an m by n array.
- Returns
- -------
- Z : ndarray
- The hierarchical clustering encoded as a linkage matrix.
- See Also
- --------
- linkage : for advanced creation of hierarchical clusterings.
- scipy.spatial.distance.pdist : pairwise distance metrics
- Examples
- --------
- >>> from scipy.cluster.hierarchy import median, fcluster
- >>> from scipy.spatial.distance import pdist
- First, we need a toy dataset to play with::
- x x x x
- x x
- x x
- x x x x
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- Then, we get a condensed distance matrix from this dataset:
- >>> y = pdist(X)
- Finally, we can perform the clustering:
- >>> Z = median(y)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 2. , 12. , 1.11803399, 3. ],
- [ 5. , 13. , 1.11803399, 3. ],
- [ 8. , 15. , 1.11803399, 3. ],
- [11. , 14. , 1.11803399, 3. ],
- [18. , 19. , 3. , 6. ],
- [16. , 17. , 3.5 , 6. ],
- [20. , 21. , 3.25 , 12. ]])
- The linkage matrix ``Z`` represents a dendrogram - see
- `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
- contents.
- We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
- each initial point would belong given a distance threshold:
- >>> fcluster(Z, 0.9, criterion='distance')
- array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32)
- >>> fcluster(Z, 1.1, criterion='distance')
- array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32)
- >>> fcluster(Z, 2, criterion='distance')
- array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32)
- >>> fcluster(Z, 4, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
- plot of the dendrogram.
- """
- return linkage(y, method='median', metric='euclidean')
- @lazy_cython
- def ward(y):
- """
- Perform Ward's linkage on a condensed distance matrix.
- See `linkage` for more information on the return structure
- and algorithm.
- The following are common calling conventions:
- 1. ``Z = ward(y)``
- Performs Ward's linkage on the condensed distance matrix ``y``.
- 2. ``Z = ward(X)``
- Performs Ward's linkage on the observation matrix ``X`` using
- Euclidean distance as the distance metric.
- Parameters
- ----------
- y : ndarray
- A condensed distance matrix. A condensed
- distance matrix is a flat array containing the upper
- triangular of the distance matrix. This is the form that
- ``pdist`` returns. Alternatively, a collection of
- m observation vectors in n dimensions may be passed as
- an m by n array.
- Returns
- -------
- Z : ndarray
- The hierarchical clustering encoded as a linkage matrix. See
- `linkage` for more information on the return structure and
- algorithm.
- See Also
- --------
- linkage : for advanced creation of hierarchical clusterings.
- scipy.spatial.distance.pdist : pairwise distance metrics
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, fcluster
- >>> from scipy.spatial.distance import pdist
- First, we need a toy dataset to play with::
- x x x x
- x x
- x x
- x x x x
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- Then, we get a condensed distance matrix from this dataset:
- >>> y = pdist(X)
- Finally, we can perform the clustering:
- >>> Z = ward(y)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [11. , 15. , 1.29099445, 3. ],
- [16. , 17. , 5.77350269, 6. ],
- [18. , 19. , 5.77350269, 6. ],
- [20. , 21. , 8.16496581, 12. ]])
- The linkage matrix ``Z`` represents a dendrogram - see
- `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
- contents.
- We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
- each initial point would belong given a distance threshold:
- >>> fcluster(Z, 0.9, criterion='distance')
- array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
- >>> fcluster(Z, 1.1, criterion='distance')
- array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32)
- >>> fcluster(Z, 3, criterion='distance')
- array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
- >>> fcluster(Z, 9, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a
- plot of the dendrogram.
- """
- return linkage(y, method='ward', metric='euclidean')
- @lazy_cython
- def linkage(y, method='single', metric='euclidean', optimal_ordering=False):
- """
- Perform hierarchical/agglomerative clustering.
- The input y may be either a 1-D condensed distance matrix
- or a 2-D array of observation vectors.
- If y is a 1-D condensed distance matrix,
- then y must be a :math:`\\binom{n}{2}` sized
- vector, where n is the number of original observations paired
- in the distance matrix. The behavior of this function is very
- similar to the MATLAB linkage function.
- A :math:`(n-1)` by 4 matrix ``Z`` is returned. At the
- :math:`i`-th iteration, clusters with indices ``Z[i, 0]`` and
- ``Z[i, 1]`` are combined to form cluster :math:`n + i`. A
- cluster with an index less than :math:`n` corresponds to one of
- the :math:`n` original observations. The distance between
- clusters ``Z[i, 0]`` and ``Z[i, 1]`` is given by ``Z[i, 2]``. The
- fourth value ``Z[i, 3]`` represents the number of original
- observations in the newly formed cluster.
- The following linkage methods are used to compute the distance
- :math:`d(s, t)` between two clusters :math:`s` and
- :math:`t`. The algorithm begins with a forest of clusters that
- have yet to be used in the hierarchy being formed. When two
- clusters :math:`s` and :math:`t` from this forest are combined
- into a single cluster :math:`u`, :math:`s` and :math:`t` are
- removed from the forest, and :math:`u` is added to the
- forest. When only one cluster remains in the forest, the algorithm
- stops, and this cluster becomes the root.
- A distance matrix is maintained at each iteration. The ``d[i,j]``
- entry corresponds to the distance between cluster :math:`i` and
- :math:`j` in the original forest.
- At each iteration, the algorithm must update the distance matrix
- to reflect the distance of the newly formed cluster u with the
- remaining clusters in the forest.
- Suppose there are :math:`|u|` original observations
- :math:`u[0], \\ldots, u[|u|-1]` in cluster :math:`u` and
- :math:`|v|` original objects :math:`v[0], \\ldots, v[|v|-1]` in
- cluster :math:`v`. Recall, :math:`s` and :math:`t` are
- combined to form cluster :math:`u`. Let :math:`v` be any
- remaining cluster in the forest that is not :math:`u`.
- The following are methods for calculating the distance between the
- newly formed cluster :math:`u` and each :math:`v`.
- * method='single' assigns
- .. math::
- d(u,v) = \\min(dist(u[i],v[j]))
- for all points :math:`i` in cluster :math:`u` and
- :math:`j` in cluster :math:`v`. This is also known as the
- Nearest Point Algorithm.
- * method='complete' assigns
- .. math::
- d(u, v) = \\max(dist(u[i],v[j]))
- for all points :math:`i` in cluster u and :math:`j` in
- cluster :math:`v`. This is also known by the Farthest Point
- Algorithm or Voor Hees Algorithm.
- * method='average' assigns
- .. math::
- d(u,v) = \\sum_{ij} \\frac{d(u[i], v[j])}
- {(|u|*|v|)}
- for all points :math:`i` and :math:`j` where :math:`|u|`
- and :math:`|v|` are the cardinalities of clusters :math:`u`
- and :math:`v`, respectively. This is also called the UPGMA
- algorithm.
- * method='weighted' assigns
- .. math::
- d(u,v) = (dist(s,v) + dist(t,v))/2
- where cluster u was formed with cluster s and t and v
- is a remaining cluster in the forest (also called WPGMA).
- * method='centroid' assigns
- .. math::
- dist(s,t) = ||c_s-c_t||_2
- where :math:`c_s` and :math:`c_t` are the centroids of
- clusters :math:`s` and :math:`t`, respectively. When two
- clusters :math:`s` and :math:`t` are combined into a new
- cluster :math:`u`, the new centroid is computed over all the
- original objects in clusters :math:`s` and :math:`t`. The
- distance then becomes the Euclidean distance between the
- centroid of :math:`u` and the centroid of a remaining cluster
- :math:`v` in the forest. This is also known as the UPGMC
- algorithm.
- * method='median' assigns :math:`d(s,t)` like the ``centroid``
- method. When two clusters :math:`s` and :math:`t` are combined
- into a new cluster :math:`u`, the average of centroids s and t
- give the new centroid :math:`u`. This is also known as the
- WPGMC algorithm.
- * method='ward' uses the Ward variance minimization algorithm.
- The new entry :math:`d(u,v)` is computed as follows,
- .. math::
- d(u,v) = \\sqrt{\\frac{|v|+|s|}
- {T}d(v,s)^2
- + \\frac{|v|+|t|}
- {T}d(v,t)^2
- - \\frac{|v|}
- {T}d(s,t)^2}
- where :math:`u` is the newly joined cluster consisting of
- clusters :math:`s` and :math:`t`, :math:`v` is an unused
- cluster in the forest, :math:`T=|v|+|s|+|t|`, and
- :math:`|*|` is the cardinality of its argument. This is also
- known as the incremental algorithm.
- Warning: When the minimum distance pair in the forest is chosen, there
- may be two or more pairs with the same minimum distance. This
- implementation may choose a different minimum than the MATLAB
- version.
- Parameters
- ----------
- y : ndarray
- A condensed distance matrix. A condensed distance matrix
- is a flat array containing the upper triangular of the distance matrix.
- This is the form that ``pdist`` returns. Alternatively, a collection of
- :math:`m` observation vectors in :math:`n` dimensions may be passed as
- an :math:`m` by :math:`n` array. All elements of the condensed distance
- matrix must be finite, i.e., no NaNs or infs.
- method : str, optional
- The linkage algorithm to use. See the ``Linkage Methods`` section below
- for full descriptions.
- metric : str or function, optional
- The distance metric to use in the case that y is a collection of
- observation vectors; ignored otherwise. See the ``pdist``
- function for a list of valid distance metrics. A custom distance
- function can also be used.
- optimal_ordering : bool, optional
- If True, the linkage matrix will be reordered so that the distance
- between successive leaves is minimal. This results in a more intuitive
- tree structure when the data are visualized. defaults to False, because
- this algorithm can be slow, particularly on large datasets [2]_. See
- also the `optimal_leaf_ordering` function.
- .. versionadded:: 1.0.0
- Returns
- -------
- Z : ndarray
- The hierarchical clustering encoded as a linkage matrix.
- Notes
- -----
- 1. For method 'single', an optimized algorithm based on minimum spanning
- tree is implemented. It has time complexity :math:`O(n^2)`.
- For methods 'complete', 'average', 'weighted' and 'ward', an algorithm
- called nearest-neighbors chain is implemented. It also has time
- complexity :math:`O(n^2)`.
- For other methods, a naive algorithm is implemented with :math:`O(n^3)`
- time complexity.
- All algorithms use :math:`O(n^2)` memory.
- Refer to [1]_ for details about the algorithms.
- 2. Methods 'centroid', 'median', and 'ward' are correctly defined only if
- Euclidean pairwise metric is used. If `y` is passed as precomputed
- pairwise distances, then it is the user's responsibility to assure that
- these distances are in fact Euclidean, otherwise the produced result
- will be incorrect.
- See Also
- --------
- scipy.spatial.distance.pdist : pairwise distance metrics
- References
- ----------
- .. [1] Daniel Mullner, "Modern hierarchical, agglomerative clustering
- algorithms", :arXiv:`1109.2378v1`.
- .. [2] Ziv Bar-Joseph, David K. Gifford, Tommi S. Jaakkola, "Fast optimal
- leaf ordering for hierarchical clustering", 2001. Bioinformatics
- :doi:`10.1093/bioinformatics/17.suppl_1.S22`
- Examples
- --------
- >>> from scipy.cluster.hierarchy import dendrogram, linkage
- >>> from matplotlib import pyplot as plt
- >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
- >>> Z = linkage(X, 'ward')
- >>> fig = plt.figure(figsize=(25, 10))
- >>> dn = dendrogram(Z)
- >>> Z = linkage(X, 'single')
- >>> fig = plt.figure(figsize=(25, 10))
- >>> dn = dendrogram(Z)
- >>> plt.show()
- """
- xp = array_namespace(y)
- y = _asarray(y, order='C', dtype=xp.float64, xp=xp)
- lazy = is_lazy_array(y)
- if method not in _LINKAGE_METHODS:
- raise ValueError(f"Invalid method: {method}")
- if method in _EUCLIDEAN_METHODS and metric != 'euclidean' and y.ndim == 2:
- msg = f"`method={method}` requires the distance metric to be Euclidean"
- raise ValueError(msg)
- if y.ndim == 1:
- distance.is_valid_y(y, throw=True, name='y')
- elif y.ndim == 2:
- if (not lazy and y.shape[0] == y.shape[1]
- and xp.all(xpx.isclose(xp.linalg.diagonal(y), 0))
- and xp.all(y >= 0) and xp.all(xpx.isclose(y, y.T))):
- warnings.warn('The symmetric non-negative hollow observation '
- 'matrix looks suspiciously like an uncondensed '
- 'distance matrix',
- ClusterWarning, stacklevel=2)
- y = distance.pdist(y, metric)
- else:
- raise ValueError("`y` must be 1 or 2 dimensional.")
- if not lazy and not xp.all(xp.isfinite(y)):
- raise ValueError("The condensed distance matrix must contain only "
- "finite values.")
- n = distance.num_obs_y(y)
- method_code = _LINKAGE_METHODS[method]
- def cy_linkage(y, validate):
- if validate and not np.all(np.isfinite(y)):
- raise ValueError("The condensed distance matrix must contain only "
- "finite values.")
- if method == 'single':
- return _hierarchy.mst_single_linkage(y, n)
- elif method in ('complete', 'average', 'weighted', 'ward'):
- return _hierarchy.nn_chain(y, n, method_code)
- else:
- return _hierarchy.fast_linkage(y, n, method_code)
- result = xpx.lazy_apply(cy_linkage, y, validate=lazy,
- shape=(n - 1, 4), dtype=xp.float64,
- as_numpy=True, xp=xp)
- if optimal_ordering:
- return optimal_leaf_ordering(result, y)
- else:
- return result
- class ClusterNode:
- """
- A tree node class for representing a cluster.
- Leaf nodes correspond to original observations, while non-leaf nodes
- correspond to non-singleton clusters.
- The `to_tree` function converts a matrix returned by the linkage
- function into an easy-to-use tree representation.
- All parameter names are also attributes.
- Parameters
- ----------
- id : int
- The node id.
- left : ClusterNode instance, optional
- The left child tree node.
- right : ClusterNode instance, optional
- The right child tree node.
- dist : float, optional
- Distance for this cluster in the linkage matrix.
- count : int, optional
- The number of samples in this cluster.
- See Also
- --------
- to_tree : for converting a linkage matrix ``Z`` into a tree object.
- """
- def __init__(self, id, left=None, right=None, dist=0.0, count=1):
- if id < 0:
- raise ValueError('The id must be non-negative.')
- if dist < 0:
- raise ValueError('The distance must be non-negative.')
- if (left is None and right is not None) or \
- (left is not None and right is None):
- raise ValueError('Only full or proper binary trees are permitted.'
- ' This node has one child.')
- if count < 1:
- raise ValueError('A cluster must contain at least one original '
- 'observation.')
- self.id = id
- self.left = left
- self.right = right
- self.dist = dist
- if self.left is None:
- self.count = count
- else:
- self.count = left.count + right.count
- def __lt__(self, node):
- if not isinstance(node, ClusterNode):
- raise ValueError("Can't compare ClusterNode "
- f"to type {type(node)}")
- return self.dist < node.dist
- def __gt__(self, node):
- if not isinstance(node, ClusterNode):
- raise ValueError("Can't compare ClusterNode "
- f"to type {type(node)}")
- return self.dist > node.dist
- def __eq__(self, node):
- if not isinstance(node, ClusterNode):
- raise ValueError("Can't compare ClusterNode "
- f"to type {type(node)}")
- return self.dist == node.dist
- def get_id(self):
- """
- The identifier of the target node.
- For ``0 <= i < n``, `i` corresponds to original observation i.
- For ``n <= i < 2n-1``, `i` corresponds to non-singleton cluster formed
- at iteration ``i-n``.
- Returns
- -------
- id : int
- The identifier of the target node.
- """
- return self.id
- def get_count(self):
- """
- The number of leaf nodes (original observations) belonging to
- the cluster node nd. If the target node is a leaf, 1 is
- returned.
- Returns
- -------
- get_count : int
- The number of leaf nodes below the target node.
- """
- return self.count
- def get_left(self):
- """
- Return a reference to the left child tree object.
- Returns
- -------
- left : ClusterNode
- The left child of the target node. If the node is a leaf,
- None is returned.
- """
- return self.left
- def get_right(self):
- """
- Return a reference to the right child tree object.
- Returns
- -------
- right : ClusterNode
- The left child of the target node. If the node is a leaf,
- None is returned.
- """
- return self.right
- def is_leaf(self):
- """
- Return True if the target node is a leaf.
- Returns
- -------
- leafness : bool
- True if the target node is a leaf node.
- """
- return self.left is None
- def pre_order(self, func=(lambda x: x.id)):
- """
- Perform pre-order traversal without recursive function calls.
- When a leaf node is first encountered, ``func`` is called with
- the leaf node as its argument, and its result is appended to
- the list.
- For example, the statement::
- ids = root.pre_order(lambda x: x.id)
- returns a list of the node ids corresponding to the leaf nodes
- of the tree as they appear from left to right.
- Parameters
- ----------
- func : function
- Applied to each leaf ClusterNode object in the pre-order traversal.
- Given the ``i``-th leaf node in the pre-order traversal ``n[i]``,
- the result of ``func(n[i])`` is stored in ``L[i]``. If not
- provided, the index of the original observation to which the node
- corresponds is used.
- Returns
- -------
- L : list
- The pre-order traversal.
- """
- # Do a preorder traversal, caching the result. To avoid having to do
- # recursion, we'll store the previous index we've visited in a vector.
- n = self.count
- curNode = [None] * (2 * n)
- lvisited = set()
- rvisited = set()
- curNode[0] = self
- k = 0
- preorder = []
- while k >= 0:
- nd = curNode[k]
- ndid = nd.id
- if nd.is_leaf():
- preorder.append(func(nd))
- k = k - 1
- else:
- if ndid not in lvisited:
- curNode[k + 1] = nd.left
- lvisited.add(ndid)
- k = k + 1
- elif ndid not in rvisited:
- curNode[k + 1] = nd.right
- rvisited.add(ndid)
- k = k + 1
- # If we've visited the left and right of this non-leaf
- # node already, go up in the tree.
- else:
- k = k - 1
- return preorder
- _cnode_bare = ClusterNode(0)
- _cnode_type = type(ClusterNode)
- def _order_cluster_tree(Z):
- """
- Return clustering nodes in bottom-up order by distance.
- Parameters
- ----------
- Z : scipy.cluster.linkage array
- The linkage matrix.
- Returns
- -------
- nodes : list
- A list of ClusterNode objects.
- """
- q = deque()
- tree = to_tree(Z)
- q.append(tree)
- nodes = []
- while q:
- node = q.popleft()
- if not node.is_leaf():
- bisect.insort_left(nodes, node)
- q.append(node.get_right())
- q.append(node.get_left())
- return nodes
- @xp_capabilities(np_only=True, reason="non-standard indexing")
- def cut_tree(Z, n_clusters=None, height=None):
- """
- Given a linkage matrix Z, return the cut tree.
- Parameters
- ----------
- Z : scipy.cluster.linkage array
- The linkage matrix.
- n_clusters : array_like, optional
- Number of clusters in the tree at the cut point.
- height : array_like, optional
- The height at which to cut the tree. Only possible for ultrametric
- trees.
- Returns
- -------
- cutree : array
- An array indicating group membership at each agglomeration step. I.e.,
- for a full cut tree, in the first column each data point is in its own
- cluster. At the next step, two nodes are merged. Finally, all
- singleton and non-singleton clusters are in one group. If `n_clusters`
- or `height` are given, the columns correspond to the columns of
- `n_clusters` or `height`.
- Examples
- --------
- >>> from scipy import cluster
- >>> import numpy as np
- >>> from numpy.random import default_rng
- >>> rng = default_rng()
- >>> X = rng.random((50, 4))
- >>> Z = cluster.hierarchy.ward(X)
- >>> cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])
- >>> cutree[:10]
- array([[0, 0],
- [1, 1],
- [2, 2],
- [3, 3],
- [3, 4],
- [2, 2],
- [0, 0],
- [1, 5],
- [3, 6],
- [4, 7]]) # random
- """
- xp = array_namespace(Z)
- nobs = num_obs_linkage(Z)
- nodes = _order_cluster_tree(Z)
- if height is not None and n_clusters is not None:
- raise ValueError("At least one of either height or n_clusters "
- "must be None")
- elif height is None and n_clusters is None: # return the full cut tree
- cols_idx = xp.arange(nobs)
- elif height is not None:
- height = xp.asarray(height)
- heights = xp.asarray([x.dist for x in nodes])
- cols_idx = xp.searchsorted(heights, height)
- else:
- n_clusters = xp.asarray(n_clusters)
- cols_idx = nobs - xp.searchsorted(xp.arange(nobs), n_clusters)
- try:
- n_cols = len(cols_idx)
- except TypeError: # scalar
- n_cols = 1
- cols_idx = xp.asarray([cols_idx])
- groups = xp.zeros((n_cols, nobs), dtype=xp.int64)
- last_group = xp.arange(nobs)
- if 0 in cols_idx:
- groups[0] = last_group
- for i, node in enumerate(nodes):
- idx = node.pre_order()
- this_group = xp_copy(last_group, xp=xp)
- # TODO ARRAY_API complex indexing not supported
- this_group[idx] = xp.min(last_group[idx])
- this_group[this_group > xp.max(last_group[idx])] -= 1
- if i + 1 in cols_idx:
- groups[np.nonzero(i + 1 == cols_idx)[0]] = this_group
- last_group = this_group
- return groups.T
- @xp_capabilities(jax_jit=False, allow_dask_compute=True)
- def to_tree(Z, rd=False):
- """
- Convert a linkage matrix into an easy-to-use tree object.
- The reference to the root `ClusterNode` object is returned (by default).
- Each `ClusterNode` object has a ``left``, ``right``, ``dist``, ``id``,
- and ``count`` attribute. The left and right attributes point to
- ClusterNode objects that were combined to generate the cluster.
- If both are None then the `ClusterNode` object is a leaf node, its count
- must be 1, and its distance is meaningless but set to 0.
- *Note: This function is provided for the convenience of the library
- user. ClusterNodes are not used as input to any of the functions in this
- library.*
- Parameters
- ----------
- Z : ndarray
- The linkage matrix in proper form (see the `linkage`
- function documentation).
- rd : bool, optional
- When False (default), a reference to the root `ClusterNode` object is
- returned. Otherwise, a tuple ``(r, d)`` is returned. ``r`` is a
- reference to the root node while ``d`` is a list of `ClusterNode`
- objects - one per original entry in the linkage matrix plus entries
- for all clustering steps. If a cluster id is
- less than the number of samples ``n`` in the data that the linkage
- matrix describes, then it corresponds to a singleton cluster (leaf
- node).
- See `linkage` for more information on the assignment of cluster ids
- to clusters.
- Returns
- -------
- tree : ClusterNode or tuple (ClusterNode, list of ClusterNode)
- If ``rd`` is False, a `ClusterNode`.
- If ``rd`` is True, a list of length ``2*n - 1``, with ``n`` the number
- of samples. See the description of `rd` above for more details.
- See Also
- --------
- linkage, is_valid_linkage, ClusterNode
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.cluster import hierarchy
- >>> rng = np.random.default_rng()
- >>> x = rng.random((5, 2))
- >>> Z = hierarchy.linkage(x)
- >>> hierarchy.to_tree(Z)
- <scipy.cluster.hierarchy.ClusterNode object at ...
- >>> rootnode, nodelist = hierarchy.to_tree(Z, rd=True)
- >>> rootnode
- <scipy.cluster.hierarchy.ClusterNode object at ...
- >>> len(nodelist)
- 9
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, order='C', xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', materialize=True, xp=xp)
- # Number of original objects is equal to the number of rows plus 1.
- n = Z.shape[0] + 1
- # Create a list full of None's to store the node objects
- d = [None] * (n * 2 - 1)
- # Create the nodes corresponding to the n original objects.
- for i in range(0, n):
- d[i] = ClusterNode(i)
- nd = None
- for i in range(Z.shape[0]):
- row = Z[i, :]
- fi = int_floor(row[0], xp)
- fj = int_floor(row[1], xp)
- if fi > i + n:
- raise ValueError('Corrupt matrix Z. Index to derivative cluster '
- f'is used before it is formed. See row {fi}, '
- 'column 0')
- if fj > i + n:
- raise ValueError('Corrupt matrix Z. Index to derivative cluster '
- f'is used before it is formed. See row {fj}, '
- 'column 1')
- nd = ClusterNode(i + n, d[fi], d[fj], row[2])
- # ^ id ^ left ^ right ^ dist
- if row[3] != nd.count:
- raise ValueError(f'Corrupt matrix Z. The count Z[{i},3] is '
- 'incorrect.')
- d[n + i] = nd
- if rd:
- return (nd, d)
- else:
- return nd
- @lazy_cython
- def optimal_leaf_ordering(Z, y, metric='euclidean'):
- """
- Given a linkage matrix Z and distance, reorder the cut tree.
- Parameters
- ----------
- Z : ndarray
- The hierarchical clustering encoded as a linkage matrix. See
- `linkage` for more information on the return structure and
- algorithm.
- y : ndarray
- The condensed distance matrix from which Z was generated.
- Alternatively, a collection of m observation vectors in n
- dimensions may be passed as an m by n array.
- metric : str or function, optional
- The distance metric to use in the case that y is a collection of
- observation vectors; ignored otherwise. See the ``pdist``
- function for a list of valid distance metrics. A custom distance
- function can also be used.
- Returns
- -------
- Z_ordered : ndarray
- A copy of the linkage matrix Z, reordered to minimize the distance
- between adjacent leaves.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.cluster import hierarchy
- >>> rng = np.random.default_rng()
- >>> X = rng.standard_normal((10, 10))
- >>> Z = hierarchy.ward(X)
- >>> hierarchy.leaves_list(Z)
- array([0, 3, 1, 9, 2, 5, 7, 4, 6, 8], dtype=int32)
- >>> hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X))
- array([3, 0, 2, 5, 7, 4, 8, 6, 9, 1], dtype=int32)
- """
- xp = array_namespace(Z, y)
- Z = _asarray(Z, order='C', xp=xp)
- y = _asarray(y, order='C', dtype=xp.float64, xp=xp)
- lazy = is_lazy_array(Z)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- if y.ndim == 1:
- distance.is_valid_y(y, throw=True, name='y')
- elif y.ndim == 2:
- if (not lazy and y.shape[0] == y.shape[1]
- and xp.all(xpx.isclose(xp.linalg.diagonal(y), 0))
- and xp.all(y >= 0) and xp.all(xpx.isclose(y, y.T))):
- warnings.warn('The symmetric non-negative hollow observation '
- 'matrix looks suspiciously like an uncondensed '
- 'distance matrix',
- ClusterWarning, stacklevel=2)
- y = distance.pdist(y, metric)
- else:
- raise ValueError("`y` must be 1 or 2 dimensional.")
- if not lazy and not xp.all(xp.isfinite(y)):
- raise ValueError("The condensed distance matrix must contain only "
- "finite values.")
- # The function name is prominently visible on the user-facing Dask dashboard;
- # make sure it is meaningful.
- def cy_optimal_leaf_ordering(Z, y, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- if not np.all(np.isfinite(y)):
- raise ValueError("The condensed distance matrix must contain only "
- "finite values.")
- return _optimal_leaf_ordering.optimal_leaf_ordering(Z, y)
- return xpx.lazy_apply(cy_optimal_leaf_ordering, Z, y, validate=lazy,
- shape=Z.shape, dtype=Z.dtype, as_numpy=True, xp=xp)
- @lazy_cython
- def cophenet(Z, Y=None):
- """
- Calculate the cophenetic distances between each observation in
- the hierarchical clustering defined by the linkage ``Z``.
- Suppose ``p`` and ``q`` are original observations in
- disjoint clusters ``s`` and ``t``, respectively and
- ``s`` and ``t`` are joined by a direct parent cluster
- ``u``. The cophenetic distance between observations
- ``i`` and ``j`` is simply the distance between
- clusters ``s`` and ``t``.
- Parameters
- ----------
- Z : ndarray
- The hierarchical clustering encoded as an array
- (see `linkage` function).
- Y : ndarray (optional)
- Calculates the cophenetic correlation coefficient ``c`` of a
- hierarchical clustering defined by the linkage matrix `Z`
- of a set of :math:`n` observations in :math:`m`
- dimensions. `Y` is the condensed distance matrix from which
- `Z` was generated.
- Returns
- -------
- c : ndarray
- The cophentic correlation distance (if ``Y`` is passed).
- d : ndarray
- The cophenetic distance matrix in condensed form. The
- :math:`ij` th entry is the cophenetic distance between
- original observations :math:`i` and :math:`j`.
- See Also
- --------
- linkage :
- for a description of what a linkage matrix is.
- scipy.spatial.distance.squareform :
- transforming condensed matrices into square ones.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import single, cophenet
- >>> from scipy.spatial.distance import pdist, squareform
- Given a dataset ``X`` and a linkage matrix ``Z``, the cophenetic distance
- between two points of ``X`` is the distance between the largest two
- distinct clusters that each of the points:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- ``X`` corresponds to this dataset ::
- x x x x
- x x
- x x
- x x x x
- >>> Z = single(pdist(X))
- >>> Z
- array([[ 0., 1., 1., 2.],
- [ 2., 12., 1., 3.],
- [ 3., 4., 1., 2.],
- [ 5., 14., 1., 3.],
- [ 6., 7., 1., 2.],
- [ 8., 16., 1., 3.],
- [ 9., 10., 1., 2.],
- [11., 18., 1., 3.],
- [13., 15., 2., 6.],
- [17., 20., 2., 9.],
- [19., 21., 2., 12.]])
- >>> cophenet(Z)
- array([1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2.,
- 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2.,
- 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
- 1., 1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1.])
- The output of the `scipy.cluster.hierarchy.cophenet` method is
- represented in condensed form. We can use
- `scipy.spatial.distance.squareform` to see the output as a
- regular matrix (where each element ``ij`` denotes the cophenetic distance
- between each ``i``, ``j`` pair of points in ``X``):
- >>> squareform(cophenet(Z))
- array([[0., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
- [1., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
- [1., 1., 0., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
- [2., 2., 2., 0., 1., 1., 2., 2., 2., 2., 2., 2.],
- [2., 2., 2., 1., 0., 1., 2., 2., 2., 2., 2., 2.],
- [2., 2., 2., 1., 1., 0., 2., 2., 2., 2., 2., 2.],
- [2., 2., 2., 2., 2., 2., 0., 1., 1., 2., 2., 2.],
- [2., 2., 2., 2., 2., 2., 1., 0., 1., 2., 2., 2.],
- [2., 2., 2., 2., 2., 2., 1., 1., 0., 2., 2., 2.],
- [2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 1., 1.],
- [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 0., 1.],
- [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 0.]])
- In this example, the cophenetic distance between points on ``X`` that are
- very close (i.e., in the same corner) is 1. For other pairs of points is 2,
- because the points will be located in clusters at different
- corners - thus, the distance between these clusters will be larger.
- """
- xp = array_namespace(Z, Y)
- # Ensure float64 C-contiguous array. Cython code doesn't deal with striding.
- Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- def cy_cophenet(Z, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- n = Z.shape[0] + 1
- zz = np.zeros((n * (n-1)) // 2, dtype=np.float64)
- _hierarchy.cophenetic_distances(Z, zz, n)
- return zz
-
- n = Z.shape[0] + 1
- zz = xpx.lazy_apply(cy_cophenet, Z, validate=is_lazy_array(Z),
- shape=((n * (n-1)) // 2, ), dtype=xp.float64,
- as_numpy=True, xp=xp)
-
- if Y is None:
- return zz
- Y = _asarray(Y, order='C', xp=xp)
- distance.is_valid_y(Y, throw=True, name='Y')
- z = xp.mean(zz)
- y = xp.mean(Y)
- Yy = Y - y
- Zz = zz - z
- numerator = (Yy * Zz)
- denomA = Yy**2
- denomB = Zz**2
- c = xp.sum(numerator) / xp.sqrt(xp.sum(denomA) * xp.sum(denomB))
- return (c, zz)
- @lazy_cython
- def inconsistent(Z, d=2):
- r"""
- Calculate inconsistency statistics on a linkage matrix.
- Parameters
- ----------
- Z : ndarray
- The :math:`(n-1)` by 4 matrix encoding the linkage (hierarchical
- clustering). See `linkage` documentation for more information on its
- form.
- d : int, optional
- The number of links up to `d` levels below each non-singleton cluster.
- Returns
- -------
- R : ndarray
- A :math:`(n-1)` by 4 matrix where the ``i``'th row contains the link
- statistics for the non-singleton cluster ``i``. The link statistics are
- computed over the link heights for links :math:`d` levels below the
- cluster ``i``. ``R[i,0]`` and ``R[i,1]`` are the mean and standard
- deviation of the link heights, respectively; ``R[i,2]`` is the number
- of links included in the calculation; and ``R[i,3]`` is the
- inconsistency coefficient,
- .. math:: \frac{\mathtt{Z[i,2]} - \mathtt{R[i,0]}} {R[i,1]}
- Notes
- -----
- This function behaves similarly to the MATLAB(TM) ``inconsistent``
- function.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import inconsistent, linkage
- >>> from matplotlib import pyplot as plt
- >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
- >>> Z = linkage(X, 'ward')
- >>> print(Z)
- [[ 5. 6. 0. 2. ]
- [ 2. 7. 0. 2. ]
- [ 0. 4. 1. 2. ]
- [ 1. 8. 1.15470054 3. ]
- [ 9. 10. 2.12132034 4. ]
- [ 3. 12. 4.11096096 5. ]
- [11. 13. 14.07183949 8. ]]
- >>> inconsistent(Z)
- array([[ 0. , 0. , 1. , 0. ],
- [ 0. , 0. , 1. , 0. ],
- [ 1. , 0. , 1. , 0. ],
- [ 0.57735027, 0.81649658, 2. , 0.70710678],
- [ 1.04044011, 1.06123822, 3. , 1.01850858],
- [ 3.11614065, 1.40688837, 2. , 0.70710678],
- [ 6.44583366, 6.76770586, 3. , 1.12682288]])
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- if d != np.floor(d) or d < 0:
- raise ValueError('The second argument d must be a nonnegative '
- 'integer value.')
- def cy_inconsistent(Z, d, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- R = np.zeros((Z.shape[0], 4), dtype=np.float64)
- n = Z.shape[0] + 1
- _hierarchy.inconsistent(Z, R, n, d)
- return R
- return xpx.lazy_apply(cy_inconsistent, Z, d=int(d), validate=is_lazy_array(Z),
- shape=(Z.shape[0], 4), dtype=xp.float64,
- as_numpy=True, xp=xp)
- @lazy_cython
- def from_mlab_linkage(Z):
- """
- Convert a linkage matrix generated by MATLAB(TM) to a new
- linkage matrix compatible with this module.
- The conversion does two things:
- * the indices are converted from ``1..N`` to ``0..(N-1)`` form,
- and
- * a fourth column ``Z[:,3]`` is added where ``Z[i,3]`` represents the
- number of original observations (leaves) in the non-singleton
- cluster ``i``.
- This function is useful when loading in linkages from legacy data
- files generated by MATLAB.
- Parameters
- ----------
- Z : ndarray
- A linkage matrix generated by MATLAB(TM).
- Returns
- -------
- ZS : ndarray
- A linkage matrix compatible with ``scipy.cluster.hierarchy``.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- to_mlab_linkage : transform from SciPy to MATLAB format.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.cluster.hierarchy import ward, from_mlab_linkage
- Given a linkage matrix in MATLAB format ``mZ``, we can use
- `scipy.cluster.hierarchy.from_mlab_linkage` to import
- it into SciPy format:
- >>> mZ = np.array([[1, 2, 1], [4, 5, 1], [7, 8, 1],
- ... [10, 11, 1], [3, 13, 1.29099445],
- ... [6, 14, 1.29099445],
- ... [9, 15, 1.29099445],
- ... [12, 16, 1.29099445],
- ... [17, 18, 5.77350269],
- ... [19, 20, 5.77350269],
- ... [21, 22, 8.16496581]])
- >>> Z = from_mlab_linkage(mZ)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [ 11. , 15. , 1.29099445, 3. ],
- [ 16. , 17. , 5.77350269, 6. ],
- [ 18. , 19. , 5.77350269, 6. ],
- [ 20. , 21. , 8.16496581, 12. ]])
- As expected, the linkage matrix ``Z`` returned includes an
- additional column counting the number of original samples in
- each cluster. Also, all cluster indices are reduced by 1
- (MATLAB format uses 1-indexing, whereas SciPy uses 0-indexing).
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, dtype=xp.float64, order='C', xp=xp)
- # If it's empty, return it.
- if Z.shape in ((), (0, )):
- return xp_copy(Z, xp=xp)
- if Z.ndim != 2:
- raise ValueError("The linkage array must be rectangular.")
- # If it contains no rows, return it.
- n = Z.shape[0]
- if n == 0:
- return xp_copy(Z, xp=xp)
- lazy = is_lazy_array(Z)
- if not lazy and xp.min(Z[:, :2]) != 1.0 and xp.max(Z[:, :2]) != 2 * n:
- raise ValueError('The format of the indices is not 1..N')
- res = xp.empty((Z.shape[0], Z.shape[1] + 1), dtype=Z.dtype)
- res = xpx.at(res)[:, :2].set(Z[:, :2] - 1.0)
- res = xpx.at(res)[:, 2:-1].set(Z[:, 2:])
- def cy_from_mlab_linkage(Zpart, validate):
- n = Zpart.shape[0]
- if validate and np.min(Zpart[:, :2]) != 0.0 and np.max(Zpart[:, :2]) != 2 * n:
- raise ValueError('The format of the indices is not 1..N')
- if not Zpart.flags.writeable:
- Zpart = Zpart.copy() # xp=jax.numpy
- CS = np.zeros((n,))
- _hierarchy.calculate_cluster_sizes(Zpart, CS, n + 1)
- return CS
- CS = xpx.lazy_apply(cy_from_mlab_linkage, res[:, :-1], validate=lazy,
- shape=(res.shape[0],), dtype=xp.float64,
- as_numpy=True, xp=xp)
- return xpx.at(res)[:, -1].set(CS)
- @xp_capabilities()
- def to_mlab_linkage(Z):
- """
- Convert a linkage matrix to a MATLAB(TM) compatible one.
- Converts a linkage matrix ``Z`` generated by the linkage function
- of this module to a MATLAB(TM) compatible one. The return linkage
- matrix has the last column removed and the cluster indices are
- converted to ``1..N`` indexing.
- Parameters
- ----------
- Z : ndarray
- A linkage matrix generated by ``scipy.cluster.hierarchy``.
- Returns
- -------
- to_mlab_linkage : ndarray
- A linkage matrix compatible with MATLAB(TM)'s hierarchical
- clustering functions.
- The return linkage matrix has the last column removed
- and the cluster indices are converted to ``1..N`` indexing.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- from_mlab_linkage : transform from Matlab to SciPy format.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, to_mlab_linkage
- >>> from scipy.spatial.distance import pdist
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [11. , 15. , 1.29099445, 3. ],
- [16. , 17. , 5.77350269, 6. ],
- [18. , 19. , 5.77350269, 6. ],
- [20. , 21. , 8.16496581, 12. ]])
- After a linkage matrix ``Z`` has been created, we can use
- `scipy.cluster.hierarchy.to_mlab_linkage` to convert it
- into MATLAB format:
- >>> mZ = to_mlab_linkage(Z)
- >>> mZ
- array([[ 1. , 2. , 1. ],
- [ 4. , 5. , 1. ],
- [ 7. , 8. , 1. ],
- [ 10. , 11. , 1. ],
- [ 3. , 13. , 1.29099445],
- [ 6. , 14. , 1.29099445],
- [ 9. , 15. , 1.29099445],
- [ 12. , 16. , 1.29099445],
- [ 17. , 18. , 5.77350269],
- [ 19. , 20. , 5.77350269],
- [ 21. , 22. , 8.16496581]])
- The new linkage matrix ``mZ`` uses 1-indexing for all the
- clusters (instead of 0-indexing). Also, the last column of
- the original linkage matrix has been dropped.
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, dtype=xp.float64, xp=xp)
- if Z.shape in ((), (0, )):
- return xp_copy(Z, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- return xp.concat((Z[:, :2] + 1.0, Z[:, 2:3]), axis=1)
- @xp_capabilities()
- def is_monotonic(Z):
- """
- Return True if the linkage passed is monotonic.
- The linkage is monotonic if for every cluster :math:`s` and :math:`t`
- joined, the distance between them is no less than the distance
- between any previously joined clusters.
- Parameters
- ----------
- Z : ndarray
- The linkage matrix to check for monotonicity.
- Returns
- -------
- b : bool
- A boolean indicating whether the linkage is monotonic.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import median, ward, is_monotonic
- >>> from scipy.spatial.distance import pdist
- By definition, some hierarchical clustering algorithms - such as
- `scipy.cluster.hierarchy.ward` - produce monotonic assignments of
- samples to clusters; however, this is not always true for other
- hierarchical methods - e.g. `scipy.cluster.hierarchy.median`.
- Given a linkage matrix ``Z`` (as the result of a hierarchical clustering
- method) we can test programmatically whether it has the monotonicity
- property or not, using `scipy.cluster.hierarchy.is_monotonic`:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [11. , 15. , 1.29099445, 3. ],
- [16. , 17. , 5.77350269, 6. ],
- [18. , 19. , 5.77350269, 6. ],
- [20. , 21. , 8.16496581, 12. ]])
- >>> is_monotonic(Z)
- True
- >>> Z = median(pdist(X))
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 2. , 12. , 1.11803399, 3. ],
- [ 5. , 13. , 1.11803399, 3. ],
- [ 8. , 15. , 1.11803399, 3. ],
- [11. , 14. , 1.11803399, 3. ],
- [18. , 19. , 3. , 6. ],
- [16. , 17. , 3.5 , 6. ],
- [20. , 21. , 3.25 , 12. ]])
- >>> is_monotonic(Z)
- False
- Note that this method is equivalent to just verifying that the distances
- in the third column of the linkage matrix appear in a monotonically
- increasing order.
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- # We expect the i'th value to be greater than its successor.
- return xp.all(Z[1:, 2] >= Z[:-1, 2])
- @xp_capabilities(warnings=[("dask.array", "see notes"), ("jax.numpy", "see notes")])
- def is_valid_im(R, warning=False, throw=False, name=None):
- """Return True if the inconsistency matrix passed is valid.
- It must be a :math:`n` by 4 array of doubles. The standard
- deviations ``R[:,1]`` must be nonnegative. The link counts
- ``R[:,2]`` must be positive and no greater than :math:`n-1`.
- Parameters
- ----------
- R : ndarray
- The inconsistency matrix to check for validity.
- warning : bool, optional
- When True, issues a Python warning if the linkage
- matrix passed is invalid.
- throw : bool, optional
- When True, throws a Python exception if the linkage
- matrix passed is invalid.
- name : str, optional
- This string refers to the variable name of the invalid
- linkage matrix.
- Returns
- -------
- b : bool
- True if the inconsistency matrix is valid; False otherwise.
- Notes
- -----
- *Array API support (experimental):* If the input is a lazy Array (e.g. Dask
- or JAX), the return value may be a 0-dimensional bool Array. When warning=True
- or throw=True, calling this function materializes the array.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- inconsistent : for the creation of a inconsistency matrix.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, inconsistent, is_valid_im
- >>> from scipy.spatial.distance import pdist
- Given a data set ``X``, we can apply a clustering method to obtain a
- linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
- be also used to obtain the inconsistency matrix ``R`` associated to
- this clustering process:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- >>> R = inconsistent(Z)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [11. , 15. , 1.29099445, 3. ],
- [16. , 17. , 5.77350269, 6. ],
- [18. , 19. , 5.77350269, 6. ],
- [20. , 21. , 8.16496581, 12. ]])
- >>> R
- array([[1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1.14549722, 0.20576415, 2. , 0.70710678],
- [1.14549722, 0.20576415, 2. , 0.70710678],
- [1.14549722, 0.20576415, 2. , 0.70710678],
- [1.14549722, 0.20576415, 2. , 0.70710678],
- [2.78516386, 2.58797734, 3. , 1.15470054],
- [2.78516386, 2.58797734, 3. , 1.15470054],
- [6.57065706, 1.38071187, 3. , 1.15470054]])
- Now we can use `scipy.cluster.hierarchy.is_valid_im` to verify that
- ``R`` is correct:
- >>> is_valid_im(R)
- True
- However, if ``R`` is wrongly constructed (e.g., one of the standard
- deviations is set to a negative value), then the check will fail:
- >>> R[-1,1] = R[-1,1] * -1
- >>> is_valid_im(R)
- False
- """
- xp = array_namespace(R)
- R = _asarray(R, xp=xp)
- return _is_valid_im(R, warning=warning, throw=throw, name=name,
- materialize=True, xp=xp)
- def _is_valid_im(R, warning=False, throw=False, name=None, materialize=False, *, xp):
- """Variant of `is_valid_im` to be called internally by other scipy functions,
- which by default does not materialize lazy input arrays (Dask, JAX, etc.) when
- warning=True or throw=True.
- """
- name_str = f"{name!r} " if name else ''
- try:
- if R.dtype != xp.float64:
- raise TypeError(f'Inconsistency matrix {name_str}must contain doubles '
- '(double).')
- if len(R.shape) != 2:
- raise ValueError(f'Inconsistency matrix {name_str}must have shape=2 (i.e. '
- 'be two-dimensional).')
- if R.shape[1] != 4:
- raise ValueError(f'Inconsistency matrix {name_str}'
- 'must have 4 columns.')
- if R.shape[0] < 1:
- raise ValueError(f'Inconsistency matrix {name_str}'
- 'must have at least one row.')
- except (TypeError, ValueError) as e:
- if throw:
- raise
- if warning:
- _warning(str(e))
- return False
- return _lazy_valid_checks(
- (xp.any(R[:, 0] < 0),
- f'Inconsistency matrix {name_str} contains negative link height means.'),
- (xp.any(R[:, 1] < 0),
- f'Inconsistency matrix {name_str} contains negative link height standard '
- 'deviations.'),
- (xp.any(R[:, 2] < 0),
- f'Inconsistency matrix {name_str} contains negative link counts.'),
- throw=throw, warning=warning, materialize=materialize, xp=xp
- )
- @xp_capabilities(warnings=[("dask.array", "see notes"), ("jax.numpy", "see notes")])
- def is_valid_linkage(Z, warning=False, throw=False, name=None):
- """
- Check the validity of a linkage matrix.
- A linkage matrix is valid if it is a 2-D array (type double)
- with :math:`n` rows and 4 columns. The first two columns must contain
- indices between 0 and :math:`2n-1`. For a given row ``i``, the following
- two expressions have to hold:
- .. math::
- 0 \\leq \\mathtt{Z[i,0]} \\leq i+n-1
- 0 \\leq Z[i,1] \\leq i+n-1
- I.e., a cluster cannot join another cluster unless the cluster being joined
- has been generated.
- The fourth column of `Z` represents the number of original observations
- in a cluster, so a valid ``Z[i, 3]`` value may not exceed the number of
- original observations.
- Parameters
- ----------
- Z : array_like
- Linkage matrix.
- warning : bool, optional
- When True, issues a Python warning if the linkage
- matrix passed is invalid.
- throw : bool, optional
- When True, throws a Python exception if the linkage
- matrix passed is invalid.
- name : str, optional
- This string refers to the variable name of the invalid
- linkage matrix.
- Returns
- -------
- b : bool
- True if the inconsistency matrix is valid; False otherwise.
- Notes
- -----
- *Array API support (experimental):* If the input is a lazy Array (e.g. Dask
- or JAX), the return value may be a 0-dimensional bool Array. When warning=True
- or throw=True, calling this function materializes the array.
- See Also
- --------
- linkage: for a description of what a linkage matrix is.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, is_valid_linkage
- >>> from scipy.spatial.distance import pdist
- All linkage matrices generated by the clustering methods in this module
- will be valid (i.e., they will have the appropriate dimensions and the two
- required expressions will hold for all the rows).
- We can check this using `scipy.cluster.hierarchy.is_valid_linkage`:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [11. , 15. , 1.29099445, 3. ],
- [16. , 17. , 5.77350269, 6. ],
- [18. , 19. , 5.77350269, 6. ],
- [20. , 21. , 8.16496581, 12. ]])
- >>> is_valid_linkage(Z)
- True
- However, if we create a linkage matrix in a wrong way - or if we modify
- a valid one in a way that any of the required expressions don't hold
- anymore, then the check will fail:
- >>> Z[3][1] = 20 # the cluster number 20 is not defined at this point
- >>> is_valid_linkage(Z)
- False
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, xp=xp)
- return _is_valid_linkage(Z, warning=warning, throw=throw,
- name=name, materialize=True, xp=xp)
- def _is_valid_linkage(Z, warning=False, throw=False, name=None,
- materialize=False, *, xp):
- """Variant of `is_valid_linkage` to be called internally by other scipy functions,
- which by default does not materialize lazy input arrays (Dask, JAX, etc.) when
- warning=True or throw=True.
- """
- name_str = f"{name!r} " if name else ''
- try:
- if Z.dtype != xp.float64:
- raise TypeError(f'Linkage matrix {name_str}must contain doubles.')
- if len(Z.shape) != 2:
- raise ValueError(f'Linkage matrix {name_str}must have shape=2 (i.e. be'
- ' two-dimensional).')
- if Z.shape[1] != 4:
- raise ValueError(f'Linkage matrix {name_str}must have 4 columns.')
- if Z.shape[0] == 0:
- raise ValueError('Linkage must be computed on at least two '
- 'observations.')
- except (TypeError, ValueError) as e:
- if throw:
- raise
- if warning:
- _warning(str(e))
- return False
- n = Z.shape[0]
- if n < 2:
- return True
- return _lazy_valid_checks(
- (xp.any(Z[:, :2] < 0),
- f'Linkage {name_str}contains negative indices.'),
- (xp.any(Z[:, 2] < 0),
- f'Linkage {name_str}contains negative distances.'),
- (xp.any(Z[:, 3] < 0),
- f'Linkage {name_str}contains negative counts.'),
- (xp.any(Z[:, 3] > n + 1),
- f'Linkage {name_str}contains excessive observations in a cluster'),
- (xp.any(xp.max(Z[:, :2], axis=1) >= xp.arange(n + 1, 2 * n + 1, dtype=Z.dtype)),
- f'Linkage {name_str}uses non-singleton cluster before it is formed.'),
- (xpx.nunique(Z[:, :2]) < n * 2,
- f'Linkage {name_str}uses the same cluster more than once.'),
- throw=throw, warning=warning, materialize=materialize, xp=xp
- )
- def _lazy_valid_checks(*args, throw=False, warning=False, materialize=False, xp):
- """Validate a set of conditions on the contents of possibly lazy arrays.
- Parameters
- ----------
- args : tuples of (Array, str)
- The first element of each tuple must be a 0-dimensional Array
- that evaluates to bool; the second element must be the message to convey
- if the first element evaluates to True.
- throw: bool
- Set to True to `raise ValueError(args[i][1])` if `args[i][0]` is True.
- warning: bool
- Set to True to issue a warning with message `args[i][1]` if `args[i][0]`
- is True.
- materialize: bool
- Set to True to force materialization of lazy arrays when throw=True or
- warning=True. If the inputs are lazy and materialize=False, ignore the
- `throw` and `warning` flags.
- xp: module
- Array API namespace
- Returns
- -------
- If xp is an eager backend (e.g. numpy) and all conditions are False, return True.
- If throw is True, raise. Otherwise, return False.
- If xp is a lazy backend (e.g. Dask or JAX), return a 0-dimensional bool Array.
- """
- conds = xp.concat([xp.reshape(cond, (1, )) for cond, _ in args])
- lazy = is_lazy_array(conds)
- if not throw and not warning or (lazy and not materialize):
- out = ~xp.any(conds)
- return out if lazy else bool(out)
- if is_dask(xp):
- # Only materialize the graph once, instead of once per check
- conds = conds.compute()
- # Don't call np.asarray(conds), as it would be blocked by the device transfer
- # guard on CuPy and PyTorch and the densification guard on Sparse, whereas
- # bool() will not.
- conds = [bool(cond) for cond in conds]
- for cond, (_, msg) in zip(conds, args):
- if throw and cond:
- raise ValueError(msg)
- elif warning and cond:
- warnings.warn(msg, ClusterWarning, stacklevel=3)
- return not any(conds)
- @xp_capabilities()
- def num_obs_linkage(Z):
- """
- Return the number of original observations of the linkage matrix passed.
- Parameters
- ----------
- Z : ndarray
- The linkage matrix on which to perform the operation.
- Returns
- -------
- n : int
- The number of original observations in the linkage.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, num_obs_linkage
- >>> from scipy.spatial.distance import pdist
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- ``Z`` is a linkage matrix obtained after using the Ward clustering method
- with ``X``, a dataset with 12 data points.
- >>> num_obs_linkage(Z)
- 12
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- return Z.shape[0] + 1
- @xp_capabilities()
- def correspond(Z, Y):
- """
- Check for correspondence between linkage and condensed distance matrices.
- They must have the same number of original observations for
- the check to succeed.
- This function is useful as a sanity check in algorithms that make
- extensive use of linkage and distance matrices that must
- correspond to the same set of original observations.
- Parameters
- ----------
- Z : array_like
- The linkage matrix to check for correspondence.
- Y : array_like
- The condensed distance matrix to check for correspondence.
- Returns
- -------
- b : bool
- A boolean indicating whether the linkage matrix and distance
- matrix could possibly correspond to one another.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, correspond
- >>> from scipy.spatial.distance import pdist
- This method can be used to check if a given linkage matrix ``Z`` has been
- obtained from the application of a cluster method over a dataset ``X``:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> X_condensed = pdist(X)
- >>> Z = ward(X_condensed)
- Here, we can compare ``Z`` and ``X`` (in condensed form):
- >>> correspond(Z, X_condensed)
- True
- """
- xp = array_namespace(Z, Y)
- Z = _asarray(Z, xp=xp)
- Y = _asarray(Y, xp=xp)
- _is_valid_linkage(Z, throw=True, xp=xp)
- distance.is_valid_y(Y, throw=True)
- return distance.num_obs_y(Y) == num_obs_linkage(Z)
- @xp_capabilities(cpu_only=True, reason="Cython code",
- jax_jit=False, allow_dask_compute=True)
- def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None):
- """
- Form flat clusters from the hierarchical clustering defined by
- the given linkage matrix.
- Parameters
- ----------
- Z : ndarray
- The hierarchical clustering encoded with the matrix returned
- by the `linkage` function.
- t : scalar
- For criteria 'inconsistent', 'distance' or 'monocrit',
- this is the threshold to apply when forming flat clusters.
- For 'maxclust' or 'maxclust_monocrit' criteria,
- this would be max number of clusters requested.
- criterion : str, optional
- The criterion to use in forming flat clusters. This can
- be any of the following values:
- ``inconsistent`` :
- If a cluster node and all its
- descendants have an inconsistent value less than or equal
- to `t`, then all its leaf descendants belong to the
- same flat cluster. When no non-singleton cluster meets
- this criterion, every node is assigned to its own
- cluster. (Default)
- ``distance`` :
- Forms flat clusters so that the original
- observations in each flat cluster have no greater a
- cophenetic distance than `t`.
- ``maxclust`` :
- Finds a minimum threshold ``r`` so that
- the cophenetic distance between any two original
- observations in the same flat cluster is no more than
- ``r`` and no more than `t` flat clusters are formed.
- ``monocrit`` :
- Forms a flat cluster from a cluster node c
- with index i when ``monocrit[j] <= t``.
- For example, to threshold on the maximum mean distance
- as computed in the inconsistency matrix R with a
- threshold of 0.8 do::
- MR = maxRstat(Z, R, 3)
- fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR)
- ``maxclust_monocrit`` :
- Forms a flat cluster from a
- non-singleton cluster node ``c`` when ``monocrit[i] <=
- r`` for all cluster indices ``i`` below and including
- ``c``. ``r`` is minimized such that no more than ``t``
- flat clusters are formed. monocrit must be
- monotonic. For example, to minimize the threshold t on
- maximum inconsistency values so that no more than 3 flat
- clusters are formed, do::
- MI = maxinconsts(Z, R)
- fcluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI)
- depth : int, optional
- The maximum depth to perform the inconsistency calculation.
- It has no meaning for the other criteria. Default is 2.
- R : ndarray, optional
- The inconsistency matrix to use for the ``'inconsistent'``
- criterion. This matrix is computed if not provided.
- monocrit : ndarray, optional
- An array of length n-1. `monocrit[i]` is the
- statistics upon which non-singleton i is thresholded. The
- monocrit vector must be monotonic, i.e., given a node c with
- index i, for all node indices j corresponding to nodes
- below c, ``monocrit[i] >= monocrit[j]``.
- Returns
- -------
- fcluster : ndarray
- An array of length ``n``. ``T[i]`` is the flat cluster number to
- which original observation ``i`` belongs.
- See Also
- --------
- linkage : for information about hierarchical clustering methods work.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, fcluster
- >>> from scipy.spatial.distance import pdist
- All cluster linkage methods - e.g., `scipy.cluster.hierarchy.ward`
- generate a linkage matrix ``Z`` as their output:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [11. , 15. , 1.29099445, 3. ],
- [16. , 17. , 5.77350269, 6. ],
- [18. , 19. , 5.77350269, 6. ],
- [20. , 21. , 8.16496581, 12. ]])
- This matrix represents a dendrogram, where the first and second elements
- are the two clusters merged at each step, the third element is the
- distance between these clusters, and the fourth element is the size of
- the new cluster - the number of original data points included.
- `scipy.cluster.hierarchy.fcluster` can be used to flatten the
- dendrogram, obtaining as a result an assignation of the original data
- points to single clusters.
- This assignation mostly depends on a distance threshold ``t`` - the maximum
- inter-cluster distance allowed:
- >>> fcluster(Z, t=0.9, criterion='distance')
- array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
- >>> fcluster(Z, t=1.1, criterion='distance')
- array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32)
- >>> fcluster(Z, t=3, criterion='distance')
- array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
- >>> fcluster(Z, t=9, criterion='distance')
- array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
- In the first case, the threshold ``t`` is too small to allow any two
- samples in the data to form a cluster, so 12 different clusters are
- returned.
- In the second case, the threshold is large enough to allow the first
- 4 points to be merged with their nearest neighbors. So, here, only 8
- clusters are returned.
- The third case, with a much higher threshold, allows for up to 8 data
- points to be connected - so 4 clusters are returned here.
- Lastly, the threshold of the fourth case is large enough to allow for
- all data points to be merged together - so a single cluster is returned.
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', materialize=True, xp=xp)
- n = Z.shape[0] + 1
- T = np.zeros((n,), dtype='i')
- if monocrit is not None:
- monocrit = np.asarray(monocrit, order='C', dtype=np.float64)
- Z = np.asarray(Z)
- monocrit = np.asarray(monocrit)
- if criterion == 'inconsistent':
- if R is None:
- R = inconsistent(Z, depth)
- else:
- R = _asarray(R, order='C', dtype=xp.float64, xp=xp)
- _is_valid_im(R, throw=True, name='R', materialize=True, xp=xp)
- # Since the C code does not support striding using strides.
- # The dimensions are used instead.
- R = np.asarray(R)
- _hierarchy.cluster_in(Z, R, T, float(t), int(n))
- elif criterion == 'distance':
- _hierarchy.cluster_dist(Z, T, float(t), int(n))
- elif criterion == 'maxclust':
- _hierarchy.cluster_maxclust_dist(Z, T, int(n), t)
- elif criterion == 'monocrit':
- _hierarchy.cluster_monocrit(Z, monocrit, T, float(t), int(n))
- elif criterion == 'maxclust_monocrit':
- _hierarchy.cluster_maxclust_monocrit(Z, monocrit, T, int(n), int(t))
- else:
- raise ValueError(f'Invalid cluster formation criterion: {str(criterion)}')
- return xp.asarray(T)
- @xp_capabilities(cpu_only=True, reason="Cython code",
- jax_jit=False, allow_dask_compute=True)
- def fclusterdata(X, t, criterion='inconsistent',
- metric='euclidean', depth=2, method='single', R=None):
- """
- Cluster observation data using a given metric.
- Clusters the original observations in the n-by-m data
- matrix X (n observations in m dimensions), using the euclidean
- distance metric to calculate distances between original observations,
- performs hierarchical clustering using the single linkage algorithm,
- and forms flat clusters using the inconsistency method with `t` as the
- cut-off threshold.
- A 1-D array ``T`` of length ``n`` is returned. ``T[i]`` is
- the index of the flat cluster to which the original observation ``i``
- belongs.
- Parameters
- ----------
- X : (N, M) ndarray
- N by M data matrix with N observations in M dimensions.
- t : scalar
- For criteria 'inconsistent', 'distance' or 'monocrit',
- this is the threshold to apply when forming flat clusters.
- For 'maxclust' or 'maxclust_monocrit' criteria,
- this would be max number of clusters requested.
- criterion : str, optional
- Specifies the criterion for forming flat clusters. Valid
- values are 'inconsistent' (default), 'distance', or 'maxclust'
- cluster formation algorithms. See `fcluster` for descriptions.
- metric : str or function, optional
- The distance metric for calculating pairwise distances. See
- ``distance.pdist`` for descriptions and linkage to verify
- compatibility with the linkage method.
- depth : int, optional
- The maximum depth for the inconsistency calculation. See
- `inconsistent` for more information.
- method : str, optional
- The linkage method to use (single, complete, average,
- weighted, median centroid, ward). See `linkage` for more
- information. Default is "single".
- R : ndarray, optional
- The inconsistency matrix. It will be computed if necessary
- if it is not passed.
- Returns
- -------
- fclusterdata : ndarray
- A vector of length n. T[i] is the flat cluster number to
- which original observation i belongs.
- See Also
- --------
- scipy.spatial.distance.pdist : pairwise distance metrics
- Notes
- -----
- This function is similar to the MATLAB function ``clusterdata``.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import fclusterdata
- This is a convenience method that abstracts all the steps to perform in a
- typical SciPy's hierarchical clustering workflow.
- * Transform the input data into a condensed matrix with
- `scipy.spatial.distance.pdist`.
- * Apply a clustering method.
- * Obtain flat clusters at a user defined distance threshold ``t`` using
- `scipy.cluster.hierarchy.fcluster`.
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> fclusterdata(X, t=1)
- array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
- The output here (for the dataset ``X``, distance threshold ``t``, and the
- default settings) is four clusters with three data points each.
- """
- xp = array_namespace(X)
- X = _asarray(X, order='C', dtype=xp.float64, xp=xp)
- if X.ndim != 2:
- raise TypeError('The observation matrix X must be an n by m array.')
- Y = distance.pdist(X, metric=metric)
- Z = linkage(Y, method=method)
- if R is None:
- R = inconsistent(Z, d=depth)
- else:
- R = _asarray(R, order='C', xp=xp)
- T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)
- return T
- @lazy_cython
- def leaves_list(Z):
- """
- Return a list of leaf node ids.
- The return corresponds to the observation vector index as it appears
- in the tree from left to right. Z is a linkage matrix.
- Parameters
- ----------
- Z : ndarray
- The hierarchical clustering encoded as a matrix. `Z` is
- a linkage matrix. See `linkage` for more information.
- Returns
- -------
- leaves_list : ndarray
- The list of leaf node ids.
- See Also
- --------
- dendrogram : for information about dendrogram structure.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, dendrogram, leaves_list
- >>> from scipy.spatial.distance import pdist
- >>> from matplotlib import pyplot as plt
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- The linkage matrix ``Z`` represents a dendrogram, that is, a tree that
- encodes the structure of the clustering performed.
- `scipy.cluster.hierarchy.leaves_list` shows the mapping between
- indices in the ``X`` dataset and leaves in the dendrogram:
- >>> leaves_list(Z)
- array([ 2, 0, 1, 5, 3, 4, 8, 6, 7, 11, 9, 10], dtype=int32)
- >>> fig = plt.figure(figsize=(25, 10))
- >>> dn = dendrogram(Z)
- >>> plt.show()
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, order='C', xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- def cy_leaves_list(Z, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- n = Z.shape[0] + 1
- ML = np.zeros((n,), dtype=np.int32)
- _hierarchy.prelist(Z, ML, n)
- return ML
- n = Z.shape[0] + 1
- return xpx.lazy_apply(cy_leaves_list, Z, validate=is_lazy_array(Z),
- shape=(n, ), dtype=xp.int32,
- as_numpy=True, xp=xp)
- # Maps number of leaves to text size.
- #
- # p <= 20, size="12"
- # 20 < p <= 30, size="10"
- # 30 < p <= 50, size="8"
- # 50 < p <= np.inf, size="6"
- _dtextsizes = {20: 12, 30: 10, 50: 8, 85: 6, np.inf: 5}
- _drotation = {20: 0, 40: 45, np.inf: 90}
- _dtextsortedkeys = list(_dtextsizes.keys())
- _dtextsortedkeys.sort()
- _drotationsortedkeys = list(_drotation.keys())
- _drotationsortedkeys.sort()
- def _remove_dups(L):
- """
- Remove duplicates AND preserve the original order of the elements.
- The set class is not guaranteed to do this.
- """
- seen_before = set()
- L2 = []
- for i in L:
- if i not in seen_before:
- seen_before.add(i)
- L2.append(i)
- return L2
- def _get_tick_text_size(p):
- for k in _dtextsortedkeys:
- if p <= k:
- return _dtextsizes[k]
- def _get_tick_rotation(p):
- for k in _drotationsortedkeys:
- if p <= k:
- return _drotation[k]
- def _plot_dendrogram(icoords, dcoords, ivl, p, n, mh, orientation,
- no_labels, color_list, leaf_font_size=None,
- leaf_rotation=None, contraction_marks=None,
- ax=None, above_threshold_color='C0'):
- # Import matplotlib here so that it's not imported unless dendrograms
- # are plotted. Raise an informative error if importing fails.
- try:
- # if an axis is provided, don't use pylab at all
- if ax is None:
- import matplotlib.pylab
- import matplotlib.patches
- import matplotlib.collections
- except ImportError as e:
- raise ImportError("You must install the matplotlib library to plot "
- "the dendrogram. Use no_plot=True to calculate the "
- "dendrogram without plotting.") from e
- if ax is None:
- ax = matplotlib.pylab.gca()
- # if we're using pylab, we want to trigger a draw at the end
- trigger_redraw = True
- else:
- trigger_redraw = False
- # Independent variable plot width
- ivw = len(ivl) * 10
- # Dependent variable plot height
- dvw = mh + mh * 0.05
- iv_ticks = np.arange(5, len(ivl) * 10 + 5, 10)
- if orientation in ('top', 'bottom'):
- if orientation == 'top':
- ax.set_ylim([0, dvw])
- ax.set_xlim([0, ivw])
- else:
- ax.set_ylim([dvw, 0])
- ax.set_xlim([0, ivw])
- xlines = icoords
- ylines = dcoords
- if no_labels:
- ax.set_xticks([])
- ax.set_xticklabels([])
- else:
- ax.set_xticks(iv_ticks)
- if orientation == 'top':
- ax.xaxis.set_ticks_position('bottom')
- else:
- ax.xaxis.set_ticks_position('top')
- # Make the tick marks invisible because they cover up the links
- for line in ax.get_xticklines():
- line.set_visible(False)
- leaf_rot = (float(_get_tick_rotation(len(ivl)))
- if (leaf_rotation is None) else leaf_rotation)
- leaf_font = (float(_get_tick_text_size(len(ivl)))
- if (leaf_font_size is None) else leaf_font_size)
- ax.set_xticklabels(ivl, rotation=leaf_rot, size=leaf_font)
- elif orientation in ('left', 'right'):
- if orientation == 'left':
- ax.set_xlim([dvw, 0])
- ax.set_ylim([0, ivw])
- else:
- ax.set_xlim([0, dvw])
- ax.set_ylim([0, ivw])
- xlines = dcoords
- ylines = icoords
- if no_labels:
- ax.set_yticks([])
- ax.set_yticklabels([])
- else:
- ax.set_yticks(iv_ticks)
- if orientation == 'left':
- ax.yaxis.set_ticks_position('right')
- else:
- ax.yaxis.set_ticks_position('left')
- # Make the tick marks invisible because they cover up the links
- for line in ax.get_yticklines():
- line.set_visible(False)
- leaf_font = (float(_get_tick_text_size(len(ivl)))
- if (leaf_font_size is None) else leaf_font_size)
- if leaf_rotation is not None:
- ax.set_yticklabels(ivl, rotation=leaf_rotation, size=leaf_font)
- else:
- ax.set_yticklabels(ivl, size=leaf_font)
- # Let's use collections instead. This way there is a separate legend item
- # for each tree grouping, rather than stupidly one for each line segment.
- colors_used = _remove_dups(color_list)
- color_to_lines = {}
- for color in colors_used:
- color_to_lines[color] = []
- for (xline, yline, color) in zip(xlines, ylines, color_list):
- color_to_lines[color].append(list(zip(xline, yline)))
- colors_to_collections = {}
- # Construct the collections.
- for color in colors_used:
- coll = matplotlib.collections.LineCollection(color_to_lines[color],
- colors=(color,))
- colors_to_collections[color] = coll
- # Add all the groupings below the color threshold.
- for color in colors_used:
- if color != above_threshold_color:
- ax.add_collection(colors_to_collections[color])
- # If there's a grouping of links above the color threshold, it goes last.
- if above_threshold_color in colors_to_collections:
- ax.add_collection(colors_to_collections[above_threshold_color])
- if contraction_marks is not None:
- Ellipse = matplotlib.patches.Ellipse
- for (x, y) in contraction_marks:
- if orientation in ('left', 'right'):
- e = Ellipse((y, x), width=dvw / 100, height=1.0)
- else:
- e = Ellipse((x, y), width=1.0, height=dvw / 100)
- ax.add_artist(e)
- e.set_clip_box(ax.bbox)
- e.set_alpha(0.5)
- e.set_facecolor('k')
- if trigger_redraw:
- matplotlib.pylab.draw_if_interactive()
- # C0 is used for above threshold color
- _link_line_colors_default = ('C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9')
- _link_line_colors = list(_link_line_colors_default)
- @xp_capabilities(out_of_scope=True)
- def set_link_color_palette(palette):
- """
- Set list of matplotlib color codes for use by dendrogram.
- Note that this palette is global (i.e., setting it once changes the colors
- for all subsequent calls to `dendrogram`) and that it affects only the
- the colors below ``color_threshold``.
- Note that `dendrogram` also accepts a custom coloring function through its
- ``link_color_func`` keyword, which is more flexible and non-global.
- Parameters
- ----------
- palette : list of str or None
- A list of matplotlib color codes. The order of the color codes is the
- order in which the colors are cycled through when color thresholding in
- the dendrogram.
- If ``None``, resets the palette to its default (which are matplotlib
- default colors C1 to C9).
- Returns
- -------
- None
- See Also
- --------
- dendrogram
- Notes
- -----
- Ability to reset the palette with ``None`` added in SciPy 0.17.0.
- Thread safety: using this function in a multi-threaded fashion may
- result in `dendrogram` producing plots with unexpected colors.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.cluster import hierarchy
- >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
- ... 400., 754., 564., 138., 219., 869., 669.])
- >>> Z = hierarchy.linkage(ytdist, 'single')
- >>> dn = hierarchy.dendrogram(Z, no_plot=True)
- >>> dn['color_list']
- ['C1', 'C0', 'C0', 'C0', 'C0']
- >>> hierarchy.set_link_color_palette(['c', 'm', 'y', 'k'])
- >>> dn = hierarchy.dendrogram(Z, no_plot=True, above_threshold_color='b')
- >>> dn['color_list']
- ['c', 'b', 'b', 'b', 'b']
- >>> dn = hierarchy.dendrogram(Z, no_plot=True, color_threshold=267,
- ... above_threshold_color='k')
- >>> dn['color_list']
- ['c', 'm', 'm', 'k', 'k']
- Now, reset the color palette to its default:
- >>> hierarchy.set_link_color_palette(None)
- """
- if palette is None:
- # reset to its default
- palette = _link_line_colors_default
- elif not isinstance(palette, list | tuple):
- raise TypeError("palette must be a list or tuple")
- _ptypes = [isinstance(p, str) for p in palette]
- if False in _ptypes:
- raise TypeError("all palette list elements must be color strings")
- global _link_line_colors
- _link_line_colors = palette
- @xp_capabilities(cpu_only=True, jax_jit=False, allow_dask_compute=True)
- def dendrogram(Z, p=30, truncate_mode=None, color_threshold=None,
- get_leaves=True, orientation='top', labels=None,
- count_sort=False, distance_sort=False, show_leaf_counts=True,
- no_plot=False, no_labels=False, leaf_font_size=None,
- leaf_rotation=None, leaf_label_func=None,
- show_contracted=False, link_color_func=None, ax=None,
- above_threshold_color='C0'):
- """
- Plot the hierarchical clustering as a dendrogram.
- The dendrogram illustrates how each cluster is
- composed by drawing a U-shaped link between a non-singleton
- cluster and its children. The top of the U-link indicates a
- cluster merge. The two legs of the U-link indicate which clusters
- were merged. The length of the two legs of the U-link represents
- the distance between the child clusters. It is also the
- cophenetic distance between original observations in the two
- children clusters.
- Parameters
- ----------
- Z : ndarray
- The linkage matrix encoding the hierarchical clustering to
- render as a dendrogram. See the ``linkage`` function for more
- information on the format of ``Z``.
- p : int, optional
- The ``p`` parameter for ``truncate_mode``.
- truncate_mode : str, optional
- The dendrogram can be hard to read when the original
- observation matrix from which the linkage is derived is
- large. Truncation is used to condense the dendrogram. There
- are several modes:
- ``None``
- No truncation is performed (default).
- Note: ``'none'`` is an alias for ``None`` that's kept for
- backward compatibility.
- ``'lastp'``
- The last ``p`` non-singleton clusters formed in the linkage are the
- only non-leaf nodes in the linkage; they correspond to rows
- ``Z[n-p-2:end]`` in ``Z``. All other non-singleton clusters are
- contracted into leaf nodes.
- ``'level'``
- No more than ``p`` levels of the dendrogram tree are displayed.
- A "level" includes all nodes with ``p`` merges from the final merge.
- Note: ``'mtica'`` is an alias for ``'level'`` that's kept for
- backward compatibility.
- color_threshold : double, optional
- For brevity, let :math:`t` be the ``color_threshold``.
- Colors all the descendent links below a cluster node
- :math:`k` the same color if :math:`k` is the first node below
- the cut threshold :math:`t`. All links connecting nodes with
- distances greater than or equal to the threshold are colored
- with de default matplotlib color ``'C0'``. If :math:`t` is less
- than or equal to zero, all nodes are colored ``'C0'``.
- If ``color_threshold`` is None or 'default',
- corresponding with MATLAB(TM) behavior, the threshold is set to
- ``0.7*max(Z[:,2])``.
- get_leaves : bool, optional
- Includes a list ``R['leaves']=H`` in the result
- dictionary. For each :math:`i`, ``H[i] == j``, cluster node
- ``j`` appears in position ``i`` in the left-to-right traversal
- of the leaves, where :math:`j < 2n-1` and :math:`i < n`.
- orientation : str, optional
- The direction to plot the dendrogram, which can be any
- of the following strings:
- ``'top'``
- Plots the root at the top, and plot descendent links going downwards.
- (default).
- ``'bottom'``
- Plots the root at the bottom, and plot descendent links going
- upwards.
- ``'left'``
- Plots the root at the left, and plot descendent links going right.
- ``'right'``
- Plots the root at the right, and plot descendent links going left.
- labels : ndarray, optional
- By default, ``labels`` is None so the index of the original observation
- is used to label the leaf nodes. Otherwise, this is an :math:`n`-sized
- sequence, with ``n == Z.shape[0] + 1``. The ``labels[i]`` value is the
- text to put under the :math:`i` th leaf node only if it corresponds to
- an original observation and not a non-singleton cluster.
- count_sort : str or bool, optional
- For each node n, the order (visually, from left-to-right) n's
- two descendent links are plotted is determined by this
- parameter, which can be any of the following values:
- ``False``
- Nothing is done.
- ``'ascending'`` or ``True``
- The child with the minimum number of original objects in its cluster
- is plotted first.
- ``'descending'``
- The child with the maximum number of original objects in its cluster
- is plotted first.
- Note, ``distance_sort`` and ``count_sort`` cannot both be True.
- distance_sort : str or bool, optional
- For each node n, the order (visually, from left-to-right) n's
- two descendent links are plotted is determined by this
- parameter, which can be any of the following values:
- ``False``
- Nothing is done.
- ``'ascending'`` or ``True``
- The child with the minimum distance between its direct descendents is
- plotted first.
- ``'descending'``
- The child with the maximum distance between its direct descendents is
- plotted first.
- Note ``distance_sort`` and ``count_sort`` cannot both be True.
- show_leaf_counts : bool, optional
- When True, leaf nodes representing :math:`k>1` original
- observation are labeled with the number of observations they
- contain in parentheses.
- no_plot : bool, optional
- When True, the final rendering is not performed. This is
- useful if only the data structures computed for the rendering
- are needed or if matplotlib is not available.
- no_labels : bool, optional
- When True, no labels appear next to the leaf nodes in the
- rendering of the dendrogram.
- leaf_rotation : double, optional
- Specifies the angle (in degrees) to rotate the leaf
- labels. When unspecified, the rotation is based on the number of
- nodes in the dendrogram (default is 0).
- leaf_font_size : int, optional
- Specifies the font size (in points) of the leaf labels. When
- unspecified, the size based on the number of nodes in the
- dendrogram.
- leaf_label_func : lambda or function, optional
- When ``leaf_label_func`` is a callable function, for each
- leaf with cluster index :math:`k < 2n-1`. The function
- is expected to return a string with the label for the
- leaf.
- Indices :math:`k < n` correspond to original observations
- while indices :math:`k \\geq n` correspond to non-singleton
- clusters.
- For example, to label singletons with their node id and
- non-singletons with their id, count, and inconsistency
- coefficient, simply do::
- # First define the leaf label function.
- def llf(id):
- if id < n:
- return str(id)
- else:
- return '[%d %d %1.2f]' % (id, count, R[n-id,3])
- # The text for the leaf nodes is going to be big so force
- # a rotation of 90 degrees.
- dendrogram(Z, leaf_label_func=llf, leaf_rotation=90)
- # leaf_label_func can also be used together with ``truncate_mode``,
- # in which case you will get your leaves labeled after truncation:
- dendrogram(Z, leaf_label_func=llf, leaf_rotation=90,
- truncate_mode='level', p=2)
- show_contracted : bool, optional
- When True the heights of non-singleton nodes contracted
- into a leaf node are plotted as crosses along the link
- connecting that leaf node. This really is only useful when
- truncation is used (see ``truncate_mode`` parameter).
- link_color_func : callable, optional
- If given, `link_color_function` is called with each non-singleton id
- corresponding to each U-shaped link it will paint. The function is
- expected to return the color to paint the link, encoded as a matplotlib
- color string code. For example::
- dendrogram(Z, link_color_func=lambda k: colors[k])
- colors the direct links below each untruncated non-singleton node
- ``k`` using ``colors[k]``.
- ax : matplotlib Axes instance, optional
- If None and `no_plot` is not True, the dendrogram will be plotted
- on the current axes. Otherwise if `no_plot` is not True the
- dendrogram will be plotted on the given ``Axes`` instance. This can be
- useful if the dendrogram is part of a more complex figure.
- above_threshold_color : str, optional
- This matplotlib color string sets the color of the links above the
- color_threshold. The default is ``'C0'``.
- Returns
- -------
- R : dict
- A dictionary of data structures computed to render the
- dendrogram. Its has the following keys:
- ``'color_list'``
- A list of color names. The k'th element represents the color of the
- k'th link.
- ``'icoord'`` and ``'dcoord'``
- Each of them is a list of lists. Let ``icoord = [I1, I2, ..., Ip]``
- where ``Ik = [xk1, xk2, xk3, xk4]`` and ``dcoord = [D1, D2, ..., Dp]``
- where ``Dk = [yk1, yk2, yk3, yk4]``, then the k'th link painted is
- ``(xk1, yk1)`` - ``(xk2, yk2)`` - ``(xk3, yk3)`` - ``(xk4, yk4)``.
- ``'ivl'``
- A list of labels corresponding to the leaf nodes.
- ``'leaves'``
- For each i, ``H[i] == j``, cluster node ``j`` appears in position
- ``i`` in the left-to-right traversal of the leaves, where
- :math:`j < 2n-1` and :math:`i < n`. If ``j`` is less than ``n``, the
- ``i``-th leaf node corresponds to an original observation.
- Otherwise, it corresponds to a non-singleton cluster.
- ``'leaves_color_list'``
- A list of color names. The k'th element represents the color of the
- k'th leaf.
- See Also
- --------
- linkage, set_link_color_palette
- Notes
- -----
- It is expected that the distances in ``Z[:,2]`` be monotonic, otherwise
- crossings appear in the dendrogram.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.cluster import hierarchy
- >>> import matplotlib.pyplot as plt
- A very basic example:
- >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
- ... 400., 754., 564., 138., 219., 869., 669.])
- >>> Z = hierarchy.linkage(ytdist, 'single')
- >>> plt.figure()
- >>> dn = hierarchy.dendrogram(Z)
- Now, plot in given axes, improve the color scheme and use both vertical and
- horizontal orientations:
- >>> hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
- >>> fig, axes = plt.subplots(1, 2, figsize=(8, 3))
- >>> dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y',
- ... orientation='top')
- >>> dn2 = hierarchy.dendrogram(Z, ax=axes[1],
- ... above_threshold_color='#bcbddc',
- ... orientation='right')
- >>> hierarchy.set_link_color_palette(None) # reset to default after use
- >>> plt.show()
- """
- # This feature was thought about but never implemented (still useful?):
- #
- # ... = dendrogram(..., leaves_order=None)
- #
- # Plots the leaves in the order specified by a vector of
- # original observation indices. If the vector contains duplicates
- # or results in a crossing, an exception will be thrown. Passing
- # None orders leaf nodes based on the order they appear in the
- # pre-order traversal.
- xp = array_namespace(Z)
- Z = _asarray(Z, order='C', xp=xp)
- if orientation not in ["top", "left", "bottom", "right"]:
- raise ValueError("orientation must be one of 'top', 'left', "
- "'bottom', or 'right'")
- if labels is not None:
- try:
- len_labels = len(labels)
- except (TypeError, AttributeError):
- len_labels = labels.shape[0]
- if Z.shape[0] + 1 != len_labels:
- raise ValueError("Dimensions of Z and labels must be consistent.")
- _is_valid_linkage(Z, throw=True, name='Z', materialize=True, xp=xp)
- Zs = Z.shape
- n = Zs[0] + 1
- if isinstance(p, int | float):
- p = int(p)
- else:
- raise TypeError('The second argument must be a number')
- if truncate_mode not in ('lastp', 'mtica', 'level', 'none', None):
- # 'mtica' is kept working for backwards compat.
- raise ValueError('Invalid truncation mode.')
- if truncate_mode == 'lastp':
- if p > n or p == 0:
- p = n
- if truncate_mode == 'mtica':
- # 'mtica' is an alias
- truncate_mode = 'level'
- if truncate_mode == 'level':
- if p <= 0:
- p = np.inf
- if get_leaves:
- lvs = []
- else:
- lvs = None
- icoord_list = []
- dcoord_list = []
- color_list = []
- current_color = [0]
- currently_below_threshold = [False]
- ivl = [] # list of leaves
- if color_threshold is None or (isinstance(color_threshold, str) and
- color_threshold == 'default'):
- color_threshold = xp.max(Z[:, 2]) * 0.7
- R = {'icoord': icoord_list, 'dcoord': dcoord_list, 'ivl': ivl,
- 'leaves': lvs, 'color_list': color_list}
- # Empty list will be filled in _dendrogram_calculate_info
- contraction_marks = [] if show_contracted else None
- _dendrogram_calculate_info(
- Z=Z, p=p,
- truncate_mode=truncate_mode,
- color_threshold=color_threshold,
- get_leaves=get_leaves,
- orientation=orientation,
- labels=labels,
- count_sort=count_sort,
- distance_sort=distance_sort,
- show_leaf_counts=show_leaf_counts,
- i=2*n - 2,
- iv=0.0,
- ivl=ivl,
- n=n,
- icoord_list=icoord_list,
- dcoord_list=dcoord_list,
- lvs=lvs,
- current_color=current_color,
- color_list=color_list,
- currently_below_threshold=currently_below_threshold,
- leaf_label_func=leaf_label_func,
- contraction_marks=contraction_marks,
- link_color_func=link_color_func,
- above_threshold_color=above_threshold_color)
- if not no_plot:
- mh = xp.max(Z[:, 2])
- _plot_dendrogram(icoord_list, dcoord_list, ivl, p, n, mh, orientation,
- no_labels, color_list,
- leaf_font_size=leaf_font_size,
- leaf_rotation=leaf_rotation,
- contraction_marks=contraction_marks,
- ax=ax,
- above_threshold_color=above_threshold_color)
- R["leaves_color_list"] = _get_leaves_color_list(R)
- return R
- def _get_leaves_color_list(R):
- leaves_color_list = [None] * len(R['leaves'])
- for link_x, link_y, link_color in zip(R['icoord'],
- R['dcoord'],
- R['color_list']):
- for (xi, yi) in zip(link_x, link_y):
- if yi == 0.0 and (xi % 5 == 0 and xi % 2 == 1):
- # if yi is 0.0 and xi is divisible by 5 and odd,
- # the point is a leaf
- # xi of leaves are 5, 15, 25, 35, ... (see `iv_ticks`)
- # index of leaves are 0, 1, 2, 3, ... as below
- leaf_index = (int(xi) - 5) // 10
- # each leaf has a same color of its link.
- leaves_color_list[leaf_index] = link_color
- return leaves_color_list
- def _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
- i, labels):
- # If the leaf id structure is not None and is a list then the caller
- # to dendrogram has indicated that cluster id's corresponding to the
- # leaf nodes should be recorded.
- if lvs is not None:
- lvs.append(int(i))
- # If leaf node labels are to be displayed...
- if ivl is not None:
- # If a leaf_label_func has been provided, the label comes from the
- # string returned from the leaf_label_func, which is a function
- # passed to dendrogram.
- if leaf_label_func:
- ivl.append(leaf_label_func(int(i)))
- else:
- # Otherwise, if the dendrogram caller has passed a labels list
- # for the leaf nodes, use it.
- if labels is not None:
- ivl.append(labels[int(i - n)])
- else:
- # Otherwise, use the id as the label for the leaf.x
- ivl.append(str(int(i)))
- def _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
- i, labels, show_leaf_counts):
- # If the leaf id structure is not None and is a list then the caller
- # to dendrogram has indicated that cluster id's corresponding to the
- # leaf nodes should be recorded.
- if lvs is not None:
- lvs.append(int(i))
- if ivl is not None:
- if leaf_label_func:
- ivl.append(leaf_label_func(int(i)))
- else:
- if show_leaf_counts:
- ivl.append("(" + str(np.asarray(Z[i - n, 3], dtype=np.int64)) + ")")
- else:
- ivl.append("")
- def _append_contraction_marks(Z, iv, i, n, contraction_marks, xp):
- _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 0], xp),
- n, contraction_marks, xp)
- _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 1], xp),
- n, contraction_marks, xp)
- def _append_contraction_marks_sub(Z, iv, i, n, contraction_marks, xp):
- if i >= n:
- contraction_marks.append((iv, Z[i - n, 2]))
- _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 0], xp),
- n, contraction_marks, xp)
- _append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 1], xp),
- n, contraction_marks, xp)
- def _dendrogram_calculate_info(Z, p, truncate_mode,
- color_threshold=np.inf, get_leaves=True,
- orientation='top', labels=None,
- count_sort=False, distance_sort=False,
- show_leaf_counts=False, i=-1, iv=0.0,
- ivl=None, n=0, icoord_list=None, dcoord_list=None,
- lvs=None, mhr=False,
- current_color=None, color_list=None,
- currently_below_threshold=None,
- leaf_label_func=None, level=0,
- contraction_marks=None,
- link_color_func=None,
- above_threshold_color='C0'):
- """
- Calculate the endpoints of the links as well as the labels for the
- the dendrogram rooted at the node with index i. iv is the independent
- variable value to plot the left-most leaf node below the root node i
- (if orientation='top', this would be the left-most x value where the
- plotting of this root node i and its descendents should begin).
- ivl is a list to store the labels of the leaf nodes. The leaf_label_func
- is called whenever ivl != None, labels == None, and
- leaf_label_func != None. When ivl != None and labels != None, the
- labels list is used only for labeling the leaf nodes. When
- ivl == None, no labels are generated for leaf nodes.
- When get_leaves==True, a list of leaves is built as they are visited
- in the dendrogram.
- Returns a tuple with l being the independent variable coordinate that
- corresponds to the midpoint of cluster to the left of cluster i if
- i is non-singleton, otherwise the independent coordinate of the leaf
- node if i is a leaf node.
- Returns
- -------
- A tuple (left, w, h, md), where:
- * left is the independent variable coordinate of the center of the
- the U of the subtree
- * w is the amount of space used for the subtree (in independent
- variable units)
- * h is the height of the subtree in dependent variable units
- * md is the ``max(Z[*,2]``) for all nodes ``*`` below and including
- the target node.
- """
- xp = array_namespace(Z)
- if n == 0:
- raise ValueError("Invalid singleton cluster count n.")
- if i == -1:
- raise ValueError("Invalid root cluster index i.")
- if truncate_mode == 'lastp':
- # If the node is a leaf node but corresponds to a non-singleton
- # cluster, its label is either the empty string or the number of
- # original observations belonging to cluster i.
- if 2*n - p > i >= n:
- d = Z[i - n, 2]
- _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
- leaf_label_func, i, labels,
- show_leaf_counts)
- if contraction_marks is not None:
- _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks, xp)
- return (iv + 5.0, 10.0, 0.0, d)
- elif i < n:
- _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
- leaf_label_func, i, labels)
- return (iv + 5.0, 10.0, 0.0, 0.0)
- elif truncate_mode == 'level':
- if i > n and level > p:
- d = Z[i - n, 2]
- _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
- leaf_label_func, i, labels,
- show_leaf_counts)
- if contraction_marks is not None:
- _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks, xp)
- return (iv + 5.0, 10.0, 0.0, d)
- elif i < n:
- _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
- leaf_label_func, i, labels)
- return (iv + 5.0, 10.0, 0.0, 0.0)
- # Otherwise, only truncate if we have a leaf node.
- #
- # Only place leaves if they correspond to original observations.
- if i < n:
- _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
- leaf_label_func, i, labels)
- return (iv + 5.0, 10.0, 0.0, 0.0)
- # !!! Otherwise, we don't have a leaf node, so work on plotting a
- # non-leaf node.
- # Actual indices of a and b
- aa = int_floor(Z[i - n, 0], xp)
- ab = int_floor(Z[i - n, 1], xp)
- if aa >= n:
- # The number of singletons below cluster a
- na = Z[aa - n, 3]
- # The distance between a's two direct children.
- da = Z[aa - n, 2]
- else:
- na = 1
- da = 0.0
- if ab >= n:
- nb = Z[ab - n, 3]
- db = Z[ab - n, 2]
- else:
- nb = 1
- db = 0.0
- if count_sort == 'ascending' or count_sort is True:
- # If a has a count greater than b, it and its descendents should
- # be drawn to the right. Otherwise, to the left.
- if na > nb:
- # The cluster index to draw to the left (ua) will be ab
- # and the one to draw to the right (ub) will be aa
- ua = ab
- ub = aa
- else:
- ua = aa
- ub = ab
- elif count_sort == 'descending':
- # If a has a count less than or equal to b, it and its
- # descendents should be drawn to the left. Otherwise, to
- # the right.
- if na > nb:
- ua = aa
- ub = ab
- else:
- ua = ab
- ub = aa
- elif distance_sort == 'ascending' or distance_sort is True:
- # If a has a distance greater than b, it and its descendents should
- # be drawn to the right. Otherwise, to the left.
- if da > db:
- ua = ab
- ub = aa
- else:
- ua = aa
- ub = ab
- elif distance_sort == 'descending':
- # If a has a distance less than or equal to b, it and its
- # descendents should be drawn to the left. Otherwise, to
- # the right.
- if da > db:
- ua = aa
- ub = ab
- else:
- ua = ab
- ub = aa
- else:
- ua = aa
- ub = ab
- # Updated iv variable and the amount of space used.
- (uiva, uwa, uah, uamd) = \
- _dendrogram_calculate_info(
- Z=Z, p=p,
- truncate_mode=truncate_mode,
- color_threshold=color_threshold,
- get_leaves=get_leaves,
- orientation=orientation,
- labels=labels,
- count_sort=count_sort,
- distance_sort=distance_sort,
- show_leaf_counts=show_leaf_counts,
- i=ua, iv=iv, ivl=ivl, n=n,
- icoord_list=icoord_list,
- dcoord_list=dcoord_list, lvs=lvs,
- current_color=current_color,
- color_list=color_list,
- currently_below_threshold=currently_below_threshold,
- leaf_label_func=leaf_label_func,
- level=level + 1, contraction_marks=contraction_marks,
- link_color_func=link_color_func,
- above_threshold_color=above_threshold_color)
- h = Z[i - n, 2]
- if h >= color_threshold or color_threshold <= 0:
- c = above_threshold_color
- if currently_below_threshold[0]:
- current_color[0] = (current_color[0] + 1) % len(_link_line_colors)
- currently_below_threshold[0] = False
- else:
- currently_below_threshold[0] = True
- c = _link_line_colors[current_color[0]]
- (uivb, uwb, ubh, ubmd) = \
- _dendrogram_calculate_info(
- Z=Z, p=p,
- truncate_mode=truncate_mode,
- color_threshold=color_threshold,
- get_leaves=get_leaves,
- orientation=orientation,
- labels=labels,
- count_sort=count_sort,
- distance_sort=distance_sort,
- show_leaf_counts=show_leaf_counts,
- i=ub, iv=iv + uwa, ivl=ivl, n=n,
- icoord_list=icoord_list,
- dcoord_list=dcoord_list, lvs=lvs,
- current_color=current_color,
- color_list=color_list,
- currently_below_threshold=currently_below_threshold,
- leaf_label_func=leaf_label_func,
- level=level + 1, contraction_marks=contraction_marks,
- link_color_func=link_color_func,
- above_threshold_color=above_threshold_color)
- max_dist = max(uamd, ubmd, h)
- icoord_list.append([uiva, uiva, uivb, uivb])
- dcoord_list.append([uah, h, h, ubh])
- if link_color_func is not None:
- v = link_color_func(int(i))
- if not isinstance(v, str):
- raise TypeError("link_color_func must return a matplotlib "
- "color string!")
- color_list.append(v)
- else:
- color_list.append(c)
- return (((uiva + uivb) / 2), uwa + uwb, h, max_dist)
- @xp_capabilities()
- def is_isomorphic(T1, T2):
- """
- Determine if two different cluster assignments are equivalent.
- Parameters
- ----------
- T1 : array_like
- An assignment of singleton cluster ids to flat cluster ids.
- T2 : array_like
- An assignment of singleton cluster ids to flat cluster ids.
- Returns
- -------
- b : bool
- Whether the flat cluster assignments `T1` and `T2` are
- equivalent.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- fcluster : for the creation of flat cluster assignments.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import fcluster, is_isomorphic
- >>> from scipy.cluster.hierarchy import single, complete
- >>> from scipy.spatial.distance import pdist
- Two flat cluster assignments can be isomorphic if they represent the same
- cluster assignment, with different labels.
- For example, we can use the `scipy.cluster.hierarchy.single` method
- and flatten the output to four clusters:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = single(pdist(X))
- >>> T = fcluster(Z, 1, criterion='distance')
- >>> T
- array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
- We can then do the same using the
- `scipy.cluster.hierarchy.complete`: method:
- >>> Z = complete(pdist(X))
- >>> T_ = fcluster(Z, 1.5, criterion='distance')
- >>> T_
- array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
- As we can see, in both cases we obtain four clusters and all the data
- points are distributed in the same way - the only thing that changes
- are the flat cluster labels (3 => 1, 4 =>2, 2 =>3 and 4 =>1), so both
- cluster assignments are isomorphic:
- >>> is_isomorphic(T, T_)
- True
- """
- xp = array_namespace(T1, T2)
- T1 = _asarray(T1, xp=xp)
- T2 = _asarray(T2, xp=xp)
- if T1.ndim != 1:
- raise ValueError('T1 must be one-dimensional.')
- if T2.ndim != 1:
- raise ValueError('T2 must be one-dimensional.')
- if T1.shape != T2.shape:
- raise ValueError('T1 and T2 must have the same number of elements.')
- # For each pair of (i, j) indices, test that
- # T1[i] == T1[j] <--> T2[i] == T2[j]
- # In other words, if the same symbol appears multiple times in T1,
- # then a potentially different symbol must also be repeated in the same
- # positions in T2, and vice versa.
- # O(n*log(n)) algorithm.
- # It is also possible to write a O(n) algorithm on top of unique_all(),
- # but in practice it's much slower even for large clusters.
- idx = xp.argsort(T1)
- T1 = xp.take(T1, idx)
- T2 = xp.take(T2, idx)
- changes1 = T1 != xp.roll(T1, -1)
- changes2 = T2 != xp.roll(T2, -1)
- return xp.all(changes1 == changes2)
- @lazy_cython
- def maxdists(Z):
- """
- Return the maximum distance between any non-singleton cluster.
- Parameters
- ----------
- Z : ndarray
- The hierarchical clustering encoded as a matrix. See
- ``linkage`` for more information.
- Returns
- -------
- maxdists : ndarray
- A ``(n-1)`` sized numpy array of doubles; ``MD[i]`` represents
- the maximum distance between any cluster (including
- singletons) below and including the node with index i. More
- specifically, ``MD[i] = Z[Q(i)-n, 2].max()`` where ``Q(i)`` is the
- set of all node indices below and including node i.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- is_monotonic : for testing for monotonicity of a linkage matrix.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import median, maxdists
- >>> from scipy.spatial.distance import pdist
- Given a linkage matrix ``Z``, `scipy.cluster.hierarchy.maxdists`
- computes for each new cluster generated (i.e., for each row of the linkage
- matrix) what is the maximum distance between any two child clusters.
- Due to the nature of hierarchical clustering, in many cases this is going
- to be just the distance between the two child clusters that were merged
- to form the current one - that is, Z[:,2].
- However, for non-monotonic cluster assignments such as
- `scipy.cluster.hierarchy.median` clustering this is not always the
- case: There may be cluster formations were the distance between the two
- clusters merged is smaller than the distance between their children.
- We can see this in an example:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = median(pdist(X))
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 2. , 12. , 1.11803399, 3. ],
- [ 5. , 13. , 1.11803399, 3. ],
- [ 8. , 15. , 1.11803399, 3. ],
- [11. , 14. , 1.11803399, 3. ],
- [18. , 19. , 3. , 6. ],
- [16. , 17. , 3.5 , 6. ],
- [20. , 21. , 3.25 , 12. ]])
- >>> maxdists(Z)
- array([1. , 1. , 1. , 1. , 1.11803399,
- 1.11803399, 1.11803399, 1.11803399, 3. , 3.5 ,
- 3.5 ])
- Note that while the distance between the two clusters merged when creating the
- last cluster is 3.25, there are two children (clusters 16 and 17) whose distance
- is larger (3.5). Thus, `scipy.cluster.hierarchy.maxdists` returns 3.5 in
- this case.
- """
- xp = array_namespace(Z)
- Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- def cy_maxdists(Z, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- MD = np.zeros((Z.shape[0],))
- _hierarchy.get_max_dist_for_each_cluster(Z, MD, Z.shape[0] + 1)
- return MD
- return xpx.lazy_apply(cy_maxdists, Z, validate=is_lazy_array(Z),
- shape=(Z.shape[0], ), dtype=xp.float64,
- as_numpy=True, xp=xp)
- @lazy_cython
- def maxinconsts(Z, R):
- """
- Return the maximum inconsistency coefficient for each
- non-singleton cluster and its children.
- Parameters
- ----------
- Z : ndarray
- The hierarchical clustering encoded as a matrix. See
- `linkage` for more information.
- R : ndarray
- The inconsistency matrix.
- Returns
- -------
- MI : ndarray
- A monotonic ``(n-1)``-sized numpy array of doubles.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- inconsistent : for the creation of a inconsistency matrix.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import median, inconsistent, maxinconsts
- >>> from scipy.spatial.distance import pdist
- Given a data set ``X``, we can apply a clustering method to obtain a
- linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
- be also used to obtain the inconsistency matrix ``R`` associated to
- this clustering process:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = median(pdist(X))
- >>> R = inconsistent(Z)
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 2. , 12. , 1.11803399, 3. ],
- [ 5. , 13. , 1.11803399, 3. ],
- [ 8. , 15. , 1.11803399, 3. ],
- [11. , 14. , 1.11803399, 3. ],
- [18. , 19. , 3. , 6. ],
- [16. , 17. , 3.5 , 6. ],
- [20. , 21. , 3.25 , 12. ]])
- >>> R
- array([[1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.74535599, 1.08655358, 3. , 1.15470054],
- [1.91202266, 1.37522872, 3. , 1.15470054],
- [3.25 , 0.25 , 3. , 0. ]])
- Here, `scipy.cluster.hierarchy.maxinconsts` can be used to compute
- the maximum value of the inconsistency statistic (the last column of
- ``R``) for each non-singleton cluster and its children:
- >>> maxinconsts(Z, R)
- array([0. , 0. , 0. , 0. , 0.70710678,
- 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
- 1.15470054])
- """
- xp = array_namespace(Z, R)
- Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
- R = _asarray(R, order='C', dtype=xp.float64, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- _is_valid_im(R, throw=True, name='R', xp=xp)
- if Z.shape[0] != R.shape[0]:
- raise ValueError("The inconsistency matrix and linkage matrix each "
- "have a different number of rows.")
-
- def cy_maxinconsts(Z, R, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- _is_valid_im(R, throw=True, name='R', xp=np)
- n = Z.shape[0] + 1
- MI = np.zeros((n - 1,))
- _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MI, n, 3)
- return MI
- return xpx.lazy_apply(cy_maxinconsts, Z, R, validate=is_lazy_array(Z),
- shape=(Z.shape[0], ), dtype=xp.float64,
- as_numpy=True, xp=xp)
- @lazy_cython
- def maxRstat(Z, R, i):
- """
- Return the maximum statistic for each non-singleton cluster and its
- children.
- Parameters
- ----------
- Z : array_like
- The hierarchical clustering encoded as a matrix. See `linkage` for more
- information.
- R : array_like
- The inconsistency matrix.
- i : int
- The column of `R` to use as the statistic.
- Returns
- -------
- MR : ndarray
- Calculates the maximum statistic for the i'th column of the
- inconsistency matrix `R` for each non-singleton cluster
- node. ``MR[j]`` is the maximum over ``R[Q(j)-n, i]``, where
- ``Q(j)`` the set of all node ids corresponding to nodes below
- and including ``j``.
- See Also
- --------
- linkage : for a description of what a linkage matrix is.
- inconsistent : for the creation of a inconsistency matrix.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import median, inconsistent, maxRstat
- >>> from scipy.spatial.distance import pdist
- Given a data set ``X``, we can apply a clustering method to obtain a
- linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
- be also used to obtain the inconsistency matrix ``R`` associated to
- this clustering process:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = median(pdist(X))
- >>> R = inconsistent(Z)
- >>> R
- array([[1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1. , 0. , 1. , 0. ],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.05901699, 0.08346263, 2. , 0.70710678],
- [1.74535599, 1.08655358, 3. , 1.15470054],
- [1.91202266, 1.37522872, 3. , 1.15470054],
- [3.25 , 0.25 , 3. , 0. ]])
- `scipy.cluster.hierarchy.maxRstat` can be used to compute
- the maximum value of each column of ``R``, for each non-singleton
- cluster and its children:
- >>> maxRstat(Z, R, 0)
- array([1. , 1. , 1. , 1. , 1.05901699,
- 1.05901699, 1.05901699, 1.05901699, 1.74535599, 1.91202266,
- 3.25 ])
- >>> maxRstat(Z, R, 1)
- array([0. , 0. , 0. , 0. , 0.08346263,
- 0.08346263, 0.08346263, 0.08346263, 1.08655358, 1.37522872,
- 1.37522872])
- >>> maxRstat(Z, R, 3)
- array([0. , 0. , 0. , 0. , 0.70710678,
- 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
- 1.15470054])
- """
- xp = array_namespace(Z, R)
- Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
- R = _asarray(R, order='C', dtype=xp.float64, xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- _is_valid_im(R, throw=True, name='R', xp=xp)
- if not isinstance(i, int):
- raise TypeError('The third argument must be an integer.')
- if i < 0 or i > 3:
- raise ValueError('i must be an integer between 0 and 3 inclusive.')
- if Z.shape[0] != R.shape[0]:
- raise ValueError("The inconsistency matrix and linkage matrix each "
- "have a different number of rows.")
- def cy_maxRstat(Z, R, i, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- _is_valid_im(R, throw=True, name='R', xp=np)
- MR = np.zeros((Z.shape[0],))
- n = Z.shape[0] + 1
- _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MR, n, i)
- return MR
- return xpx.lazy_apply(cy_maxRstat, Z, R, i=i, validate=is_lazy_array(Z),
- shape=(Z.shape[0], ), dtype=xp.float64,
- as_numpy=True, xp=xp)
- # Data-dependent output shape makes it impossible to use jax.jit
- @xp_capabilities(cpu_only=True, reason="Cython code", jax_jit=False,
- warnings=[("dask.array", "merges chunks")])
- def leaders(Z, T):
- """
- Return the root nodes in a hierarchical clustering.
- Returns the root nodes in a hierarchical clustering corresponding
- to a cut defined by a flat cluster assignment vector ``T``. See
- the ``fcluster`` function for more information on the format of ``T``.
- For each flat cluster :math:`j` of the :math:`k` flat clusters
- represented in the n-sized flat cluster assignment vector ``T``,
- this function finds the lowest cluster node :math:`i` in the linkage
- tree Z, such that:
- * leaf descendants belong only to flat cluster j
- (i.e., ``T[p]==j`` for all :math:`p` in :math:`S(i)`, where
- :math:`S(i)` is the set of leaf ids of descendant leaf nodes
- with cluster node :math:`i`)
- * there does not exist a leaf that is not a descendant with
- :math:`i` that also belongs to cluster :math:`j`
- (i.e., ``T[q]!=j`` for all :math:`q` not in :math:`S(i)`). If
- this condition is violated, ``T`` is not a valid cluster
- assignment vector, and an exception will be thrown.
- Parameters
- ----------
- Z : ndarray
- The hierarchical clustering encoded as a matrix. See
- `linkage` for more information.
- T : ndarray
- The flat cluster assignment vector.
- Returns
- -------
- L : ndarray
- The leader linkage node id's stored as a k-element 1-D array,
- where ``k`` is the number of flat clusters found in ``T``.
- ``L[j]=i`` is the linkage cluster node id that is the
- leader of flat cluster with id M[j]. If ``i < n``, ``i``
- corresponds to an original observation, otherwise it
- corresponds to a non-singleton cluster.
- M : ndarray
- The leader linkage node id's stored as a k-element 1-D array, where
- ``k`` is the number of flat clusters found in ``T``. This allows the
- set of flat cluster ids to be any arbitrary set of ``k`` integers.
- For example: if ``L[3]=2`` and ``M[3]=8``, the flat cluster with
- id 8's leader is linkage node 2.
- See Also
- --------
- fcluster : for the creation of flat cluster assignments.
- Examples
- --------
- >>> from scipy.cluster.hierarchy import ward, fcluster, leaders
- >>> from scipy.spatial.distance import pdist
- Given a linkage matrix ``Z`` - obtained after apply a clustering method
- to a dataset ``X`` - and a flat cluster assignment array ``T``:
- >>> X = [[0, 0], [0, 1], [1, 0],
- ... [0, 4], [0, 3], [1, 4],
- ... [4, 0], [3, 0], [4, 1],
- ... [4, 4], [3, 4], [4, 3]]
- >>> Z = ward(pdist(X))
- >>> Z
- array([[ 0. , 1. , 1. , 2. ],
- [ 3. , 4. , 1. , 2. ],
- [ 6. , 7. , 1. , 2. ],
- [ 9. , 10. , 1. , 2. ],
- [ 2. , 12. , 1.29099445, 3. ],
- [ 5. , 13. , 1.29099445, 3. ],
- [ 8. , 14. , 1.29099445, 3. ],
- [11. , 15. , 1.29099445, 3. ],
- [16. , 17. , 5.77350269, 6. ],
- [18. , 19. , 5.77350269, 6. ],
- [20. , 21. , 8.16496581, 12. ]])
- >>> T = fcluster(Z, 3, criterion='distance')
- >>> T
- array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
- `scipy.cluster.hierarchy.leaders` returns the indices of the nodes
- in the dendrogram that are the leaders of each flat cluster:
- >>> L, M = leaders(Z, T)
- >>> L
- array([16, 17, 18, 19], dtype=int32)
- (remember that indices 0-11 point to the 12 data points in ``X``,
- whereas indices 12-22 point to the 11 rows of ``Z``)
- `scipy.cluster.hierarchy.leaders` also returns the indices of
- the flat clusters in ``T``:
- >>> M
- array([1, 2, 3, 4], dtype=int32)
- Notes
- -----
- *Array API support (experimental):* This function returns arrays
- with data-dependent shape. In JAX, at the moment of writing this makes it
- impossible to execute it inside `@jax.jit`.
- """
- xp = array_namespace(Z, T)
- Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
- T = _asarray(T, order='C', xp=xp)
- _is_valid_linkage(Z, throw=True, name='Z', xp=xp)
- if T.dtype != xp.int32:
- raise TypeError('T must be a 1-D array of dtype int32.')
- if T.shape[0] != Z.shape[0] + 1:
- raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.')
- n_obs = Z.shape[0] + 1
- def cy_leaders(Z, T, validate):
- if validate:
- _is_valid_linkage(Z, throw=True, name='Z', xp=np)
- n_clusters = int(xpx.nunique(T))
- L = np.zeros(n_clusters, dtype=np.int32)
- M = np.zeros(n_clusters, dtype=np.int32)
- s = _hierarchy.leaders(Z, T, L, M, n_clusters, n_obs)
- if s >= 0:
- raise ValueError('T is not a valid assignment vector. Error found '
- f'when examining linkage node {s} (< 2n-1).')
- return L, M
- return xpx.lazy_apply(cy_leaders, Z, T, validate=is_lazy_array(Z),
- shape=((None,), (None, )), dtype=(xp.int32, xp.int32),
- as_numpy=True, xp=xp)
|