similarity.py 71 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107
  1. """Functions measuring similarity using graph edit distance.
  2. The graph edit distance is the number of edge/node changes needed
  3. to make two graphs isomorphic.
  4. The default algorithm/implementation is sub-optimal for some graphs.
  5. The problem of finding the exact Graph Edit Distance (GED) is NP-hard
  6. so it is often slow. If the simple interface `graph_edit_distance`
  7. takes too long for your graph, try `optimize_graph_edit_distance`
  8. and/or `optimize_edit_paths`.
  9. At the same time, I encourage capable people to investigate
  10. alternative GED algorithms, in order to improve the choices available.
  11. """
  12. import math
  13. import time
  14. from dataclasses import dataclass
  15. from itertools import product
  16. import networkx as nx
  17. from networkx.utils import np_random_state
  18. __all__ = [
  19. "graph_edit_distance",
  20. "optimal_edit_paths",
  21. "optimize_graph_edit_distance",
  22. "optimize_edit_paths",
  23. "simrank_similarity",
  24. "panther_similarity",
  25. "panther_vector_similarity",
  26. "generate_random_paths",
  27. ]
  28. @nx._dispatchable(
  29. graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
  30. )
  31. def graph_edit_distance(
  32. G1,
  33. G2,
  34. node_match=None,
  35. edge_match=None,
  36. node_subst_cost=None,
  37. node_del_cost=None,
  38. node_ins_cost=None,
  39. edge_subst_cost=None,
  40. edge_del_cost=None,
  41. edge_ins_cost=None,
  42. roots=None,
  43. upper_bound=None,
  44. timeout=None,
  45. ):
  46. """Returns GED (graph edit distance) between graphs G1 and G2.
  47. Graph edit distance is a graph similarity measure analogous to
  48. Levenshtein distance for strings. It is defined as minimum cost
  49. of edit path (sequence of node and edge edit operations)
  50. transforming graph G1 to graph isomorphic to G2.
  51. Parameters
  52. ----------
  53. G1, G2: graphs
  54. The two graphs G1 and G2 must be of the same type.
  55. node_match : callable
  56. A function that returns True if node n1 in G1 and n2 in G2
  57. should be considered equal during matching.
  58. The function will be called like
  59. node_match(G1.nodes[n1], G2.nodes[n2]).
  60. That is, the function will receive the node attribute
  61. dictionaries for n1 and n2 as inputs.
  62. Ignored if node_subst_cost is specified. If neither
  63. node_match nor node_subst_cost are specified then node
  64. attributes are not considered.
  65. edge_match : callable
  66. A function that returns True if the edge attribute dictionaries
  67. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  68. be considered equal during matching.
  69. The function will be called like
  70. edge_match(G1[u1][v1], G2[u2][v2]).
  71. That is, the function will receive the edge attribute
  72. dictionaries of the edges under consideration.
  73. Ignored if edge_subst_cost is specified. If neither
  74. edge_match nor edge_subst_cost are specified then edge
  75. attributes are not considered.
  76. node_subst_cost, node_del_cost, node_ins_cost : callable
  77. Functions that return the costs of node substitution, node
  78. deletion, and node insertion, respectively.
  79. The functions will be called like
  80. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  81. node_del_cost(G1.nodes[n1]),
  82. node_ins_cost(G2.nodes[n2]).
  83. That is, the functions will receive the node attribute
  84. dictionaries as inputs. The functions are expected to return
  85. positive numeric values.
  86. Function node_subst_cost overrides node_match if specified.
  87. If neither node_match nor node_subst_cost are specified then
  88. default node substitution cost of 0 is used (node attributes
  89. are not considered during matching).
  90. If node_del_cost is not specified then default node deletion
  91. cost of 1 is used. If node_ins_cost is not specified then
  92. default node insertion cost of 1 is used.
  93. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  94. Functions that return the costs of edge substitution, edge
  95. deletion, and edge insertion, respectively.
  96. The functions will be called like
  97. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  98. edge_del_cost(G1[u1][v1]),
  99. edge_ins_cost(G2[u2][v2]).
  100. That is, the functions will receive the edge attribute
  101. dictionaries as inputs. The functions are expected to return
  102. positive numeric values.
  103. Function edge_subst_cost overrides edge_match if specified.
  104. If neither edge_match nor edge_subst_cost are specified then
  105. default edge substitution cost of 0 is used (edge attributes
  106. are not considered during matching).
  107. If edge_del_cost is not specified then default edge deletion
  108. cost of 1 is used. If edge_ins_cost is not specified then
  109. default edge insertion cost of 1 is used.
  110. roots : 2-tuple
  111. Tuple where first element is a node in G1 and the second
  112. is a node in G2.
  113. These nodes are forced to be matched in the comparison to
  114. allow comparison between rooted graphs.
  115. upper_bound : numeric
  116. Maximum edit distance to consider. Return None if no edit
  117. distance under or equal to upper_bound exists.
  118. timeout : numeric
  119. Maximum number of seconds to execute.
  120. After timeout is met, the current best GED is returned.
  121. Examples
  122. --------
  123. >>> G1 = nx.cycle_graph(6)
  124. >>> G2 = nx.wheel_graph(7)
  125. >>> nx.graph_edit_distance(G1, G2)
  126. 7.0
  127. >>> G1 = nx.star_graph(5)
  128. >>> G2 = nx.star_graph(5)
  129. >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
  130. 0.0
  131. >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
  132. 8.0
  133. See Also
  134. --------
  135. optimal_edit_paths, optimize_graph_edit_distance,
  136. is_isomorphic: test for graph edit distance of 0
  137. References
  138. ----------
  139. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  140. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  141. Pattern Recognition Problems. 4th International Conference on
  142. Pattern Recognition Applications and Methods 2015, Jan 2015,
  143. Lisbon, Portugal. 2015,
  144. <10.5220/0005209202710278>. <hal-01168816>
  145. https://hal.archives-ouvertes.fr/hal-01168816
  146. """
  147. bestcost = None
  148. for _, _, cost in optimize_edit_paths(
  149. G1,
  150. G2,
  151. node_match,
  152. edge_match,
  153. node_subst_cost,
  154. node_del_cost,
  155. node_ins_cost,
  156. edge_subst_cost,
  157. edge_del_cost,
  158. edge_ins_cost,
  159. upper_bound,
  160. True,
  161. roots,
  162. timeout,
  163. ):
  164. # assert bestcost is None or cost < bestcost
  165. bestcost = cost
  166. return bestcost
  167. @nx._dispatchable(graphs={"G1": 0, "G2": 1})
  168. def optimal_edit_paths(
  169. G1,
  170. G2,
  171. node_match=None,
  172. edge_match=None,
  173. node_subst_cost=None,
  174. node_del_cost=None,
  175. node_ins_cost=None,
  176. edge_subst_cost=None,
  177. edge_del_cost=None,
  178. edge_ins_cost=None,
  179. upper_bound=None,
  180. ):
  181. """Returns all minimum-cost edit paths transforming G1 to G2.
  182. Graph edit path is a sequence of node and edge edit operations
  183. transforming graph G1 to graph isomorphic to G2. Edit operations
  184. include substitutions, deletions, and insertions.
  185. Parameters
  186. ----------
  187. G1, G2: graphs
  188. The two graphs G1 and G2 must be of the same type.
  189. node_match : callable
  190. A function that returns True if node n1 in G1 and n2 in G2
  191. should be considered equal during matching.
  192. The function will be called like
  193. node_match(G1.nodes[n1], G2.nodes[n2]).
  194. That is, the function will receive the node attribute
  195. dictionaries for n1 and n2 as inputs.
  196. Ignored if node_subst_cost is specified. If neither
  197. node_match nor node_subst_cost are specified then node
  198. attributes are not considered.
  199. edge_match : callable
  200. A function that returns True if the edge attribute dictionaries
  201. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  202. be considered equal during matching.
  203. The function will be called like
  204. edge_match(G1[u1][v1], G2[u2][v2]).
  205. That is, the function will receive the edge attribute
  206. dictionaries of the edges under consideration.
  207. Ignored if edge_subst_cost is specified. If neither
  208. edge_match nor edge_subst_cost are specified then edge
  209. attributes are not considered.
  210. node_subst_cost, node_del_cost, node_ins_cost : callable
  211. Functions that return the costs of node substitution, node
  212. deletion, and node insertion, respectively.
  213. The functions will be called like
  214. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  215. node_del_cost(G1.nodes[n1]),
  216. node_ins_cost(G2.nodes[n2]).
  217. That is, the functions will receive the node attribute
  218. dictionaries as inputs. The functions are expected to return
  219. positive numeric values.
  220. Function node_subst_cost overrides node_match if specified.
  221. If neither node_match nor node_subst_cost are specified then
  222. default node substitution cost of 0 is used (node attributes
  223. are not considered during matching).
  224. If node_del_cost is not specified then default node deletion
  225. cost of 1 is used. If node_ins_cost is not specified then
  226. default node insertion cost of 1 is used.
  227. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  228. Functions that return the costs of edge substitution, edge
  229. deletion, and edge insertion, respectively.
  230. The functions will be called like
  231. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  232. edge_del_cost(G1[u1][v1]),
  233. edge_ins_cost(G2[u2][v2]).
  234. That is, the functions will receive the edge attribute
  235. dictionaries as inputs. The functions are expected to return
  236. positive numeric values.
  237. Function edge_subst_cost overrides edge_match if specified.
  238. If neither edge_match nor edge_subst_cost are specified then
  239. default edge substitution cost of 0 is used (edge attributes
  240. are not considered during matching).
  241. If edge_del_cost is not specified then default edge deletion
  242. cost of 1 is used. If edge_ins_cost is not specified then
  243. default edge insertion cost of 1 is used.
  244. upper_bound : numeric
  245. Maximum edit distance to consider.
  246. Returns
  247. -------
  248. edit_paths : list of tuples (node_edit_path, edge_edit_path)
  249. - node_edit_path : list of tuples ``(u, v)`` indicating node transformations
  250. between `G1` and `G2`. ``u`` is `None` for insertion, ``v`` is `None`
  251. for deletion.
  252. - edge_edit_path : list of tuples ``((u1, v1), (u2, v2))`` indicating edge
  253. transformations between `G1` and `G2`. ``(None, (u2,v2))`` for insertion
  254. and ``((u1,v1), None)`` for deletion.
  255. cost : numeric
  256. Optimal edit path cost (graph edit distance). When the cost
  257. is zero, it indicates that `G1` and `G2` are isomorphic.
  258. Examples
  259. --------
  260. >>> G1 = nx.cycle_graph(4)
  261. >>> G2 = nx.wheel_graph(5)
  262. >>> paths, cost = nx.optimal_edit_paths(G1, G2)
  263. >>> len(paths)
  264. 40
  265. >>> cost
  266. 5.0
  267. Notes
  268. -----
  269. To transform `G1` into a graph isomorphic to `G2`, apply the node
  270. and edge edits in the returned ``edit_paths``.
  271. In the case of isomorphic graphs, the cost is zero, and the paths
  272. represent different isomorphic mappings (isomorphisms). That is, the
  273. edits involve renaming nodes and edges to match the structure of `G2`.
  274. See Also
  275. --------
  276. graph_edit_distance, optimize_edit_paths
  277. References
  278. ----------
  279. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  280. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  281. Pattern Recognition Problems. 4th International Conference on
  282. Pattern Recognition Applications and Methods 2015, Jan 2015,
  283. Lisbon, Portugal. 2015,
  284. <10.5220/0005209202710278>. <hal-01168816>
  285. https://hal.archives-ouvertes.fr/hal-01168816
  286. """
  287. paths = []
  288. bestcost = None
  289. for vertex_path, edge_path, cost in optimize_edit_paths(
  290. G1,
  291. G2,
  292. node_match,
  293. edge_match,
  294. node_subst_cost,
  295. node_del_cost,
  296. node_ins_cost,
  297. edge_subst_cost,
  298. edge_del_cost,
  299. edge_ins_cost,
  300. upper_bound,
  301. False,
  302. ):
  303. # assert bestcost is None or cost <= bestcost
  304. if bestcost is not None and cost < bestcost:
  305. paths = []
  306. paths.append((vertex_path, edge_path))
  307. bestcost = cost
  308. return paths, bestcost
  309. @nx._dispatchable(graphs={"G1": 0, "G2": 1})
  310. def optimize_graph_edit_distance(
  311. G1,
  312. G2,
  313. node_match=None,
  314. edge_match=None,
  315. node_subst_cost=None,
  316. node_del_cost=None,
  317. node_ins_cost=None,
  318. edge_subst_cost=None,
  319. edge_del_cost=None,
  320. edge_ins_cost=None,
  321. upper_bound=None,
  322. ):
  323. """Returns consecutive approximations of GED (graph edit distance)
  324. between graphs G1 and G2.
  325. Graph edit distance is a graph similarity measure analogous to
  326. Levenshtein distance for strings. It is defined as minimum cost
  327. of edit path (sequence of node and edge edit operations)
  328. transforming graph G1 to graph isomorphic to G2.
  329. Parameters
  330. ----------
  331. G1, G2: graphs
  332. The two graphs G1 and G2 must be of the same type.
  333. node_match : callable
  334. A function that returns True if node n1 in G1 and n2 in G2
  335. should be considered equal during matching.
  336. The function will be called like
  337. node_match(G1.nodes[n1], G2.nodes[n2]).
  338. That is, the function will receive the node attribute
  339. dictionaries for n1 and n2 as inputs.
  340. Ignored if node_subst_cost is specified. If neither
  341. node_match nor node_subst_cost are specified then node
  342. attributes are not considered.
  343. edge_match : callable
  344. A function that returns True if the edge attribute dictionaries
  345. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  346. be considered equal during matching.
  347. The function will be called like
  348. edge_match(G1[u1][v1], G2[u2][v2]).
  349. That is, the function will receive the edge attribute
  350. dictionaries of the edges under consideration.
  351. Ignored if edge_subst_cost is specified. If neither
  352. edge_match nor edge_subst_cost are specified then edge
  353. attributes are not considered.
  354. node_subst_cost, node_del_cost, node_ins_cost : callable
  355. Functions that return the costs of node substitution, node
  356. deletion, and node insertion, respectively.
  357. The functions will be called like
  358. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  359. node_del_cost(G1.nodes[n1]),
  360. node_ins_cost(G2.nodes[n2]).
  361. That is, the functions will receive the node attribute
  362. dictionaries as inputs. The functions are expected to return
  363. positive numeric values.
  364. Function node_subst_cost overrides node_match if specified.
  365. If neither node_match nor node_subst_cost are specified then
  366. default node substitution cost of 0 is used (node attributes
  367. are not considered during matching).
  368. If node_del_cost is not specified then default node deletion
  369. cost of 1 is used. If node_ins_cost is not specified then
  370. default node insertion cost of 1 is used.
  371. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  372. Functions that return the costs of edge substitution, edge
  373. deletion, and edge insertion, respectively.
  374. The functions will be called like
  375. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  376. edge_del_cost(G1[u1][v1]),
  377. edge_ins_cost(G2[u2][v2]).
  378. That is, the functions will receive the edge attribute
  379. dictionaries as inputs. The functions are expected to return
  380. positive numeric values.
  381. Function edge_subst_cost overrides edge_match if specified.
  382. If neither edge_match nor edge_subst_cost are specified then
  383. default edge substitution cost of 0 is used (edge attributes
  384. are not considered during matching).
  385. If edge_del_cost is not specified then default edge deletion
  386. cost of 1 is used. If edge_ins_cost is not specified then
  387. default edge insertion cost of 1 is used.
  388. upper_bound : numeric
  389. Maximum edit distance to consider.
  390. Returns
  391. -------
  392. Generator of consecutive approximations of graph edit distance.
  393. Examples
  394. --------
  395. >>> G1 = nx.cycle_graph(6)
  396. >>> G2 = nx.wheel_graph(7)
  397. >>> for v in nx.optimize_graph_edit_distance(G1, G2):
  398. ... minv = v
  399. >>> minv
  400. 7.0
  401. See Also
  402. --------
  403. graph_edit_distance, optimize_edit_paths
  404. References
  405. ----------
  406. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  407. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  408. Pattern Recognition Problems. 4th International Conference on
  409. Pattern Recognition Applications and Methods 2015, Jan 2015,
  410. Lisbon, Portugal. 2015,
  411. <10.5220/0005209202710278>. <hal-01168816>
  412. https://hal.archives-ouvertes.fr/hal-01168816
  413. """
  414. for _, _, cost in optimize_edit_paths(
  415. G1,
  416. G2,
  417. node_match,
  418. edge_match,
  419. node_subst_cost,
  420. node_del_cost,
  421. node_ins_cost,
  422. edge_subst_cost,
  423. edge_del_cost,
  424. edge_ins_cost,
  425. upper_bound,
  426. True,
  427. ):
  428. yield cost
  429. @nx._dispatchable(
  430. graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
  431. )
  432. def optimize_edit_paths(
  433. G1,
  434. G2,
  435. node_match=None,
  436. edge_match=None,
  437. node_subst_cost=None,
  438. node_del_cost=None,
  439. node_ins_cost=None,
  440. edge_subst_cost=None,
  441. edge_del_cost=None,
  442. edge_ins_cost=None,
  443. upper_bound=None,
  444. strictly_decreasing=True,
  445. roots=None,
  446. timeout=None,
  447. ):
  448. """GED (graph edit distance) calculation: advanced interface.
  449. Graph edit path is a sequence of node and edge edit operations
  450. transforming graph G1 to graph isomorphic to G2. Edit operations
  451. include substitutions, deletions, and insertions.
  452. Graph edit distance is defined as minimum cost of edit path.
  453. Parameters
  454. ----------
  455. G1, G2: graphs
  456. The two graphs G1 and G2 must be of the same type.
  457. node_match : callable
  458. A function that returns True if node n1 in G1 and n2 in G2
  459. should be considered equal during matching.
  460. The function will be called like
  461. node_match(G1.nodes[n1], G2.nodes[n2]).
  462. That is, the function will receive the node attribute
  463. dictionaries for n1 and n2 as inputs.
  464. Ignored if node_subst_cost is specified. If neither
  465. node_match nor node_subst_cost are specified then node
  466. attributes are not considered.
  467. edge_match : callable
  468. A function that returns True if the edge attribute dictionaries
  469. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  470. be considered equal during matching.
  471. The function will be called like
  472. edge_match(G1[u1][v1], G2[u2][v2]).
  473. That is, the function will receive the edge attribute
  474. dictionaries of the edges under consideration.
  475. Ignored if edge_subst_cost is specified. If neither
  476. edge_match nor edge_subst_cost are specified then edge
  477. attributes are not considered.
  478. node_subst_cost, node_del_cost, node_ins_cost : callable
  479. Functions that return the costs of node substitution, node
  480. deletion, and node insertion, respectively.
  481. The functions will be called like
  482. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  483. node_del_cost(G1.nodes[n1]),
  484. node_ins_cost(G2.nodes[n2]).
  485. That is, the functions will receive the node attribute
  486. dictionaries as inputs. The functions are expected to return
  487. positive numeric values.
  488. Function node_subst_cost overrides node_match if specified.
  489. If neither node_match nor node_subst_cost are specified then
  490. default node substitution cost of 0 is used (node attributes
  491. are not considered during matching).
  492. If node_del_cost is not specified then default node deletion
  493. cost of 1 is used. If node_ins_cost is not specified then
  494. default node insertion cost of 1 is used.
  495. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  496. Functions that return the costs of edge substitution, edge
  497. deletion, and edge insertion, respectively.
  498. The functions will be called like
  499. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  500. edge_del_cost(G1[u1][v1]),
  501. edge_ins_cost(G2[u2][v2]).
  502. That is, the functions will receive the edge attribute
  503. dictionaries as inputs. The functions are expected to return
  504. positive numeric values.
  505. Function edge_subst_cost overrides edge_match if specified.
  506. If neither edge_match nor edge_subst_cost are specified then
  507. default edge substitution cost of 0 is used (edge attributes
  508. are not considered during matching).
  509. If edge_del_cost is not specified then default edge deletion
  510. cost of 1 is used. If edge_ins_cost is not specified then
  511. default edge insertion cost of 1 is used.
  512. upper_bound : numeric
  513. Maximum edit distance to consider.
  514. strictly_decreasing : bool
  515. If True, return consecutive approximations of strictly
  516. decreasing cost. Otherwise, return all edit paths of cost
  517. less than or equal to the previous minimum cost.
  518. roots : 2-tuple
  519. Tuple where first element is a node in G1 and the second
  520. is a node in G2.
  521. These nodes are forced to be matched in the comparison to
  522. allow comparison between rooted graphs.
  523. timeout : numeric
  524. Maximum number of seconds to execute.
  525. After timeout is met, the current best GED is returned.
  526. Returns
  527. -------
  528. Generator of tuples (node_edit_path, edge_edit_path, cost)
  529. node_edit_path : list of tuples (u, v)
  530. edge_edit_path : list of tuples ((u1, v1), (u2, v2))
  531. cost : numeric
  532. See Also
  533. --------
  534. graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths
  535. References
  536. ----------
  537. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  538. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  539. Pattern Recognition Problems. 4th International Conference on
  540. Pattern Recognition Applications and Methods 2015, Jan 2015,
  541. Lisbon, Portugal. 2015,
  542. <10.5220/0005209202710278>. <hal-01168816>
  543. https://hal.archives-ouvertes.fr/hal-01168816
  544. """
  545. # TODO: support DiGraph
  546. import numpy as np
  547. import scipy as sp
  548. @dataclass
  549. class CostMatrix:
  550. C: ...
  551. lsa_row_ind: ...
  552. lsa_col_ind: ...
  553. ls: ...
  554. def make_CostMatrix(C, m, n):
  555. # assert(C.shape == (m + n, m + n))
  556. lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C)
  557. # Fixup dummy assignments:
  558. # each substitution i<->j should have dummy assignment m+j<->n+i
  559. # NOTE: fast reduce of Cv relies on it
  560. # Create masks for substitution and dummy indices
  561. is_subst = (lsa_row_ind < m) & (lsa_col_ind < n)
  562. is_dummy = (lsa_row_ind >= m) & (lsa_col_ind >= n)
  563. # Map dummy assignments to the correct indices
  564. lsa_row_ind[is_dummy] = lsa_col_ind[is_subst] + m
  565. lsa_col_ind[is_dummy] = lsa_row_ind[is_subst] + n
  566. return CostMatrix(
  567. C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum()
  568. )
  569. def extract_C(C, i, j, m, n):
  570. # assert(C.shape == (m + n, m + n))
  571. row_ind = [k in i or k - m in j for k in range(m + n)]
  572. col_ind = [k in j or k - n in i for k in range(m + n)]
  573. return C[row_ind, :][:, col_ind]
  574. def reduce_C(C, i, j, m, n):
  575. # assert(C.shape == (m + n, m + n))
  576. row_ind = [k not in i and k - m not in j for k in range(m + n)]
  577. col_ind = [k not in j and k - n not in i for k in range(m + n)]
  578. return C[row_ind, :][:, col_ind]
  579. def reduce_ind(ind, i):
  580. # assert set(ind) == set(range(len(ind)))
  581. rind = ind[[k not in i for k in ind]]
  582. for k in set(i):
  583. rind[rind >= k] -= 1
  584. return rind
  585. def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None):
  586. """
  587. Parameters:
  588. u, v: matched vertices, u=None or v=None for
  589. deletion/insertion
  590. pending_g, pending_h: lists of edges not yet mapped
  591. Ce: CostMatrix of pending edge mappings
  592. matched_uv: partial vertex edit path
  593. list of tuples (u, v) of previously matched vertex
  594. mappings u<->v, u=None or v=None for
  595. deletion/insertion
  596. Returns:
  597. list of (i, j): indices of edge mappings g<->h
  598. localCe: local CostMatrix of edge mappings
  599. (basically submatrix of Ce at cross of rows i, cols j)
  600. """
  601. M = len(pending_g)
  602. N = len(pending_h)
  603. # assert Ce.C.shape == (M + N, M + N)
  604. # only attempt to match edges after one node match has been made
  605. # this will stop self-edges on the first node being automatically deleted
  606. # even when a substitution is the better option
  607. substitution_possible = M and N
  608. at_least_one_node_match = matched_uv is None or len(matched_uv) == 0
  609. if at_least_one_node_match and substitution_possible:
  610. g_ind = []
  611. h_ind = []
  612. else:
  613. g_ind = [
  614. i
  615. for i in range(M)
  616. if pending_g[i][:2] == (u, u)
  617. or any(
  618. pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv
  619. )
  620. ]
  621. h_ind = [
  622. j
  623. for j in range(N)
  624. if pending_h[j][:2] == (v, v)
  625. or any(
  626. pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv
  627. )
  628. ]
  629. m = len(g_ind)
  630. n = len(h_ind)
  631. if m or n:
  632. C = extract_C(Ce.C, g_ind, h_ind, M, N)
  633. # assert C.shape == (m + n, m + n)
  634. # Forbid structurally invalid matches
  635. # NOTE: inf remembered from Ce construction
  636. for k, i in enumerate(g_ind):
  637. g = pending_g[i][:2]
  638. for l, j in enumerate(h_ind):
  639. h = pending_h[j][:2]
  640. if nx.is_directed(G1) or nx.is_directed(G2):
  641. if any(
  642. g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q)
  643. for p, q in matched_uv
  644. ):
  645. continue
  646. else:
  647. if any(
  648. g in ((p, u), (u, p)) and h in ((q, v), (v, q))
  649. for p, q in matched_uv
  650. ):
  651. continue
  652. if g == (u, u) or any(g == (p, p) for p, q in matched_uv):
  653. continue
  654. if h == (v, v) or any(h == (q, q) for p, q in matched_uv):
  655. continue
  656. C[k, l] = inf
  657. localCe = make_CostMatrix(C, m, n)
  658. ij = [
  659. (
  660. g_ind[k] if k < m else M + h_ind[l],
  661. h_ind[l] if l < n else N + g_ind[k],
  662. )
  663. for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
  664. if k < m or l < n
  665. ]
  666. else:
  667. ij = []
  668. localCe = CostMatrix(np.empty((0, 0)), [], [], 0)
  669. return ij, localCe
  670. def reduce_Ce(Ce, ij, m, n):
  671. if len(ij):
  672. i, j = zip(*ij)
  673. m_i = m - sum(1 for t in i if t < m)
  674. n_j = n - sum(1 for t in j if t < n)
  675. return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
  676. return Ce
  677. def get_edit_ops(
  678. matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost
  679. ):
  680. """
  681. Parameters:
  682. matched_uv: partial vertex edit path
  683. list of tuples (u, v) of vertex mappings u<->v,
  684. u=None or v=None for deletion/insertion
  685. pending_u, pending_v: lists of vertices not yet mapped
  686. Cv: CostMatrix of pending vertex mappings
  687. pending_g, pending_h: lists of edges not yet mapped
  688. Ce: CostMatrix of pending edge mappings
  689. matched_cost: cost of partial edit path
  690. Returns:
  691. sequence of
  692. (i, j): indices of vertex mapping u<->v
  693. Cv_ij: reduced CostMatrix of pending vertex mappings
  694. (basically Cv with row i, col j removed)
  695. list of (x, y): indices of edge mappings g<->h
  696. Ce_xy: reduced CostMatrix of pending edge mappings
  697. (basically Ce with rows x, cols y removed)
  698. cost: total cost of edit operation
  699. NOTE: most promising ops first
  700. """
  701. m = len(pending_u)
  702. n = len(pending_v)
  703. # assert Cv.C.shape == (m + n, m + n)
  704. # 1) a vertex mapping from optimal linear sum assignment
  705. i, j = min(
  706. (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n
  707. )
  708. xy, localCe = match_edges(
  709. pending_u[i] if i < m else None,
  710. pending_v[j] if j < n else None,
  711. pending_g,
  712. pending_h,
  713. Ce,
  714. matched_uv,
  715. )
  716. Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
  717. # assert Ce.ls <= localCe.ls + Ce_xy.ls
  718. if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
  719. pass
  720. else:
  721. # get reduced Cv efficiently
  722. Cv_ij = CostMatrix(
  723. reduce_C(Cv.C, (i,), (j,), m, n),
  724. reduce_ind(Cv.lsa_row_ind, (i, m + j)),
  725. reduce_ind(Cv.lsa_col_ind, (j, n + i)),
  726. Cv.ls - Cv.C[i, j],
  727. )
  728. yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls
  729. # 2) other candidates, sorted by lower-bound cost estimate
  730. other = []
  731. fixed_i, fixed_j = i, j
  732. if m <= n:
  733. candidates = (
  734. (t, fixed_j)
  735. for t in range(m + n)
  736. if t != fixed_i and (t < m or t == m + fixed_j)
  737. )
  738. else:
  739. candidates = (
  740. (fixed_i, t)
  741. for t in range(m + n)
  742. if t != fixed_j and (t < n or t == n + fixed_i)
  743. )
  744. for i, j in candidates:
  745. if prune(matched_cost + Cv.C[i, j] + Ce.ls):
  746. continue
  747. Cv_ij = make_CostMatrix(
  748. reduce_C(Cv.C, (i,), (j,), m, n),
  749. m - 1 if i < m else m,
  750. n - 1 if j < n else n,
  751. )
  752. # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
  753. if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
  754. continue
  755. xy, localCe = match_edges(
  756. pending_u[i] if i < m else None,
  757. pending_v[j] if j < n else None,
  758. pending_g,
  759. pending_h,
  760. Ce,
  761. matched_uv,
  762. )
  763. if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
  764. continue
  765. Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
  766. # assert Ce.ls <= localCe.ls + Ce_xy.ls
  767. if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls):
  768. continue
  769. other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))
  770. yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls)
  771. def get_edit_paths(
  772. matched_uv,
  773. pending_u,
  774. pending_v,
  775. Cv,
  776. matched_gh,
  777. pending_g,
  778. pending_h,
  779. Ce,
  780. matched_cost,
  781. ):
  782. """
  783. Parameters:
  784. matched_uv: partial vertex edit path
  785. list of tuples (u, v) of vertex mappings u<->v,
  786. u=None or v=None for deletion/insertion
  787. pending_u, pending_v: lists of vertices not yet mapped
  788. Cv: CostMatrix of pending vertex mappings
  789. matched_gh: partial edge edit path
  790. list of tuples (g, h) of edge mappings g<->h,
  791. g=None or h=None for deletion/insertion
  792. pending_g, pending_h: lists of edges not yet mapped
  793. Ce: CostMatrix of pending edge mappings
  794. matched_cost: cost of partial edit path
  795. Returns:
  796. sequence of (vertex_path, edge_path, cost)
  797. vertex_path: complete vertex edit path
  798. list of tuples (u, v) of vertex mappings u<->v,
  799. u=None or v=None for deletion/insertion
  800. edge_path: complete edge edit path
  801. list of tuples (g, h) of edge mappings g<->h,
  802. g=None or h=None for deletion/insertion
  803. cost: total cost of edit path
  804. NOTE: path costs are non-increasing
  805. """
  806. if prune(matched_cost + Cv.ls + Ce.ls):
  807. return
  808. if not max(len(pending_u), len(pending_v)):
  809. # assert not len(pending_g)
  810. # assert not len(pending_h)
  811. # path completed!
  812. # assert matched_cost <= maxcost_value
  813. nonlocal maxcost_value
  814. maxcost_value = min(maxcost_value, matched_cost)
  815. yield matched_uv, matched_gh, matched_cost
  816. else:
  817. edit_ops = get_edit_ops(
  818. matched_uv,
  819. pending_u,
  820. pending_v,
  821. Cv,
  822. pending_g,
  823. pending_h,
  824. Ce,
  825. matched_cost,
  826. )
  827. for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
  828. i, j = ij
  829. # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
  830. if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
  831. continue
  832. # dive deeper
  833. u = pending_u.pop(i) if i < len(pending_u) else None
  834. v = pending_v.pop(j) if j < len(pending_v) else None
  835. matched_uv.append((u, v))
  836. for x, y in xy:
  837. len_g = len(pending_g)
  838. len_h = len(pending_h)
  839. matched_gh.append(
  840. (
  841. pending_g[x] if x < len_g else None,
  842. pending_h[y] if y < len_h else None,
  843. )
  844. )
  845. sortedx = sorted(x for x, y in xy)
  846. sortedy = sorted(y for x, y in xy)
  847. G = [
  848. (pending_g.pop(x) if x < len(pending_g) else None)
  849. for x in reversed(sortedx)
  850. ]
  851. H = [
  852. (pending_h.pop(y) if y < len(pending_h) else None)
  853. for y in reversed(sortedy)
  854. ]
  855. yield from get_edit_paths(
  856. matched_uv,
  857. pending_u,
  858. pending_v,
  859. Cv_ij,
  860. matched_gh,
  861. pending_g,
  862. pending_h,
  863. Ce_xy,
  864. matched_cost + edit_cost,
  865. )
  866. # backtrack
  867. if u is not None:
  868. pending_u.insert(i, u)
  869. if v is not None:
  870. pending_v.insert(j, v)
  871. matched_uv.pop()
  872. for x, g in zip(sortedx, reversed(G)):
  873. if g is not None:
  874. pending_g.insert(x, g)
  875. for y, h in zip(sortedy, reversed(H)):
  876. if h is not None:
  877. pending_h.insert(y, h)
  878. for _ in xy:
  879. matched_gh.pop()
  880. # Initialization
  881. pending_u = list(G1.nodes)
  882. pending_v = list(G2.nodes)
  883. initial_cost = 0
  884. if roots:
  885. root_u, root_v = roots
  886. if root_u not in pending_u or root_v not in pending_v:
  887. raise nx.NodeNotFound("Root node not in graph.")
  888. # remove roots from pending
  889. pending_u.remove(root_u)
  890. pending_v.remove(root_v)
  891. # cost matrix of vertex mappings
  892. m = len(pending_u)
  893. n = len(pending_v)
  894. C = np.zeros((m + n, m + n))
  895. if node_subst_cost:
  896. C[0:m, 0:n] = np.array(
  897. [
  898. node_subst_cost(G1.nodes[u], G2.nodes[v])
  899. for u in pending_u
  900. for v in pending_v
  901. ]
  902. ).reshape(m, n)
  903. if roots:
  904. initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v])
  905. elif node_match:
  906. C[0:m, 0:n] = np.array(
  907. [
  908. 1 - int(node_match(G1.nodes[u], G2.nodes[v]))
  909. for u in pending_u
  910. for v in pending_v
  911. ]
  912. ).reshape(m, n)
  913. if roots:
  914. initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v])
  915. else:
  916. # all zeroes
  917. pass
  918. # assert not min(m, n) or C[0:m, 0:n].min() >= 0
  919. if node_del_cost:
  920. del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
  921. else:
  922. del_costs = [1] * len(pending_u)
  923. # assert not m or min(del_costs) >= 0
  924. if node_ins_cost:
  925. ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
  926. else:
  927. ins_costs = [1] * len(pending_v)
  928. # assert not n or min(ins_costs) >= 0
  929. inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
  930. C[0:m, n : n + m] = np.array(
  931. [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
  932. ).reshape(m, m)
  933. C[m : m + n, 0:n] = np.array(
  934. [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
  935. ).reshape(n, n)
  936. Cv = make_CostMatrix(C, m, n)
  937. pending_g = list(G1.edges)
  938. pending_h = list(G2.edges)
  939. # cost matrix of edge mappings
  940. m = len(pending_g)
  941. n = len(pending_h)
  942. C = np.zeros((m + n, m + n))
  943. if edge_subst_cost:
  944. C[0:m, 0:n] = np.array(
  945. [
  946. edge_subst_cost(G1.edges[g], G2.edges[h])
  947. for g in pending_g
  948. for h in pending_h
  949. ]
  950. ).reshape(m, n)
  951. elif edge_match:
  952. C[0:m, 0:n] = np.array(
  953. [
  954. 1 - int(edge_match(G1.edges[g], G2.edges[h]))
  955. for g in pending_g
  956. for h in pending_h
  957. ]
  958. ).reshape(m, n)
  959. else:
  960. # all zeroes
  961. pass
  962. # assert not min(m, n) or C[0:m, 0:n].min() >= 0
  963. if edge_del_cost:
  964. del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
  965. else:
  966. del_costs = [1] * len(pending_g)
  967. # assert not m or min(del_costs) >= 0
  968. if edge_ins_cost:
  969. ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
  970. else:
  971. ins_costs = [1] * len(pending_h)
  972. # assert not n or min(ins_costs) >= 0
  973. inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
  974. C[0:m, n : n + m] = np.array(
  975. [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
  976. ).reshape(m, m)
  977. C[m : m + n, 0:n] = np.array(
  978. [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
  979. ).reshape(n, n)
  980. Ce = make_CostMatrix(C, m, n)
  981. maxcost_value = Cv.C.sum() + Ce.C.sum() + 1
  982. if timeout is not None:
  983. if timeout <= 0:
  984. raise nx.NetworkXError("Timeout value must be greater than 0")
  985. start = time.perf_counter()
  986. def prune(cost):
  987. if timeout is not None:
  988. if time.perf_counter() - start > timeout:
  989. return True
  990. if upper_bound is not None:
  991. if cost > upper_bound:
  992. return True
  993. if cost > maxcost_value:
  994. return True
  995. if strictly_decreasing and cost >= maxcost_value:
  996. return True
  997. return False
  998. # Now go!
  999. done_uv = [] if roots is None else [roots]
  1000. for vertex_path, edge_path, cost in get_edit_paths(
  1001. done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
  1002. ):
  1003. # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
  1004. # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
  1005. # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
  1006. # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
  1007. # print(vertex_path, edge_path, cost, file = sys.stderr)
  1008. # assert cost == maxcost_value
  1009. yield list(vertex_path), list(edge_path), float(cost)
  1010. @nx._dispatchable
  1011. def simrank_similarity(
  1012. G,
  1013. source=None,
  1014. target=None,
  1015. importance_factor=0.9,
  1016. max_iterations=1000,
  1017. tolerance=1e-4,
  1018. ):
  1019. """Returns the SimRank similarity of nodes in the graph ``G``.
  1020. SimRank is a similarity metric that says "two objects are considered
  1021. to be similar if they are referenced by similar objects." [1]_.
  1022. The pseudo-code definition from the paper is::
  1023. def simrank(G, u, v):
  1024. in_neighbors_u = G.predecessors(u)
  1025. in_neighbors_v = G.predecessors(v)
  1026. scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
  1027. return scale * sum(
  1028. simrank(G, w, x) for w, x in product(in_neighbors_u, in_neighbors_v)
  1029. )
  1030. where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
  1031. and ``C`` is a float decay or importance factor between 0 and 1.
  1032. The SimRank algorithm for determining node similarity is defined in
  1033. [2]_.
  1034. Parameters
  1035. ----------
  1036. G : NetworkX graph
  1037. A NetworkX graph
  1038. source : node
  1039. If this is specified, the returned dictionary maps each node
  1040. ``v`` in the graph to the similarity between ``source`` and
  1041. ``v``.
  1042. target : node
  1043. If both ``source`` and ``target`` are specified, the similarity
  1044. value between ``source`` and ``target`` is returned. If
  1045. ``target`` is specified but ``source`` is not, this argument is
  1046. ignored.
  1047. importance_factor : float
  1048. The relative importance of indirect neighbors with respect to
  1049. direct neighbors.
  1050. max_iterations : integer
  1051. Maximum number of iterations.
  1052. tolerance : float
  1053. Error tolerance used to check convergence. When an iteration of
  1054. the algorithm finds that no similarity value changes more than
  1055. this amount, the algorithm halts.
  1056. Returns
  1057. -------
  1058. similarity : dictionary or float
  1059. If ``source`` and ``target`` are both ``None``, this returns a
  1060. dictionary of dictionaries, where keys are node pairs and value
  1061. are similarity of the pair of nodes.
  1062. If ``source`` is not ``None`` but ``target`` is, this returns a
  1063. dictionary mapping node to the similarity of ``source`` and that
  1064. node.
  1065. If neither ``source`` nor ``target`` is ``None``, this returns
  1066. the similarity value for the given pair of nodes.
  1067. Raises
  1068. ------
  1069. ExceededMaxIterations
  1070. If the algorithm does not converge within ``max_iterations``.
  1071. NodeNotFound
  1072. If either ``source`` or ``target`` is not in `G`.
  1073. Examples
  1074. --------
  1075. >>> G = nx.cycle_graph(2)
  1076. >>> nx.simrank_similarity(G)
  1077. {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
  1078. >>> nx.simrank_similarity(G, source=0)
  1079. {0: 1.0, 1: 0.0}
  1080. >>> nx.simrank_similarity(G, source=0, target=0)
  1081. 1.0
  1082. The result of this function can be converted to a numpy array
  1083. representing the SimRank matrix by using the node order of the
  1084. graph to determine which row and column represent each node.
  1085. Other ordering of nodes is also possible.
  1086. >>> import numpy as np
  1087. >>> sim = nx.simrank_similarity(G)
  1088. >>> np.array([[sim[u][v] for v in G] for u in G])
  1089. array([[1., 0.],
  1090. [0., 1.]])
  1091. >>> sim_1d = nx.simrank_similarity(G, source=0)
  1092. >>> np.array([sim[0][v] for v in G])
  1093. array([1., 0.])
  1094. References
  1095. ----------
  1096. .. [1] https://en.wikipedia.org/wiki/SimRank
  1097. .. [2] G. Jeh and J. Widom.
  1098. "SimRank: a measure of structural-context similarity",
  1099. In KDD'02: Proceedings of the Eighth ACM SIGKDD
  1100. International Conference on Knowledge Discovery and Data Mining,
  1101. pp. 538--543. ACM Press, 2002.
  1102. """
  1103. import numpy as np
  1104. nodelist = list(G)
  1105. if source is not None:
  1106. if source not in nodelist:
  1107. raise nx.NodeNotFound(f"Source node {source} not in G")
  1108. else:
  1109. s_indx = nodelist.index(source)
  1110. else:
  1111. s_indx = None
  1112. if target is not None:
  1113. if target not in nodelist:
  1114. raise nx.NodeNotFound(f"Target node {target} not in G")
  1115. else:
  1116. t_indx = nodelist.index(target)
  1117. else:
  1118. t_indx = None
  1119. x = _simrank_similarity_numpy(
  1120. G, s_indx, t_indx, importance_factor, max_iterations, tolerance
  1121. )
  1122. if isinstance(x, np.ndarray):
  1123. if x.ndim == 1:
  1124. return dict(zip(G, x.tolist()))
  1125. # else x.ndim == 2
  1126. return {u: dict(zip(G, row)) for u, row in zip(G, x.tolist())}
  1127. return float(x)
  1128. def _simrank_similarity_python(
  1129. G,
  1130. source=None,
  1131. target=None,
  1132. importance_factor=0.9,
  1133. max_iterations=1000,
  1134. tolerance=1e-4,
  1135. ):
  1136. """Returns the SimRank similarity of nodes in the graph ``G``.
  1137. This pure Python version is provided for pedagogical purposes.
  1138. Examples
  1139. --------
  1140. >>> G = nx.cycle_graph(2)
  1141. >>> nx.similarity._simrank_similarity_python(G)
  1142. {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
  1143. >>> nx.similarity._simrank_similarity_python(G, source=0)
  1144. {0: 1, 1: 0.0}
  1145. >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
  1146. 1
  1147. """
  1148. # build up our similarity adjacency dictionary output
  1149. newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}
  1150. # These functions compute the update to the similarity value of the nodes
  1151. # `u` and `v` with respect to the previous similarity values.
  1152. def avg_sim(s):
  1153. return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0
  1154. Gadj = G.pred if G.is_directed() else G.adj
  1155. def sim(u, v):
  1156. return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))
  1157. for its in range(max_iterations):
  1158. oldsim = newsim
  1159. newsim = {u: {v: sim(u, v) if u != v else 1 for v in G} for u in G}
  1160. is_close = all(
  1161. all(
  1162. abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
  1163. for v, old in nbrs.items()
  1164. )
  1165. for u, nbrs in oldsim.items()
  1166. )
  1167. if is_close:
  1168. break
  1169. if its + 1 == max_iterations:
  1170. raise nx.ExceededMaxIterations(
  1171. f"simrank did not converge after {max_iterations} iterations."
  1172. )
  1173. if source is not None and target is not None:
  1174. return newsim[source][target]
  1175. if source is not None:
  1176. return newsim[source]
  1177. return newsim
  1178. def _simrank_similarity_numpy(
  1179. G,
  1180. source=None,
  1181. target=None,
  1182. importance_factor=0.9,
  1183. max_iterations=1000,
  1184. tolerance=1e-4,
  1185. ):
  1186. """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
  1187. The SimRank algorithm for determining node similarity is defined in
  1188. [1]_.
  1189. Parameters
  1190. ----------
  1191. G : NetworkX graph
  1192. A NetworkX graph
  1193. source : node
  1194. If this is specified, the returned dictionary maps each node
  1195. ``v`` in the graph to the similarity between ``source`` and
  1196. ``v``.
  1197. target : node
  1198. If both ``source`` and ``target`` are specified, the similarity
  1199. value between ``source`` and ``target`` is returned. If
  1200. ``target`` is specified but ``source`` is not, this argument is
  1201. ignored.
  1202. importance_factor : float
  1203. The relative importance of indirect neighbors with respect to
  1204. direct neighbors.
  1205. max_iterations : integer
  1206. Maximum number of iterations.
  1207. tolerance : float
  1208. Error tolerance used to check convergence. When an iteration of
  1209. the algorithm finds that no similarity value changes more than
  1210. this amount, the algorithm halts.
  1211. Returns
  1212. -------
  1213. similarity : numpy array or float
  1214. If ``source`` and ``target`` are both ``None``, this returns a
  1215. 2D array containing SimRank scores of the nodes.
  1216. If ``source`` is not ``None`` but ``target`` is, this returns an
  1217. 1D array containing SimRank scores of ``source`` and that
  1218. node.
  1219. If neither ``source`` nor ``target`` is ``None``, this returns
  1220. the similarity value for the given pair of nodes.
  1221. Examples
  1222. --------
  1223. >>> G = nx.cycle_graph(2)
  1224. >>> nx.similarity._simrank_similarity_numpy(G)
  1225. array([[1., 0.],
  1226. [0., 1.]])
  1227. >>> nx.similarity._simrank_similarity_numpy(G, source=0)
  1228. array([1., 0.])
  1229. >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
  1230. 1.0
  1231. References
  1232. ----------
  1233. .. [1] G. Jeh and J. Widom.
  1234. "SimRank: a measure of structural-context similarity",
  1235. In KDD'02: Proceedings of the Eighth ACM SIGKDD
  1236. International Conference on Knowledge Discovery and Data Mining,
  1237. pp. 538--543. ACM Press, 2002.
  1238. """
  1239. # This algorithm follows roughly
  1240. #
  1241. # S = max{C * (A.T * S * A), I}
  1242. #
  1243. # where C is the importance factor, A is the column normalized
  1244. # adjacency matrix, and I is the identity matrix.
  1245. import numpy as np
  1246. adjacency_matrix = nx.to_numpy_array(G)
  1247. # column-normalize the ``adjacency_matrix``
  1248. s = np.array(adjacency_matrix.sum(axis=0))
  1249. s[s == 0] = 1
  1250. adjacency_matrix /= s # adjacency_matrix.sum(axis=0)
  1251. newsim = np.eye(len(G), dtype=np.float64)
  1252. for its in range(max_iterations):
  1253. prevsim = newsim.copy()
  1254. newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
  1255. np.fill_diagonal(newsim, 1.0)
  1256. if np.allclose(prevsim, newsim, atol=tolerance):
  1257. break
  1258. if its + 1 == max_iterations:
  1259. raise nx.ExceededMaxIterations(
  1260. f"simrank did not converge after {max_iterations} iterations."
  1261. )
  1262. if source is not None and target is not None:
  1263. return float(newsim[source, target])
  1264. if source is not None:
  1265. return newsim[source]
  1266. return newsim
  1267. @np_random_state("seed")
  1268. def _prepare_panther_paths(
  1269. G,
  1270. source,
  1271. path_length=5,
  1272. c=0.5,
  1273. delta=0.1,
  1274. eps=None,
  1275. weight="weight",
  1276. remove_isolates=True,
  1277. k=None,
  1278. seed=None,
  1279. ):
  1280. """Common preparation code for Panther similarity algorithms.
  1281. Parameters
  1282. ----------
  1283. G : NetworkX graph
  1284. A NetworkX graph
  1285. source : node
  1286. Source node for similarity calculation
  1287. path_length : int
  1288. How long the randomly generated paths should be
  1289. c : float
  1290. A universal constant that controls the number of random paths to generate
  1291. delta : float
  1292. The probability parameter for similarity approximation
  1293. eps : float or None
  1294. The error bound for similarity approximation
  1295. weight : string or None
  1296. The name of an edge attribute that holds the numerical value used as a weight
  1297. remove_isolates : bool
  1298. Whether to remove isolated nodes from graph processing
  1299. k : int or None
  1300. The number of most similar nodes to return. If provided, validates that
  1301. ``k`` is not greater than the number of nodes in the graph.
  1302. seed : integer, random_state, or None (default)
  1303. Indicator of random number generation state.
  1304. See :ref:`Randomness<randomness>`.
  1305. Returns
  1306. -------
  1307. PantherPaths
  1308. A tuple containing the prepared data:
  1309. - G: The graph (possibly with isolates removed)
  1310. - inv_node_map: Dictionary mapping node names to indices
  1311. - index_map: Populated index map of paths
  1312. - inv_sample_size: Inverse of sample size (for fast calculation)
  1313. - eps: Error bound for similarity approximation
  1314. """
  1315. import numpy as np
  1316. if source not in G:
  1317. raise nx.NodeNotFound(f"Source node {source} not in G")
  1318. isolates = set(nx.isolates(G))
  1319. if source in isolates:
  1320. raise nx.NetworkXUnfeasible(
  1321. f"Panther similarity is not defined for the isolated source node {source}."
  1322. )
  1323. if remove_isolates:
  1324. G = G.subgraph(node for node in G if node not in isolates).copy()
  1325. # According to [1], they empirically determined
  1326. # a good value for ``eps`` to be sqrt( 1 / |E| )
  1327. if eps is None:
  1328. eps = np.sqrt(1.0 / G.number_of_edges())
  1329. num_nodes = G.number_of_nodes()
  1330. # Check if k is provided and validate it against the number of nodes
  1331. if k is not None and not remove_isolates: # For panther_vector_similarity
  1332. if num_nodes < k:
  1333. raise nx.NetworkXUnfeasible(
  1334. f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
  1335. )
  1336. inv_node_map = {name: index for index, name in enumerate(G)}
  1337. # Calculate the sample size ``R`` for how many paths
  1338. # to randomly generate
  1339. t_choose_2 = math.comb(path_length, 2)
  1340. sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
  1341. index_map = {}
  1342. # Check for isolated nodes before generating random paths
  1343. # If there are still isolated nodes in the graph after filtering,
  1344. # they will cause issues with path generation
  1345. remaining_isolates = set(nx.isolates(G))
  1346. if remaining_isolates:
  1347. raise nx.NetworkXUnfeasible(
  1348. f"Cannot generate random paths with isolated nodes present: {remaining_isolates}"
  1349. )
  1350. # Generate the random paths and populate the index_map
  1351. for _ in generate_random_paths(
  1352. G,
  1353. sample_size,
  1354. path_length=path_length,
  1355. index_map=index_map,
  1356. weight=weight,
  1357. seed=seed,
  1358. ):
  1359. # NOTE: index_map is modified in-place by `generate_random_paths`
  1360. pass
  1361. return (
  1362. G, # The graph with isolated nodes removed
  1363. inv_node_map,
  1364. index_map,
  1365. 1 / sample_size,
  1366. eps,
  1367. )
  1368. @np_random_state("seed")
  1369. @nx._dispatchable(edge_attrs="weight")
  1370. def panther_similarity(
  1371. G,
  1372. source,
  1373. k=5,
  1374. path_length=5,
  1375. c=0.5,
  1376. delta=0.1,
  1377. eps=None,
  1378. weight="weight",
  1379. seed=None,
  1380. ):
  1381. r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``.
  1382. Panther is a similarity metric that says "two objects are considered
  1383. to be similar if they frequently appear on the same paths." [1]_.
  1384. Parameters
  1385. ----------
  1386. G : NetworkX graph
  1387. A NetworkX graph
  1388. source : node
  1389. Source node for which to find the top `k` similar other nodes
  1390. k : int (default = 5)
  1391. The number of most similar nodes to return.
  1392. path_length : int (default = 5)
  1393. How long the randomly generated paths should be (``T`` in [1]_)
  1394. c : float (default = 0.5)
  1395. A universal constant that controls the number of random paths to generate.
  1396. Higher values increase the number of sample paths and potentially improve
  1397. accuracy at the cost of more computation. Defaults to 0.5 as recommended
  1398. in [1]_.
  1399. delta : float (default = 0.1)
  1400. The probability that the similarity $S$ is not an epsilon-approximation to (R, phi),
  1401. where $R$ is the number of random paths and $\phi$ is the probability
  1402. that an element sampled from a set $A \subseteq D$, where $D$ is the domain.
  1403. eps : float or None (default = None)
  1404. The error bound for similarity approximation. This controls the accuracy
  1405. of the sampled paths in representing the true similarity. Smaller values
  1406. yield more accurate results but require more sample paths. If `None`, a
  1407. value of ``sqrt(1/|E|)`` is used, which the authors found empirically
  1408. effective.
  1409. weight : string or None, optional (default="weight")
  1410. The name of an edge attribute that holds the numerical value
  1411. used as a weight. If None then each edge has weight 1.
  1412. seed : integer, random_state, or None (default)
  1413. Indicator of random number generation state.
  1414. See :ref:`Randomness<randomness>`.
  1415. Returns
  1416. -------
  1417. similarity : dictionary
  1418. Dictionary of nodes to similarity scores (as floats). Note:
  1419. the self-similarity (i.e., ``v``) will not be included in
  1420. the returned dictionary. So, for ``k = 5``, a dictionary of
  1421. top 4 nodes and their similarity scores will be returned.
  1422. Raises
  1423. ------
  1424. NetworkXUnfeasible
  1425. If `source` is an isolated node.
  1426. NodeNotFound
  1427. If `source` is not in `G`.
  1428. Notes
  1429. -----
  1430. The isolated nodes in `G` are ignored.
  1431. Examples
  1432. --------
  1433. >>> G = nx.star_graph(10)
  1434. >>> sim = nx.panther_similarity(G, 0)
  1435. References
  1436. ----------
  1437. .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
  1438. Panther: Fast top-k similarity search on large networks.
  1439. In Proceedings of the ACM SIGKDD International Conference
  1440. on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
  1441. Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
  1442. """
  1443. import numpy as np
  1444. # Use helper method to prepare common data structures
  1445. G, inv_node_map, index_map, inv_sample_size, eps = _prepare_panther_paths(
  1446. G,
  1447. source,
  1448. path_length=path_length,
  1449. c=c,
  1450. delta=delta,
  1451. eps=eps,
  1452. weight=weight,
  1453. k=k,
  1454. seed=seed,
  1455. )
  1456. num_nodes = G.number_of_nodes()
  1457. node_list = list(G.nodes)
  1458. # Check number of nodes after any modifications by _prepare_panther_paths
  1459. if num_nodes < k:
  1460. raise nx.NetworkXUnfeasible(
  1461. f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
  1462. )
  1463. S = np.zeros(num_nodes)
  1464. source_paths = set(index_map[source])
  1465. # Calculate the path similarities
  1466. # between ``source`` (v) and ``node`` (v_j)
  1467. # using our inverted index mapping of
  1468. # vertices to paths
  1469. for node, paths in index_map.items():
  1470. # Only consider paths where both
  1471. # ``node`` and ``source`` are present
  1472. common_paths = source_paths.intersection(paths)
  1473. S[inv_node_map[node]] = len(common_paths) * inv_sample_size
  1474. # Retrieve top ``k+1`` similar to account for removing self-similarity
  1475. # Note: the below performed anywhere from 4-10x faster
  1476. # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
  1477. partition_k = min(k + 1, num_nodes)
  1478. top_k_unsorted = np.argpartition(S, -partition_k)[-partition_k:]
  1479. top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]
  1480. # Add back the similarity scores
  1481. # Convert numpy scalars to native Python types for dispatch compatibility
  1482. top_k_with_val = dict(
  1483. zip((node_list[i] for i in top_k_sorted), S[top_k_sorted].tolist())
  1484. )
  1485. # Remove the self-similarity
  1486. top_k_with_val.pop(source, None)
  1487. return top_k_with_val
  1488. @np_random_state("seed")
  1489. @nx._dispatchable(edge_attrs="weight")
  1490. def panther_vector_similarity(
  1491. G,
  1492. source,
  1493. *,
  1494. D=10,
  1495. k=5,
  1496. path_length=5,
  1497. c=0.5,
  1498. delta=0.1,
  1499. eps=None,
  1500. weight="weight",
  1501. seed=None,
  1502. ):
  1503. r"""Returns the Panther vector similarity (Panther++) of nodes in `G`.
  1504. Computes similarity between nodes based on the "Panther++" algorithm [1]_, which extends
  1505. the basic Panther algorithm by using feature vectors to better capture structural
  1506. similarity.
  1507. While basic Panther similarity measures how often two nodes appear on the same paths,
  1508. Panther vector similarity (Panther++) creates a ``D``-dimensional feature vector for each
  1509. node using its top similarity scores with other nodes, then computes similarity based
  1510. on the Euclidean distance between these feature vectors. This approach better captures
  1511. structural similarity and addresses the bias towards close neighbors present in
  1512. the original Panther algorithm.
  1513. This approach is preferred when:
  1514. 1. You need better structural similarity than basic path co-occurrence
  1515. 2. You want to overcome the close-neighbor bias of standard Panther
  1516. 3. You're working with large graphs where k-d tree indexing would be beneficial
  1517. 4. Graph edit distance-like similarity is more appropriate than path co-occurrence
  1518. Parameters
  1519. ----------
  1520. G : NetworkX graph
  1521. A NetworkX graph
  1522. source : node
  1523. Source node for which to find the top ``k`` similar other nodes
  1524. D : int
  1525. The number of similarity scores to use (in descending order)
  1526. for each feature vector. Defaults to 10. Note that the original paper
  1527. used D=50 [1]_, but KDTree is optimized for lower dimensions.
  1528. k : int
  1529. The number of most similar nodes to return
  1530. path_length : int
  1531. How long the randomly generated paths should be (``T`` in [1]_)
  1532. c : float
  1533. A universal constant that controls the number of random paths to generate.
  1534. Higher values increase the number of sample paths and potentially improve
  1535. accuracy at the cost of more computation. Defaults to 0.5 as recommended
  1536. in [1]_.
  1537. delta : float
  1538. The probability that ``S`` is not an epsilon-approximation to (R, phi)
  1539. eps : float
  1540. The error bound for similarity approximation. This controls the accuracy
  1541. of the sampled paths in representing the true similarity. Smaller values
  1542. yield more accurate results but require more sample paths. If None, a
  1543. value of ``sqrt(1/|E|)`` is used, which the authors found empirically
  1544. effective.
  1545. weight : string or None, optional (default="weight")
  1546. The name of an edge attribute that holds the numerical value
  1547. used as a weight. If `None` then each edge has weight 1.
  1548. seed : integer, random_state, or None (default)
  1549. Indicator of random number generation state.
  1550. See :ref:`Randomness<randomness>`.
  1551. Returns
  1552. -------
  1553. similarity : dict
  1554. Dict of nodes to similarity scores (as floats).
  1555. Note: the self-similarity (i.e., `node`) is not included in the dict.
  1556. Examples
  1557. --------
  1558. >>> G = nx.star_graph(100)
  1559. The "hub" node is distinct from the "spoke" nodes
  1560. >>> from pprint import pprint
  1561. >>> pprint(nx.panther_vector_similarity(G, source=0, seed=42))
  1562. {35: 0.10402634656233918,
  1563. 61: 0.10434063328712018,
  1564. 65: 0.10401247833456054,
  1565. 85: 0.10506718868571752,
  1566. 88: 0.10402634656233918}
  1567. But "spoke" nodes are similar to one another
  1568. >>> result = nx.panther_vector_similarity(G, source=1, seed=42)
  1569. >>> len(result)
  1570. 5
  1571. >>> all(similarity == 1.0 for similarity in result.values())
  1572. True
  1573. Notes
  1574. -----
  1575. Results may be nondeterministic when feature vectors have the same distances,
  1576. as the KDTree's internal tie-breaking behavior can vary between runs.
  1577. Using the same ``seed`` parameter ensures reproducible results.
  1578. References
  1579. ----------
  1580. .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
  1581. Panther: Fast top-k similarity search on large networks.
  1582. In Proceedings of the ACM SIGKDD International Conference
  1583. on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
  1584. Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
  1585. """
  1586. import numpy as np
  1587. import scipy as sp
  1588. # Use helper method to prepare common data structures but keep isolates in the graph
  1589. G, inv_node_map, index_map, inv_sample_size, eps = _prepare_panther_paths(
  1590. G,
  1591. source,
  1592. path_length=path_length,
  1593. c=c,
  1594. delta=delta,
  1595. eps=eps,
  1596. weight=weight,
  1597. remove_isolates=False,
  1598. k=k,
  1599. seed=seed,
  1600. )
  1601. num_nodes = G.number_of_nodes()
  1602. node_list = list(G.nodes)
  1603. # Ensure D doesn't exceed the number of nodes
  1604. if num_nodes < D:
  1605. raise nx.NetworkXUnfeasible(
  1606. f"The number of requested similarity scores {D} is greater than the number of nodes {num_nodes}."
  1607. )
  1608. similarities = np.zeros((num_nodes, num_nodes))
  1609. theta = np.zeros((num_nodes, D))
  1610. index_map_sets = {node: set(paths) for node, paths in index_map.items()}
  1611. # Calculate the path similarities for each node
  1612. for vi_idx, vi in enumerate(G.nodes):
  1613. vi_paths = index_map_sets[vi]
  1614. for node, node_paths in index_map_sets.items():
  1615. # Calculate similarity score
  1616. common_path_count = len(vi_paths.intersection(node_paths))
  1617. similarities[vi_idx, inv_node_map[node]] = (
  1618. common_path_count * inv_sample_size
  1619. )
  1620. # Build up the feature vector using the largest D similarity scores
  1621. theta[vi_idx] = np.sort(np.partition(similarities[vi_idx], -D)[-D:])[::-1]
  1622. # Insert the feature vectors into a k-d tree
  1623. # for fast retrieval
  1624. kdtree = sp.spatial.KDTree(theta)
  1625. # Retrieve top ``k+1`` similar vertices (i.e., vectors)
  1626. # (based on their Euclidean distance)
  1627. # Note that it's k+1 because the source node will be included and later removed
  1628. query_k = min(k + 1, num_nodes)
  1629. neighbor_distances, nearest_neighbors = kdtree.query(
  1630. theta[inv_node_map[source]], k=query_k
  1631. )
  1632. # Ensure results are always arrays (KDTree returns scalars when k=1)
  1633. neighbor_distances = np.atleast_1d(neighbor_distances)
  1634. nearest_neighbors = np.atleast_1d(nearest_neighbors)
  1635. # The paper defines the similarity S(v_i, v_j) as
  1636. # 1 / || Theta(v_i) - Theta(v_j) ||
  1637. # Calculate reciprocals and normalize to [0, 1] range
  1638. # Handle the case where distances are very small or zero (common in small graphs)
  1639. # Use the passed in eps parameter instead of defining a new epsilon
  1640. neighbor_distances = np.maximum(neighbor_distances, eps)
  1641. similarities = 1 / neighbor_distances
  1642. # Always normalize to ensure values are between 0 and 1
  1643. if len(similarities) > 0 and (max_sim := np.max(similarities)) > 0:
  1644. similarities /= max_sim
  1645. # Add back the similarity scores (i.e., distances)
  1646. # Convert numpy scalars to native Python types for dispatch compatibility
  1647. top_k_with_val = dict(
  1648. zip((node_list[n] for n in nearest_neighbors), similarities.tolist())
  1649. )
  1650. # Remove the self-similarity
  1651. top_k_with_val.pop(source, None)
  1652. # Ensure we return exactly k results (sorted by similarity)
  1653. if len(top_k_with_val) > k:
  1654. sorted_items = sorted(top_k_with_val.items(), key=lambda x: x[1], reverse=True)
  1655. top_k_with_val = dict(sorted_items[:k])
  1656. return top_k_with_val
  1657. @np_random_state("seed")
  1658. @nx._dispatchable(edge_attrs="weight")
  1659. def generate_random_paths(
  1660. G,
  1661. sample_size,
  1662. path_length=5,
  1663. index_map=None,
  1664. weight="weight",
  1665. seed=None,
  1666. *,
  1667. source=None,
  1668. ):
  1669. """Randomly generate `sample_size` paths of length `path_length`.
  1670. Parameters
  1671. ----------
  1672. G : NetworkX graph
  1673. A NetworkX graph
  1674. sample_size : integer
  1675. The number of paths to generate. This is ``R`` in [1]_.
  1676. path_length : integer (default = 5)
  1677. The maximum size of the path to randomly generate.
  1678. This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
  1679. recommended.
  1680. index_map : dictionary, optional
  1681. If provided, this will be populated with the inverted
  1682. index of nodes mapped to the set of generated random path
  1683. indices within ``paths``.
  1684. weight : string or None, optional (default="weight")
  1685. The name of an edge attribute that holds the numerical value
  1686. used as a weight. If None then each edge has weight 1.
  1687. seed : integer, random_state, or None (default)
  1688. Indicator of random number generation state.
  1689. See :ref:`Randomness<randomness>`.
  1690. source : node, optional
  1691. Node to use as the starting point for all generated paths.
  1692. If None then starting nodes are selected at random with uniform probability.
  1693. Returns
  1694. -------
  1695. paths : generator of lists
  1696. Generator of `sample_size` paths each with length `path_length`.
  1697. Examples
  1698. --------
  1699. The generator yields `sample_size` number of paths of length `path_length`
  1700. drawn from `G`:
  1701. >>> G = nx.complete_graph(5)
  1702. >>> next(nx.generate_random_paths(G, sample_size=1, path_length=3, seed=42))
  1703. [3, 4, 2, 3]
  1704. >>> list(nx.generate_random_paths(G, sample_size=3, path_length=4, seed=42))
  1705. [[3, 4, 2, 3, 0], [2, 0, 2, 1, 0], [2, 0, 4, 3, 0]]
  1706. By passing a dictionary into `index_map`, it will build an
  1707. inverted index mapping of nodes to the paths in which that node is present:
  1708. >>> G = nx.wheel_graph(10)
  1709. >>> index_map = {}
  1710. >>> random_paths = list(
  1711. ... nx.generate_random_paths(G, sample_size=3, index_map=index_map, seed=2771)
  1712. ... )
  1713. >>> random_paths
  1714. [[3, 2, 1, 9, 8, 7], [4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]
  1715. >>> paths_containing_node_0 = [
  1716. ... random_paths[path_idx] for path_idx in index_map.get(0, [])
  1717. ... ]
  1718. >>> paths_containing_node_0
  1719. [[4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]
  1720. References
  1721. ----------
  1722. .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
  1723. Panther: Fast top-k similarity search on large networks.
  1724. In Proceedings of the ACM SIGKDD International Conference
  1725. on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
  1726. Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
  1727. """
  1728. import numpy as np
  1729. randint_fn = (
  1730. seed.integers if isinstance(seed, np.random.Generator) else seed.randint
  1731. )
  1732. # Calculate transition probabilities between
  1733. # every pair of vertices according to Eq. (3)
  1734. adj_mat = nx.to_numpy_array(G, weight=weight)
  1735. # Handle isolated nodes by checking for zero row sums
  1736. row_sums = adj_mat.sum(axis=1).reshape(-1, 1)
  1737. inv_row_sums = np.reciprocal(row_sums)
  1738. transition_probabilities = adj_mat * inv_row_sums
  1739. node_map = list(G)
  1740. num_nodes = G.number_of_nodes()
  1741. for path_index in range(sample_size):
  1742. if source is None:
  1743. # Sample current vertex v = v_i uniformly at random
  1744. node_index = randint_fn(num_nodes)
  1745. node = node_map[node_index]
  1746. else:
  1747. if source not in node_map:
  1748. raise nx.NodeNotFound(f"Initial node {source} not in G")
  1749. node = source
  1750. node_index = node_map.index(node)
  1751. # Add v into p_r and add p_r into the path set
  1752. # of v, i.e., P_v
  1753. path = [node]
  1754. # Build the inverted index (P_v) of vertices to paths
  1755. if index_map is not None:
  1756. if node in index_map:
  1757. index_map[node].add(path_index)
  1758. else:
  1759. index_map[node] = {path_index}
  1760. starting_index = node_index
  1761. for _ in range(path_length):
  1762. # Randomly sample a neighbor (v_j) according
  1763. # to transition probabilities from ``node`` (v) to its neighbors
  1764. nbr_index = seed.choice(
  1765. num_nodes, p=transition_probabilities[starting_index]
  1766. )
  1767. # Set current vertex (v = v_j)
  1768. starting_index = nbr_index
  1769. # Add v into p_r
  1770. nbr_node = node_map[nbr_index]
  1771. path.append(nbr_node)
  1772. # Add p_r into P_v
  1773. if index_map is not None:
  1774. if nbr_node in index_map:
  1775. index_map[nbr_node].add(path_index)
  1776. else:
  1777. index_map[nbr_node] = {path_index}
  1778. yield path