graph_hashing.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. """
  2. Functions for hashing graphs to strings.
  3. Isomorphic graphs should be assigned identical hashes.
  4. For now, only Weisfeiler-Lehman hashing is implemented.
  5. """
  6. import warnings
  7. from collections import Counter, defaultdict
  8. from hashlib import blake2b
  9. import networkx as nx
  10. __all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"]
  11. def _hash_label(label, digest_size):
  12. return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest()
  13. def _init_node_labels(G, edge_attr, node_attr):
  14. if node_attr:
  15. return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)}
  16. elif edge_attr:
  17. return {u: "" for u in G}
  18. else:
  19. warnings.warn(
  20. "The hashes produced for graphs without node or edge attributes "
  21. "changed in v3.5 due to a bugfix (see documentation).",
  22. UserWarning,
  23. stacklevel=2,
  24. )
  25. if nx.is_directed(G):
  26. return {u: str(G.in_degree(u)) + "_" + str(G.out_degree(u)) for u in G}
  27. else:
  28. return {u: str(deg) for u, deg in G.degree()}
  29. def _neighborhood_aggregate_undirected(G, node, node_labels, edge_attr=None):
  30. """
  31. Compute new labels for given node in an undirected graph by aggregating
  32. the labels of each node's neighbors.
  33. """
  34. label_list = []
  35. for nbr in G.neighbors(node):
  36. prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr])
  37. label_list.append(prefix + node_labels[nbr])
  38. return node_labels[node] + "".join(sorted(label_list))
  39. def _neighborhood_aggregate_directed(G, node, node_labels, edge_attr=None):
  40. """
  41. Compute new labels for given node in a directed graph by aggregating
  42. the labels of each node's neighbors.
  43. """
  44. successor_labels = []
  45. for nbr in G.successors(node):
  46. prefix = "s_" + "" if edge_attr is None else str(G[node][nbr][edge_attr])
  47. successor_labels.append(prefix + node_labels[nbr])
  48. predecessor_labels = []
  49. for nbr in G.predecessors(node):
  50. prefix = "p_" + "" if edge_attr is None else str(G[nbr][node][edge_attr])
  51. predecessor_labels.append(prefix + node_labels[nbr])
  52. return (
  53. node_labels[node]
  54. + "".join(sorted(successor_labels))
  55. + "".join(sorted(predecessor_labels))
  56. )
  57. @nx.utils.not_implemented_for("multigraph")
  58. @nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
  59. def weisfeiler_lehman_graph_hash(
  60. G, edge_attr=None, node_attr=None, iterations=3, digest_size=16
  61. ):
  62. """Return Weisfeiler Lehman (WL) graph hash.
  63. .. Warning:: Hash values for directed graphs and graphs without edge or
  64. node attributes have changed in v3.5. In previous versions,
  65. directed graphs did not distinguish in- and outgoing edges. Also,
  66. graphs without attributes set initial states such that effectively
  67. one extra iteration of WL occurred than indicated by `iterations`.
  68. For undirected graphs without node or edge labels, the old
  69. hashes can be obtained by increasing the iteration count by one.
  70. For more details, see `issue #7806
  71. <https://github.com/networkx/networkx/issues/7806>`_.
  72. The function iteratively aggregates and hashes neighborhoods of each node.
  73. After each node's neighbors are hashed to obtain updated node labels,
  74. a hashed histogram of resulting labels is returned as the final hash.
  75. Hashes are identical for isomorphic graphs and strong guarantees that
  76. non-isomorphic graphs will get different hashes. See [1]_ for details.
  77. If no node or edge attributes are provided, the degree of each node
  78. is used as its initial label.
  79. Otherwise, node and/or edge labels are used to compute the hash.
  80. Parameters
  81. ----------
  82. G : graph
  83. The graph to be hashed.
  84. Can have node and/or edge attributes. Can also have no attributes.
  85. edge_attr : string, optional (default=None)
  86. The key in edge attribute dictionary to be used for hashing.
  87. If None, edge labels are ignored.
  88. node_attr: string, optional (default=None)
  89. The key in node attribute dictionary to be used for hashing.
  90. If None, and no edge_attr given, use the degrees of the nodes as labels.
  91. iterations: int, optional (default=3)
  92. Number of neighbor aggregations to perform.
  93. Should be larger for larger graphs.
  94. digest_size: int, optional (default=16)
  95. Size (in bytes) of blake2b hash digest to use for hashing node labels.
  96. Returns
  97. -------
  98. h : string
  99. Hexadecimal string corresponding to hash of `G` (length ``2 * digest_size``).
  100. Raises
  101. ------
  102. ValueError
  103. If `iterations` is not a positve number.
  104. Examples
  105. --------
  106. Two graphs with edge attributes that are isomorphic, except for
  107. differences in the edge labels.
  108. >>> G1 = nx.Graph()
  109. >>> G1.add_edges_from(
  110. ... [
  111. ... (1, 2, {"label": "A"}),
  112. ... (2, 3, {"label": "A"}),
  113. ... (3, 1, {"label": "A"}),
  114. ... (1, 4, {"label": "B"}),
  115. ... ]
  116. ... )
  117. >>> G2 = nx.Graph()
  118. >>> G2.add_edges_from(
  119. ... [
  120. ... (5, 6, {"label": "B"}),
  121. ... (6, 7, {"label": "A"}),
  122. ... (7, 5, {"label": "A"}),
  123. ... (7, 8, {"label": "A"}),
  124. ... ]
  125. ... )
  126. Omitting the `edge_attr` option, results in identical hashes.
  127. >>> nx.weisfeiler_lehman_graph_hash(G1)
  128. 'c045439172215f49e0bef8c3d26c6b61'
  129. >>> nx.weisfeiler_lehman_graph_hash(G2)
  130. 'c045439172215f49e0bef8c3d26c6b61'
  131. With edge labels, the graphs are no longer assigned
  132. the same hash digest.
  133. >>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label")
  134. 'c653d85538bcf041d88c011f4f905f10'
  135. >>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label")
  136. '3dcd84af1ca855d0eff3c978d88e7ec7'
  137. Notes
  138. -----
  139. To return the WL hashes of each subgraph of a graph, use
  140. `weisfeiler_lehman_subgraph_hashes`
  141. Similarity between hashes does not imply similarity between graphs.
  142. References
  143. ----------
  144. .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
  145. Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
  146. Graph Kernels. Journal of Machine Learning Research. 2011.
  147. http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
  148. See also
  149. --------
  150. weisfeiler_lehman_subgraph_hashes
  151. """
  152. if G.is_directed():
  153. _neighborhood_aggregate = _neighborhood_aggregate_directed
  154. warnings.warn(
  155. "The hashes produced for directed graphs changed in version v3.5"
  156. " due to a bugfix to track in and out edges separately (see documentation).",
  157. UserWarning,
  158. stacklevel=2,
  159. )
  160. else:
  161. _neighborhood_aggregate = _neighborhood_aggregate_undirected
  162. def weisfeiler_lehman_step(G, labels, edge_attr=None):
  163. """
  164. Apply neighborhood aggregation to each node
  165. in the graph.
  166. Computes a dictionary with labels for each node.
  167. """
  168. new_labels = {}
  169. for node in G.nodes():
  170. label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
  171. new_labels[node] = _hash_label(label, digest_size)
  172. return new_labels
  173. if iterations <= 0:
  174. raise ValueError("The WL algorithm requires that `iterations` be positive")
  175. # set initial node labels
  176. node_labels = _init_node_labels(G, edge_attr, node_attr)
  177. # If the graph has no attributes, initial labels are the nodes' degrees.
  178. # This is equivalent to doing the first iterations of WL.
  179. if not edge_attr and not node_attr:
  180. iterations -= 1
  181. subgraph_hash_counts = []
  182. for _ in range(iterations):
  183. node_labels = weisfeiler_lehman_step(G, node_labels, edge_attr=edge_attr)
  184. counter = Counter(node_labels.values())
  185. # sort the counter, extend total counts
  186. subgraph_hash_counts.extend(sorted(counter.items(), key=lambda x: x[0]))
  187. # hash the final counter
  188. return _hash_label(str(tuple(subgraph_hash_counts)), digest_size)
  189. @nx.utils.not_implemented_for("multigraph")
  190. @nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
  191. def weisfeiler_lehman_subgraph_hashes(
  192. G,
  193. edge_attr=None,
  194. node_attr=None,
  195. iterations=3,
  196. digest_size=16,
  197. include_initial_labels=False,
  198. ):
  199. """
  200. Return a dictionary of subgraph hashes by node.
  201. .. Warning:: Hash values for directed graphs have changed in version
  202. v3.5. In previous versions, directed graphs did not distinguish in-
  203. and outgoing edges.
  204. Graphs without attributes previously performed an extra iteration of
  205. WL at initialisation, which was not visible in the output of this
  206. function. This hash value is now included in the returned dictionary,
  207. shifting the other calculated hashes one position to the right. To
  208. obtain the same last subgraph hash, increase the number of iterations
  209. by one.
  210. For more details, see `issue #7806
  211. <https://github.com/networkx/networkx/issues/7806>`_.
  212. Dictionary keys are nodes in `G`, and values are a list of hashes.
  213. Each hash corresponds to a subgraph rooted at a given node u in `G`.
  214. Lists of subgraph hashes are sorted in increasing order of depth from
  215. their root node, with the hash at index i corresponding to a subgraph
  216. of nodes at most i-hops (i edges) distance from u. Thus, each list will contain
  217. `iterations` elements - a hash for a subgraph at each depth. If
  218. `include_initial_labels` is set to `True`, each list will additionally
  219. have contain a hash of the initial node label (or equivalently a
  220. subgraph of depth 0) prepended, totalling ``iterations + 1`` elements.
  221. The function iteratively aggregates and hashes neighborhoods of each node.
  222. This is achieved for each step by replacing for each node its label from
  223. the previous iteration with its hashed 1-hop neighborhood aggregate.
  224. The new node label is then appended to a list of node labels for each
  225. node.
  226. To aggregate neighborhoods for a node $u$ at each step, all labels of
  227. nodes adjacent to $u$ are concatenated. If the `edge_attr` parameter is set,
  228. labels for each neighboring node are prefixed with the value of this attribute
  229. along the connecting edge from this neighbor to node $u$. The resulting string
  230. is then hashed to compress this information into a fixed digest size.
  231. Thus, at the i-th iteration, nodes within i hops influence any given
  232. hashed node label. We can therefore say that at depth $i$ for node $u$
  233. we have a hash for a subgraph induced by the i-hop neighborhood of $u$.
  234. The output can be used to create general Weisfeiler-Lehman graph kernels,
  235. or generate features for graphs or nodes - for example to generate 'words' in
  236. a graph as seen in the 'graph2vec' algorithm.
  237. See [1]_ & [2]_ respectively for details.
  238. Hashes are identical for isomorphic subgraphs and there exist strong
  239. guarantees that non-isomorphic graphs will get different hashes.
  240. See [1]_ for details.
  241. If no node or edge attributes are provided, the degree of each node
  242. is used as its initial label.
  243. Otherwise, node and/or edge labels are used to compute the hash.
  244. Parameters
  245. ----------
  246. G : graph
  247. The graph to be hashed.
  248. Can have node and/or edge attributes. Can also have no attributes.
  249. edge_attr : string, optional (default=None)
  250. The key in edge attribute dictionary to be used for hashing.
  251. If None, edge labels are ignored.
  252. node_attr : string, optional (default=None)
  253. The key in node attribute dictionary to be used for hashing.
  254. If None, and no edge_attr given, use the degrees of the nodes as labels.
  255. If None, and edge_attr is given, each node starts with an identical label.
  256. iterations : int, optional (default=3)
  257. Number of neighbor aggregations to perform.
  258. Should be larger for larger graphs.
  259. digest_size : int, optional (default=16)
  260. Size (in bytes) of blake2b hash digest to use for hashing node labels.
  261. The default size is 16 bytes.
  262. include_initial_labels : bool, optional (default=False)
  263. If True, include the hashed initial node label as the first subgraph
  264. hash for each node.
  265. Returns
  266. -------
  267. node_subgraph_hashes : dict
  268. A dictionary with each key given by a node in G, and each value given
  269. by the subgraph hashes in order of depth from the key node.
  270. Hashes are hexadecimal strings (hence ``2 * digest_size`` long).
  271. Raises
  272. ------
  273. ValueError
  274. If `iterations` is not a positve number.
  275. Examples
  276. --------
  277. Finding similar nodes in different graphs:
  278. >>> G1 = nx.Graph()
  279. >>> G1.add_edges_from([(1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7)])
  280. >>> G2 = nx.Graph()
  281. >>> G2.add_edges_from([(1, 3), (2, 3), (1, 6), (1, 5), (4, 6)])
  282. >>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes(
  283. ... G1, iterations=4, digest_size=8
  284. ... )
  285. >>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes(
  286. ... G2, iterations=4, digest_size=8
  287. ... )
  288. Even though G1 and G2 are not isomorphic (they have different numbers of edges),
  289. the hash sequence of depth 3 for node 1 in G1 and node 5 in G2 are similar:
  290. >>> g1_hashes[1]
  291. ['f6fc42039fba3776', 'a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0']
  292. >>> g2_hashes[5]
  293. ['f6fc42039fba3776', 'a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc']
  294. The first 3 WL subgraph hashes match. From this we can conclude that it's very
  295. likely the neighborhood of 3 hops around these nodes are isomorphic.
  296. However the 4-hop neighborhoods of ``G1`` and ``G2`` are not isomorphic since the
  297. 4th hashes in the lists above are not equal.
  298. These nodes may be candidates to be classified together since their local topology
  299. is similar.
  300. Notes
  301. -----
  302. To hash the full graph when subgraph hashes are not needed, use
  303. `weisfeiler_lehman_graph_hash` for efficiency.
  304. Similarity between hashes does not imply similarity between graphs.
  305. References
  306. ----------
  307. .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
  308. Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
  309. Graph Kernels. Journal of Machine Learning Research. 2011.
  310. http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
  311. .. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan,
  312. Lihui Chen, Yang Liu and Shantanu Jaiswa. graph2vec: Learning
  313. Distributed Representations of Graphs. arXiv. 2017
  314. https://arxiv.org/pdf/1707.05005.pdf
  315. See also
  316. --------
  317. weisfeiler_lehman_graph_hash
  318. """
  319. if G.is_directed():
  320. _neighborhood_aggregate = _neighborhood_aggregate_directed
  321. warnings.warn(
  322. "The hashes produced for directed graphs changed in v3.5"
  323. " due to a bugfix (see documentation).",
  324. UserWarning,
  325. stacklevel=2,
  326. )
  327. else:
  328. _neighborhood_aggregate = _neighborhood_aggregate_undirected
  329. def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None):
  330. """
  331. Apply neighborhood aggregation to each node
  332. in the graph.
  333. Computes a dictionary with labels for each node.
  334. Appends the new hashed label to the dictionary of subgraph hashes
  335. originating from and indexed by each node in G
  336. """
  337. new_labels = {}
  338. for node in G.nodes():
  339. label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
  340. hashed_label = _hash_label(label, digest_size)
  341. new_labels[node] = hashed_label
  342. node_subgraph_hashes[node].append(hashed_label)
  343. return new_labels
  344. if iterations <= 0:
  345. raise ValueError("The WL algorithm requires that `iterations` be positive")
  346. node_labels = _init_node_labels(G, edge_attr, node_attr)
  347. if include_initial_labels:
  348. node_subgraph_hashes = {
  349. k: [_hash_label(v, digest_size)] for k, v in node_labels.items()
  350. }
  351. else:
  352. node_subgraph_hashes = defaultdict(list)
  353. # If the graph has no attributes, initial labels are the nodes' degrees.
  354. # This is equivalent to doing the first iterations of WL.
  355. if not edge_attr and not node_attr:
  356. iterations -= 1
  357. for node in G.nodes():
  358. hashed_label = _hash_label(node_labels[node], digest_size)
  359. node_subgraph_hashes[node].append(hashed_label)
  360. for _ in range(iterations):
  361. node_labels = weisfeiler_lehman_step(
  362. G, node_labels, node_subgraph_hashes, edge_attr
  363. )
  364. return dict(node_subgraph_hashes)