bipartitions.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. """Functions for splitting a network into two communities (finding a bipartition)."""
  2. import random
  3. from copy import deepcopy
  4. from itertools import count
  5. import networkx as nx
  6. __all__ = [
  7. "kernighan_lin_bisection",
  8. "spectral_modularity_bipartition",
  9. "greedy_node_swap_bipartition",
  10. ]
  11. def _kernighan_lin_sweep(edge_info, side):
  12. """
  13. This is a modified form of Kernighan-Lin, which moves single nodes at a
  14. time, alternating between sides to keep the bisection balanced. We keep
  15. two min-heaps of swap costs to make optimal-next-move selection fast.
  16. """
  17. heap0, heap1 = cost_heaps = nx.utils.BinaryHeap(), nx.utils.BinaryHeap()
  18. # we use heap methods insert, pop, and get
  19. for u, nbrs in edge_info.items():
  20. cost_u = sum(wt if side[v] else -wt for v, wt in nbrs.items())
  21. if side[u]:
  22. heap1.insert(u, cost_u)
  23. else:
  24. heap0.insert(u, -cost_u)
  25. def _update_heap_values(node):
  26. side_node = side[node]
  27. for nbr, wt in edge_info[node].items():
  28. side_nbr = side[nbr]
  29. if side_nbr == side_node:
  30. wt = -wt
  31. heap_nbr = cost_heaps[side_nbr]
  32. if nbr in heap_nbr:
  33. cost_nbr = heap_nbr.get(nbr) + 2 * wt
  34. # allow_increase lets us update a value already on the heap
  35. heap_nbr.insert(nbr, cost_nbr, allow_increase=True)
  36. i = 0
  37. totcost = 0
  38. while heap0 and heap1:
  39. u, cost_u = heap0.pop()
  40. _update_heap_values(u)
  41. v, cost_v = heap1.pop()
  42. _update_heap_values(v)
  43. totcost += cost_u + cost_v
  44. i += 1
  45. yield totcost, i, (u, v)
  46. @nx.utils.not_implemented_for("directed")
  47. @nx.utils.py_random_state(4)
  48. @nx._dispatchable(edge_attrs="weight")
  49. def kernighan_lin_bisection(G, partition=None, max_iter=10, weight="weight", seed=None):
  50. """Partition a graph into two blocks using the Kernighan–Lin algorithm.
  51. This algorithm partitions a network into two sets by iteratively
  52. swapping pairs of nodes to reduce the edge cut between the two sets. The
  53. pairs are chosen according to a modified form of Kernighan-Lin [1]_, which
  54. moves nodes individually, alternating between sides to keep the bisection
  55. balanced.
  56. Kernighan-Lin is an approximate algorithm for maximal modularity bisection.
  57. In [2]_ they suggest that fine-tuned improvements can be made using
  58. greedy node swapping, (see `greedy_node_swap_bipartition`).
  59. The improvements are typically only a few percent of the modularity value.
  60. But they claim that can make a difference between a good and excellent method.
  61. This function does not perform any improvements. But you can do that yourself.
  62. Parameters
  63. ----------
  64. G : NetworkX graph
  65. Graph must be undirected.
  66. partition : tuple
  67. Pair of iterables containing an initial partition. If not
  68. specified, a random balanced partition is used.
  69. max_iter : int
  70. Maximum number of times to attempt swaps to find an
  71. improvement before giving up.
  72. weight : string or function (default: "weight")
  73. If this is a string, then edge weights will be accessed via the
  74. edge attribute with this key (that is, the weight of the edge
  75. joining `u` to `v` will be ``G.edges[u, v][weight]``). If no
  76. such edge attribute exists, the weight of the edge is assumed to
  77. be one.
  78. If this is a function, the weight of an edge is the value
  79. returned by the function. The function must accept exactly three
  80. positional arguments: the two endpoints of an edge and the
  81. dictionary of edge attributes for that edge. The function must
  82. return a number or None to indicate a hidden edge.
  83. seed : integer, random_state, or None (default)
  84. Indicator of random number generation state.
  85. See :ref:`Randomness<randomness>`.
  86. Only used if partition is None
  87. Returns
  88. -------
  89. partition : tuple
  90. A pair of sets of nodes representing the bipartition.
  91. Raises
  92. ------
  93. NetworkXError
  94. If `partition` is not a valid partition of the nodes of the graph.
  95. References
  96. ----------
  97. .. [1] Kernighan, B. W.; Lin, Shen (1970).
  98. "An efficient heuristic procedure for partitioning graphs."
  99. *Bell Systems Technical Journal* 49: 291--307.
  100. Oxford University Press 2011.
  101. .. [2] M. E. J. Newman,
  102. "Modularity and community structure in networks",
  103. PNAS, 103 (23), p. 8577-8582,
  104. https://doi.org/10.1073/pnas.0601602103
  105. """
  106. nodes = list(G)
  107. if partition is None:
  108. seed.shuffle(nodes)
  109. mid = len(nodes) // 2
  110. A, B = nodes[:mid], nodes[mid:]
  111. else:
  112. try:
  113. A, B = partition
  114. except (TypeError, ValueError) as err:
  115. raise nx.NetworkXError("partition must be two sets") from err
  116. if not nx.community.is_partition(G, [A, B]):
  117. raise nx.NetworkXError("partition invalid")
  118. side = {node: (node in A) for node in nodes}
  119. # ruff: noqa: E731 skips check for no lambda functions
  120. # Using shortest_paths _weight_function with sum instead of min on multiedges
  121. if callable(weight):
  122. sum_weight = weight
  123. elif G.is_multigraph():
  124. sum_weight = lambda u, v, d: sum(dd.get(weight, 1) for dd in d.values())
  125. else:
  126. sum_weight = lambda u, v, d: d.get(weight, 1)
  127. edge_info = {
  128. u: {v: wt for v, d in nbrs.items() if (wt := sum_weight(u, v, d)) is not None}
  129. for u, nbrs in G._adj.items()
  130. }
  131. for i in range(max_iter):
  132. costs = list(_kernighan_lin_sweep(edge_info, side))
  133. # find out how many edges to update: min_i
  134. min_cost, min_i, _ = min(costs)
  135. if min_cost >= 0:
  136. break
  137. for _, _, (u, v) in costs[:min_i]:
  138. side[u] = 1
  139. side[v] = 0
  140. part1 = {u for u, s in side.items() if s == 0}
  141. part2 = {u for u, s in side.items() if s == 1}
  142. return part1, part2
  143. @nx.utils.not_implemented_for("directed")
  144. @nx.utils.not_implemented_for("multigraph")
  145. def spectral_modularity_bipartition(G):
  146. r"""Return a bipartition of the nodes based on the spectrum of the
  147. modularity matrix of the graph.
  148. This method calculates the eigenvector associated with the second
  149. largest eigenvalue of the modularity matrix, where the modularity
  150. matrix *B* is defined by
  151. ..math::
  152. B_{i j} = A_{i j} - \frac{k_i k_j}{2 m},
  153. where *A* is the adjacency matrix, `k_i` is the degree of node *i*,
  154. and *m* is the number of edges in the graph. Nodes whose
  155. corresponding values in the eigenvector are negative are placed in
  156. one block, nodes whose values are nonnegative are placed in another
  157. block.
  158. Parameters
  159. ----------
  160. G : NetworkX graph
  161. Returns
  162. -------
  163. C : tuple
  164. Pair of communities as two sets of nodes of ``G``, partitioned
  165. according to second largest eigenvalue of the modularity matrix.
  166. Examples
  167. --------
  168. >>> G = nx.karate_club_graph()
  169. >>> MrHi, Officer = nx.community.spectral_modularity_bipartition(G)
  170. >>> MrHi, Officer = sorted([sorted(MrHi), sorted(Officer)])
  171. >>> MrHi
  172. [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 16, 17, 19, 21]
  173. >>> Officer
  174. [8, 9, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
  175. References
  176. ----------
  177. .. [1] M. E. J. Newman *Networks: An Introduction*, pages 373--378
  178. Oxford University Press 2011.
  179. .. [2] M. E. J. Newman,
  180. "Modularity and community structure in networks",
  181. PNAS, 103 (23), p. 8577-8582,
  182. https://doi.org/10.1073/pnas.0601602103
  183. """
  184. import numpy as np
  185. B = nx.linalg.modularity_matrix(G)
  186. eigenvalues, eigenvectors = np.linalg.eig(B)
  187. index = np.argsort(eigenvalues)[-1]
  188. v2 = zip(np.real(eigenvectors[:, index]), G)
  189. left, right = set(), set()
  190. for u, n in v2:
  191. if u < 0:
  192. left.add(n)
  193. else:
  194. right.add(n)
  195. return left, right
  196. @nx.utils.not_implemented_for("multigraph")
  197. def greedy_node_swap_bipartition(G, *, init_split=None, max_iter=10):
  198. """Split the nodes into two communities based on greedy
  199. modularity maximization.
  200. The algorithm works by selecting a node to change communities which
  201. will maximize the modularity. The swap is made and the community
  202. structure with the highest modularity is kept.
  203. Parameters
  204. ----------
  205. G : NetworkX graph
  206. init_split : 2-tuple of sets of nodes
  207. Pair of sets of nodes in ``G`` providing an initial bipartition
  208. for the algorithm. If not specified, a random balanced partition
  209. is used. If this pair of sets is not a partition of the nodes of `G`,
  210. :exc:`NetworkXException` is raised.
  211. max_iter : int
  212. Maximum number of iterations of attempting swaps to find an improvement.
  213. Returns
  214. -------
  215. max_split : 2-tuple of sets of nodes
  216. Pair of sets of nodes of ``G``, partitioned according to a
  217. node swap greedy modularity maximization algorithm.
  218. Raises
  219. ------
  220. NetworkXError
  221. if init_split is not a valid partition of the
  222. graph into two communities or if G is a MultiGraph
  223. Examples
  224. --------
  225. >>> G = nx.barbell_graph(3, 0)
  226. >>> left, right = nx.community.greedy_node_swap_bipartition(G)
  227. >>> # Sort the communities so the nodes appear in increasing order.
  228. >>> left, right = sorted([sorted(left), sorted(right)])
  229. >>> sorted(left)
  230. [0, 1, 2]
  231. >>> sorted(right)
  232. [3, 4, 5]
  233. Notes
  234. -----
  235. This function is not implemented for multigraphs.
  236. References
  237. ----------
  238. .. [1] M. E. J. Newman "Networks: An Introduction", pages 373--375.
  239. Oxford University Press 2011.
  240. """
  241. if init_split is None:
  242. m1 = len(G) // 2
  243. m2 = len(G) - m1
  244. some_nodes = set(random.sample(list(G), m1))
  245. other_nodes = {n for n in G if n not in some_nodes}
  246. best_split_so_far = (some_nodes, other_nodes)
  247. else:
  248. if not nx.community.is_partition(G, init_split):
  249. raise nx.NetworkXError("init_split is not a partition of G")
  250. if not len(init_split) == 2:
  251. raise nx.NetworkXError("init_split must be a bipartition")
  252. best_split_so_far = deepcopy(init_split)
  253. best_mod = nx.community.modularity(G, best_split_so_far)
  254. max_split, max_mod = best_split_so_far, best_mod
  255. its = 0
  256. m = G.number_of_edges()
  257. G_degree = dict(G.degree)
  258. while max_mod >= best_mod and its < max_iter:
  259. best_split_so_far = max_split
  260. best_mod = max_mod
  261. next_split = deepcopy(max_split)
  262. next_mod = max_mod
  263. nodes = set(G)
  264. while nodes:
  265. max_swap = -1
  266. max_node = None
  267. max_node_comm = None
  268. left, right = next_split
  269. leftd = sum(G_degree[n] for n in left)
  270. rightd = sum(G_degree[n] for n in right)
  271. for n in nodes:
  272. if n in left:
  273. in_comm, out_comm = left, right
  274. in_deg, out_deg = leftd, rightd
  275. else:
  276. in_comm, out_comm = right, left
  277. in_deg, out_deg = rightd, leftd
  278. d_eii = -len(G[n].keys() & in_comm) / m
  279. d_ejj = len(G[n].keys() & out_comm) / m
  280. deg = G_degree[n]
  281. d_sum_ai = (deg / (2 * m**2)) * (in_deg - out_deg - deg)
  282. swap_change = d_eii + d_ejj + d_sum_ai
  283. if swap_change > max_swap:
  284. max_swap = swap_change
  285. max_node = n
  286. max_node_comm = in_comm
  287. non_max_node_comm = out_comm
  288. # swap the node from one comm to the other
  289. max_node_comm.remove(max_node)
  290. non_max_node_comm.add(max_node)
  291. next_mod += max_swap
  292. # deepcopy next_split each time it reaches a high (might go lower later)
  293. if next_mod > max_mod:
  294. max_split, max_mod = deepcopy(next_split), next_mod
  295. nodes.remove(max_node)
  296. its += 1
  297. return best_split_so_far