louvain.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. """Functions for detecting communities based on Louvain Community Detection
  2. Algorithm"""
  3. import itertools
  4. from collections import defaultdict, deque
  5. import networkx as nx
  6. from networkx.algorithms.community import modularity
  7. from networkx.utils import py_random_state
  8. __all__ = ["louvain_communities", "louvain_partitions"]
  9. @py_random_state("seed")
  10. @nx._dispatchable(edge_attrs="weight")
  11. def louvain_communities(
  12. G, weight="weight", resolution=1, threshold=0.0000001, max_level=None, seed=None
  13. ):
  14. r"""Find the best partition of a graph using the Louvain Community Detection
  15. Algorithm.
  16. Louvain Community Detection Algorithm is a simple method to extract the community
  17. structure of a network. This is a heuristic method based on modularity optimization. [1]_
  18. The algorithm works in 2 steps. On the first step it assigns every node to be
  19. in its own community and then for each node it tries to find the maximum positive
  20. modularity gain by moving each node to all of its neighbor communities. If no positive
  21. gain is achieved the node remains in its original community.
  22. The modularity gain obtained by moving an isolated node $i$ into a community $C$ can
  23. easily be calculated by the following formula (combining [1]_ [2]_ and some algebra):
  24. .. math::
  25. \Delta Q = \frac{k_{i,in}}{2m} - \gamma\frac{ \Sigma_{tot} \cdot k_i}{2m^2}
  26. where $m$ is the size of the graph, $k_{i,in}$ is the sum of the weights of the links
  27. from $i$ to nodes in $C$, $k_i$ is the sum of the weights of the links incident to node $i$,
  28. $\Sigma_{tot}$ is the sum of the weights of the links incident to nodes in $C$ and $\gamma$
  29. is the resolution parameter.
  30. For the directed case the modularity gain can be computed using this formula according to [3]_
  31. .. math::
  32. \Delta Q = \frac{k_{i,in}}{m}
  33. - \gamma\frac{k_i^{out} \cdot\Sigma_{tot}^{in} + k_i^{in} \cdot \Sigma_{tot}^{out}}{m^2}
  34. where $k_i^{out}$, $k_i^{in}$ are the outer and inner weighted degrees of node $i$ and
  35. $\Sigma_{tot}^{in}$, $\Sigma_{tot}^{out}$ are the sum of in-going and out-going links incident
  36. to nodes in $C$.
  37. The first phase continues until no individual move can improve the modularity.
  38. The second phase consists in building a new network whose nodes are now the communities
  39. found in the first phase. To do so, the weights of the links between the new nodes are given by
  40. the sum of the weight of the links between nodes in the corresponding two communities. Once this
  41. phase is complete it is possible to reapply the first phase creating bigger communities with
  42. increased modularity.
  43. The above two phases are executed until no modularity gain is achieved (or is less than
  44. the `threshold`, or until `max_levels` is reached).
  45. Be careful with self-loops in the input graph. These are treated as
  46. previously reduced communities -- as if the process had been started
  47. in the middle of the algorithm. Large self-loop edge weights thus
  48. represent strong communities and in practice may be hard to add
  49. other nodes to. If your input graph edge weights for self-loops
  50. do not represent already reduced communities you may want to remove
  51. the self-loops before inputting that graph.
  52. Parameters
  53. ----------
  54. G : NetworkX graph
  55. weight : string or None, optional (default="weight")
  56. The name of an edge attribute that holds the numerical value
  57. used as a weight. If None then each edge has weight 1.
  58. resolution : float, optional (default=1)
  59. If resolution is less than 1, the algorithm favors larger communities.
  60. Greater than 1 favors smaller communities
  61. threshold : float, optional (default=0.0000001)
  62. Modularity gain threshold for each level. If the gain of modularity
  63. between 2 levels of the algorithm is less than the given threshold
  64. then the algorithm stops and returns the resulting communities.
  65. max_level : int or None, optional (default=None)
  66. The maximum number of levels (steps of the algorithm) to compute.
  67. Must be a positive integer or None. If None, then there is no max
  68. level and the threshold parameter determines the stopping condition.
  69. seed : integer, random_state, or None (default)
  70. Indicator of random number generation state.
  71. See :ref:`Randomness<randomness>`.
  72. Returns
  73. -------
  74. list
  75. A list of sets (partition of `G`). Each set represents one community and contains
  76. all the nodes that constitute it.
  77. Examples
  78. --------
  79. >>> import networkx as nx
  80. >>> G = nx.petersen_graph()
  81. >>> nx.community.louvain_communities(G, seed=123)
  82. [{0, 4, 5, 7, 9}, {1, 2, 3, 6, 8}]
  83. Notes
  84. -----
  85. The order in which the nodes are considered can affect the final output. In the algorithm
  86. the ordering happens using a random shuffle.
  87. References
  88. ----------
  89. .. [1] Blondel, V.D. et al. Fast unfolding of communities in
  90. large networks. J. Stat. Mech 10008, 1-12(2008). https://doi.org/10.1088/1742-5468/2008/10/P10008
  91. .. [2] Traag, V.A., Waltman, L. & van Eck, N.J. From Louvain to Leiden: guaranteeing
  92. well-connected communities. Sci Rep 9, 5233 (2019). https://doi.org/10.1038/s41598-019-41695-z
  93. .. [3] Nicolas Dugué, Anthony Perez. Directed Louvain : maximizing modularity in directed networks.
  94. [Research Report] Université d’Orléans. 2015. hal-01231784. https://hal.archives-ouvertes.fr/hal-01231784
  95. See Also
  96. --------
  97. louvain_partitions
  98. :any:`leiden_communities`
  99. """
  100. partitions = louvain_partitions(G, weight, resolution, threshold, seed)
  101. if max_level is not None:
  102. if max_level <= 0:
  103. raise ValueError("max_level argument must be a positive integer or None")
  104. partitions = itertools.islice(partitions, max_level)
  105. final_partition = deque(partitions, maxlen=1)
  106. return final_partition.pop()
  107. @py_random_state("seed")
  108. @nx._dispatchable(edge_attrs="weight")
  109. def louvain_partitions(
  110. G, weight="weight", resolution=1, threshold=0.0000001, seed=None
  111. ):
  112. """Yield partitions for each level of the Louvain Community Detection Algorithm
  113. Louvain Community Detection Algorithm is a simple method to extract the community
  114. structure of a network. This is a heuristic method based on modularity optimization. [1]_
  115. The partitions at each level (step of the algorithm) form a dendrogram of communities.
  116. A dendrogram is a diagram representing a tree and each level represents
  117. a partition of the G graph. The top level contains the smallest communities
  118. and as you traverse to the bottom of the tree the communities get bigger
  119. and the overall modularity increases making the partition better.
  120. Each level is generated by executing the two phases of the Louvain Community
  121. Detection Algorithm.
  122. Be careful with self-loops in the input graph. These are treated as
  123. previously reduced communities -- as if the process had been started
  124. in the middle of the algorithm. Large self-loop edge weights thus
  125. represent strong communities and in practice may be hard to add
  126. other nodes to. If your input graph edge weights for self-loops
  127. do not represent already reduced communities you may want to remove
  128. the self-loops before inputting that graph.
  129. Parameters
  130. ----------
  131. G : NetworkX graph
  132. weight : string or None, optional (default="weight")
  133. The name of an edge attribute that holds the numerical value
  134. used as a weight. If None then each edge has weight 1.
  135. resolution : float, optional (default=1)
  136. If resolution is less than 1, the algorithm favors larger communities.
  137. Greater than 1 favors smaller communities
  138. threshold : float, optional (default=0.0000001)
  139. Modularity gain threshold for each level. If the gain of modularity
  140. between 2 levels of the algorithm is less than the given threshold
  141. then the algorithm stops and returns the resulting communities.
  142. seed : integer, random_state, or None (default)
  143. Indicator of random number generation state.
  144. See :ref:`Randomness<randomness>`.
  145. Yields
  146. ------
  147. list
  148. A list of sets (partition of `G`). Each set represents one community and contains
  149. all the nodes that constitute it.
  150. References
  151. ----------
  152. .. [1] Blondel, V.D. et al. Fast unfolding of communities in
  153. large networks. J. Stat. Mech 10008, 1-12(2008)
  154. See Also
  155. --------
  156. louvain_communities
  157. :any:`leiden_partitions`
  158. """
  159. partition = [{u} for u in G.nodes()]
  160. if nx.is_empty(G):
  161. yield partition
  162. return
  163. mod = modularity(G, partition, resolution=resolution, weight=weight)
  164. is_directed = G.is_directed()
  165. if G.is_multigraph():
  166. graph = _convert_multigraph(G, weight, is_directed)
  167. else:
  168. graph = G.__class__()
  169. graph.add_nodes_from(G)
  170. graph.add_weighted_edges_from(G.edges(data=weight, default=1))
  171. m = graph.size(weight="weight")
  172. partition, inner_partition, improvement = _one_level(
  173. graph, m, partition, resolution, is_directed, seed
  174. )
  175. improvement = True
  176. while improvement:
  177. # gh-5901 protect the sets in the yielded list from further manipulation here
  178. yield [s.copy() for s in partition]
  179. new_mod = modularity(
  180. graph, inner_partition, resolution=resolution, weight="weight"
  181. )
  182. if new_mod - mod <= threshold:
  183. return
  184. mod = new_mod
  185. graph = _gen_graph(graph, inner_partition)
  186. partition, inner_partition, improvement = _one_level(
  187. graph, m, partition, resolution, is_directed, seed
  188. )
  189. def _one_level(G, m, partition, resolution=1, is_directed=False, seed=None):
  190. """Calculate one level of the Louvain partitions tree
  191. Parameters
  192. ----------
  193. G : NetworkX Graph/DiGraph
  194. The graph from which to detect communities
  195. m : number
  196. The size of the graph `G`.
  197. partition : list of sets of nodes
  198. A valid partition of the graph `G`
  199. resolution : positive number
  200. The resolution parameter for computing the modularity of a partition
  201. is_directed : bool
  202. True if `G` is a directed graph.
  203. seed : integer, random_state, or None (default)
  204. Indicator of random number generation state.
  205. See :ref:`Randomness<randomness>`.
  206. """
  207. node2com = {u: i for i, u in enumerate(G.nodes())}
  208. inner_partition = [{u} for u in G.nodes()]
  209. if is_directed:
  210. in_degrees = dict(G.in_degree(weight="weight"))
  211. out_degrees = dict(G.out_degree(weight="weight"))
  212. Stot_in = list(in_degrees.values())
  213. Stot_out = list(out_degrees.values())
  214. # Calculate weights for both in and out neighbors without considering self-loops
  215. nbrs = {}
  216. for u in G:
  217. nbrs[u] = defaultdict(float)
  218. for _, n, wt in G.out_edges(u, data="weight"):
  219. if u != n:
  220. nbrs[u][n] += wt
  221. for n, _, wt in G.in_edges(u, data="weight"):
  222. if u != n:
  223. nbrs[u][n] += wt
  224. else:
  225. degrees = dict(G.degree(weight="weight"))
  226. Stot = list(degrees.values())
  227. nbrs = {u: {v: data["weight"] for v, data in G[u].items() if v != u} for u in G}
  228. rand_nodes = list(G.nodes)
  229. seed.shuffle(rand_nodes)
  230. nb_moves = 1
  231. improvement = False
  232. while nb_moves > 0:
  233. nb_moves = 0
  234. for u in rand_nodes:
  235. best_mod = 0
  236. best_com = node2com[u]
  237. weights2com = _neighbor_weights(nbrs[u], node2com)
  238. if is_directed:
  239. in_degree = in_degrees[u]
  240. out_degree = out_degrees[u]
  241. Stot_in[best_com] -= in_degree
  242. Stot_out[best_com] -= out_degree
  243. remove_cost = (
  244. -weights2com[best_com] / m
  245. + resolution
  246. * (out_degree * Stot_in[best_com] + in_degree * Stot_out[best_com])
  247. / m**2
  248. )
  249. else:
  250. degree = degrees[u]
  251. Stot[best_com] -= degree
  252. remove_cost = -weights2com[best_com] / m + resolution * (
  253. Stot[best_com] * degree
  254. ) / (2 * m**2)
  255. for nbr_com, wt in weights2com.items():
  256. if is_directed:
  257. gain = (
  258. remove_cost
  259. + wt / m
  260. - resolution
  261. * (
  262. out_degree * Stot_in[nbr_com]
  263. + in_degree * Stot_out[nbr_com]
  264. )
  265. / m**2
  266. )
  267. else:
  268. gain = (
  269. remove_cost
  270. + wt / m
  271. - resolution * (Stot[nbr_com] * degree) / (2 * m**2)
  272. )
  273. if gain > best_mod:
  274. best_mod = gain
  275. best_com = nbr_com
  276. if is_directed:
  277. Stot_in[best_com] += in_degree
  278. Stot_out[best_com] += out_degree
  279. else:
  280. Stot[best_com] += degree
  281. if best_com != node2com[u]:
  282. com = G.nodes[u].get("nodes", {u})
  283. partition[node2com[u]].difference_update(com)
  284. inner_partition[node2com[u]].remove(u)
  285. partition[best_com].update(com)
  286. inner_partition[best_com].add(u)
  287. improvement = True
  288. nb_moves += 1
  289. node2com[u] = best_com
  290. partition = list(filter(len, partition))
  291. inner_partition = list(filter(len, inner_partition))
  292. return partition, inner_partition, improvement
  293. def _neighbor_weights(nbrs, node2com):
  294. """Calculate weights between node and its neighbor communities.
  295. Parameters
  296. ----------
  297. nbrs : dictionary
  298. Dictionary with nodes' neighbors as keys and their edge weight as value.
  299. node2com : dictionary
  300. Dictionary with all graph's nodes as keys and their community index as value.
  301. """
  302. weights = defaultdict(float)
  303. for nbr, wt in nbrs.items():
  304. weights[node2com[nbr]] += wt
  305. return weights
  306. def _gen_graph(G, partition):
  307. """Generate a new graph based on the partitions of a given graph"""
  308. H = G.__class__()
  309. node2com = {}
  310. for i, part in enumerate(partition):
  311. nodes = set()
  312. for node in part:
  313. node2com[node] = i
  314. nodes.update(G.nodes[node].get("nodes", {node}))
  315. H.add_node(i, nodes=nodes)
  316. for node1, node2, wt in G.edges(data=True):
  317. wt = wt["weight"]
  318. com1 = node2com[node1]
  319. com2 = node2com[node2]
  320. temp = H.get_edge_data(com1, com2, {"weight": 0})["weight"]
  321. H.add_edge(com1, com2, weight=wt + temp)
  322. return H
  323. def _convert_multigraph(G, weight, is_directed):
  324. """Convert a Multigraph to normal Graph"""
  325. if is_directed:
  326. H = nx.DiGraph()
  327. else:
  328. H = nx.Graph()
  329. H.add_nodes_from(G)
  330. for u, v, wt in G.edges(data=weight, default=1):
  331. if H.has_edge(u, v):
  332. H[u][v]["weight"] += wt
  333. else:
  334. H.add_edge(u, v, weight=wt)
  335. return H