density.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. """Fast algorithms for the densest subgraph problem"""
  2. import math
  3. import networkx as nx
  4. __all__ = ["densest_subgraph"]
  5. def _greedy_plus_plus(G, iterations):
  6. if G.number_of_edges() == 0:
  7. return 0.0, set()
  8. if iterations < 1:
  9. raise ValueError(
  10. f"The number of iterations must be an integer >= 1. Provided: {iterations}"
  11. )
  12. loads = {node: 0 for node in G.nodes} # Load vector for Greedy++.
  13. best_density = 0.0 # Highest density encountered.
  14. best_subgraph = set() # Nodes of the best subgraph found.
  15. for _ in range(iterations):
  16. # Initialize heap for fast access to minimum weighted degree.
  17. heap = nx.utils.BinaryHeap()
  18. # Compute initial weighted degrees and add nodes to the heap.
  19. for node, degree in G.degree:
  20. heap.insert(node, loads[node] + degree)
  21. # Set up tracking for current graph state.
  22. remaining_nodes = set(G.nodes)
  23. num_edges = G.number_of_edges()
  24. current_degrees = dict(G.degree)
  25. while remaining_nodes:
  26. num_nodes = len(remaining_nodes)
  27. # Current density of the (implicit) graph
  28. current_density = num_edges / num_nodes
  29. # Update the best density.
  30. if current_density > best_density:
  31. best_density = current_density
  32. best_subgraph = set(remaining_nodes)
  33. # Pop the node with the smallest weighted degree.
  34. node, _ = heap.pop()
  35. if node not in remaining_nodes:
  36. continue # Skip nodes already removed.
  37. # Update the load of the popped node.
  38. loads[node] += current_degrees[node]
  39. # Update neighbors' degrees and the heap.
  40. for neighbor in G.neighbors(node):
  41. if neighbor in remaining_nodes:
  42. current_degrees[neighbor] -= 1
  43. num_edges -= 1
  44. heap.insert(neighbor, loads[neighbor] + current_degrees[neighbor])
  45. # Remove the node from the remaining nodes.
  46. remaining_nodes.remove(node)
  47. return best_density, best_subgraph
  48. def _fractional_peeling(G, b, x, node_to_idx, edge_to_idx):
  49. """
  50. Optimized fractional peeling using NumPy arrays.
  51. Parameters
  52. ----------
  53. G : networkx.Graph
  54. The input graph.
  55. b : numpy.ndarray
  56. Induced load vector.
  57. x : numpy.ndarray
  58. Fractional edge values.
  59. node_to_idx : dict
  60. Mapping from node to index.
  61. edge_to_idx : dict
  62. Mapping from edge to index.
  63. Returns
  64. -------
  65. best_density : float
  66. The best density found.
  67. best_subgraph : set
  68. The subset of nodes defining the densest subgraph.
  69. """
  70. heap = nx.utils.BinaryHeap()
  71. remaining_nodes = set(G.nodes)
  72. # Initialize heap with b values
  73. for idx, node in enumerate(G):
  74. heap.insert(node, b[idx])
  75. num_edges = G.number_of_edges()
  76. best_density = 0.0
  77. best_subgraph = set()
  78. while remaining_nodes:
  79. num_nodes = len(remaining_nodes)
  80. current_density = num_edges / num_nodes
  81. if current_density > best_density:
  82. best_density = current_density
  83. best_subgraph = set(remaining_nodes)
  84. # Pop the node with the smallest b
  85. node, _ = heap.pop()
  86. while node not in remaining_nodes:
  87. node, _ = heap.pop() # Clean the heap from stale values
  88. # Update neighbors b values by subtracting fractional x value
  89. for neighbor in G.neighbors(node):
  90. if neighbor in remaining_nodes:
  91. neighbor_idx = node_to_idx[neighbor]
  92. # Take off fractional value
  93. b[neighbor_idx] -= x[edge_to_idx[(neighbor, node)]]
  94. num_edges -= 1
  95. heap.insert(neighbor, b[neighbor_idx])
  96. remaining_nodes.remove(node) # peel off node
  97. return best_density, best_subgraph
  98. def _fista(G, iterations):
  99. if G.number_of_edges() == 0:
  100. return 0.0, set()
  101. if iterations < 1:
  102. raise ValueError(
  103. f"The number of iterations must be an integer >= 1. Provided: {iterations}"
  104. )
  105. import numpy as np
  106. # 1. Node Mapping: Assign a unique index to each node and edge
  107. node_to_idx = {node: idx for idx, node in enumerate(G)}
  108. num_nodes = G.number_of_nodes()
  109. num_undirected_edges = G.number_of_edges()
  110. # 2. Edge Mapping: Assign a unique index to each bidirectional edge
  111. bidirectional_edges = [(u, v) for u, v in G.edges] + [(v, u) for u, v in G.edges]
  112. edge_to_idx = {edge: idx for idx, edge in enumerate(bidirectional_edges)}
  113. num_edges = len(bidirectional_edges)
  114. # 3. Reverse Edge Mapping: Map each (bidirectional) edge to its reverse edge index
  115. reverse_edge_idx = np.empty(num_edges, dtype=np.int32)
  116. for idx in range(num_undirected_edges):
  117. reverse_edge_idx[idx] = num_undirected_edges + idx
  118. for idx in range(num_undirected_edges, 2 * num_undirected_edges):
  119. reverse_edge_idx[idx] = idx - num_undirected_edges
  120. # 4. Initialize Variables as NumPy Arrays
  121. x = np.full(num_edges, 0.5, dtype=np.float32)
  122. y = x.copy()
  123. z = np.zeros(num_edges, dtype=np.float32)
  124. b = np.zeros(num_nodes, dtype=np.float32) # Induced load vector
  125. tk = 1.0 # Momentum term
  126. # 5. Precompute Edge Source Indices
  127. edge_src_indices = np.array(
  128. [node_to_idx[u] for u, _ in bidirectional_edges], dtype=np.int32
  129. )
  130. # 6. Compute Learning Rate
  131. max_degree = max(deg for _, deg in G.degree)
  132. # 0.9 for floating point errs when max_degree is very large
  133. learning_rate = 0.9 / max_degree
  134. # 7. Iterative Updates
  135. for _ in range(iterations):
  136. # 7a. Update b: sum y over outgoing edges for each node
  137. b[:] = 0.0 # Reset b to zero
  138. np.add.at(b, edge_src_indices, y) # b_u = \sum_{v : (u,v) \in E(G)} y_{uv}
  139. # 7b. Compute z, z_{uv} = y_{uv} - 2 * learning_rate * b_u
  140. z = y - 2.0 * learning_rate * b[edge_src_indices]
  141. # 7c. Update Momentum Term
  142. tknew = (1.0 + math.sqrt(1 + 4.0 * tk**2)) / 2.0
  143. # 7d. Update x in a vectorized manner, x_{uv} = (z_{uv} - z_{vu} + 1.0) / 2.0
  144. new_xuv = (z - z[reverse_edge_idx] + 1.0) / 2.0
  145. clamped_x = np.clip(new_xuv, 0.0, 1.0) # Clamp x_{uv} between 0 and 1
  146. # Update y using the FISTA update formula (similar to gradient descent)
  147. y = (
  148. clamped_x
  149. + ((tk - 1.0) / tknew) * (clamped_x - x)
  150. + (tk / tknew) * (clamped_x - y)
  151. )
  152. # Update x
  153. x = clamped_x
  154. # Update tk, the momemntum term
  155. tk = tknew
  156. # Rebalance the b values! Otherwise performance is a bit suboptimal.
  157. b[:] = 0.0
  158. np.add.at(b, edge_src_indices, x) # b_u = \sum_{v : (u,v) \in E(G)} x_{uv}
  159. # Extract the actual (approximate) dense subgraph.
  160. return _fractional_peeling(G, b, x, node_to_idx, edge_to_idx)
  161. ALGORITHMS = {"greedy++": _greedy_plus_plus, "fista": _fista}
  162. @nx.utils.not_implemented_for("directed")
  163. @nx.utils.not_implemented_for("multigraph")
  164. @nx._dispatchable
  165. def densest_subgraph(G, iterations=1, *, method="fista"):
  166. r"""Returns an approximate densest subgraph for a graph `G`.
  167. This function runs an iterative algorithm to find the densest subgraph,
  168. and returns both the density and the subgraph. For a discussion on the
  169. notion of density used and the different algorithms available on
  170. networkx, please see the Notes section below.
  171. Parameters
  172. ----------
  173. G : NetworkX graph
  174. Undirected graph.
  175. iterations : int, optional (default=1)
  176. Number of iterations to use for the iterative algorithm. Can be
  177. specified positionally or as a keyword argument.
  178. method : string, optional (default='fista')
  179. The algorithm to use to approximate the densest subgraph. Supported
  180. options: 'greedy++' by Boob et al. [2]_ and 'fista' by Harb et al. [3]_.
  181. Must be specified as a keyword argument. Other inputs produce a
  182. ValueError.
  183. Returns
  184. -------
  185. d : float
  186. The density of the approximate subgraph found.
  187. S : set
  188. The subset of nodes defining the approximate densest subgraph.
  189. Examples
  190. --------
  191. >>> G = nx.star_graph(4)
  192. >>> nx.approximation.densest_subgraph(G, iterations=1)
  193. (0.8, {0, 1, 2, 3, 4})
  194. Notes
  195. -----
  196. **Problem Definition:**
  197. The densest subgraph problem (DSG) asks to find the subgraph
  198. $S \subseteq V(G)$ with maximum density. For a subset of the nodes of
  199. $G$, $S \subseteq V(G)$, define $E(S) = \{ (u,v) : (u,v)\in E(G),
  200. u\in S, v\in S \}$ as the set of edges with both endpoints in $S$.
  201. The density of $S$ is defined as $|E(S)|/|S|$, the ratio between the
  202. edges in the subgraph $G[S]$ and the number of nodes in that subgraph.
  203. Note that this is different from the standard graph theoretic definition
  204. of density, defined as $\frac{2|E(S)|}{|S|(|S|-1)}$, for historical
  205. reasons.
  206. **Exact Algorithms:**
  207. The densest subgraph problem is polynomial time solvable using maximum
  208. flow, commonly referred to as Goldberg's algorithm. However, the
  209. algorithm is quite involved. It first binary searches on the optimal
  210. density, $d^\ast$. For a guess of the density $d$, it sets up a flow
  211. network $G'$ with size $O(m)$. The maximum flow solution either
  212. informs the algorithm that no subgraph with density $d$ exists, or it
  213. provides a subgraph with density at least $d$. However, this is
  214. inherently bottlenecked by the maximum flow algorithm. For example, [2]_
  215. notes that Goldberg’s algorithm was not feasible on many large graphs
  216. even though they used a highly optimized maximum flow library.
  217. **Charikar's Greedy Peeling:**
  218. While exact solution algorithms are quite involved, there are several
  219. known approximation algorithms for the densest subgraph problem.
  220. Charikar [1]_ described a very simple 1/2-approximation algorithm for DSG
  221. known as the greedy "peeling" algorithm. The algorithm creates an
  222. ordering of the nodes as follows. The first node $v_1$ is the one with
  223. the smallest degree in $G$ (ties broken arbitrarily). It selects
  224. $v_2$ to be the smallest degree node in $G \setminus v_1$. Letting
  225. $G_i$ be the graph after removing $v_1, ..., v_i$ (with $G_0=G$),
  226. the algorithm returns the graph among $G_0, ..., G_n$ with the highest
  227. density.
  228. **Greedy++:**
  229. Boob et al. [2]_ generalized this algorithm into Greedy++, an iterative
  230. algorithm that runs several rounds of "peeling". In fact, Greedy++ with 1
  231. iteration is precisely Charikar's algorithm. The algorithm converges to a
  232. $(1-\epsilon)$ approximate densest subgraph in $O(\Delta(G)\log
  233. n/\epsilon^2)$ iterations, where $\Delta(G)$ is the maximum degree,
  234. and $n$ is the number of nodes in $G$. The algorithm also has other
  235. desirable properties as shown by [4]_ and [5]_.
  236. **FISTA Algorithm:**
  237. Harb et al. [3]_ gave a faster and more scalable algorithm using ideas
  238. from quadratic programming for the densest subgraph, which is based on a
  239. fast iterative shrinkage-thresholding algorithm (FISTA) algorithm. It is
  240. known that computing the densest subgraph can be formulated as the
  241. following convex optimization problem:
  242. Minimize $\sum_{u \in V(G)} b_u^2$
  243. Subject to:
  244. $b_u = \sum_{v: \{u,v\} \in E(G)} x_{uv}$ for all $u \in V(G)$
  245. $x_{uv} + x_{vu} = 1.0$ for all $\{u,v\} \in E(G)$
  246. $x_{uv} \geq 0, x_{vu} \geq 0$ for all $\{u,v\} \in E(G)$
  247. Here, $x_{uv}$ represents the fraction of edge $\{u,v\}$ assigned to
  248. $u$, and $x_{vu}$ to $v$.
  249. The FISTA algorithm efficiently solves this convex program using gradient
  250. descent with projections. For a learning rate $\alpha$, the algorithm
  251. does:
  252. 1. **Initialization**: Set $x^{(0)}_{uv} = x^{(0)}_{vu} = 0.5$ for all
  253. edges as a feasible solution.
  254. 2. **Gradient Update**: For iteration $k\geq 1$, set
  255. $x^{(k+1)}_{uv} = x^{(k)}_{uv} - 2 \alpha \sum_{v: \{u,v\} \in E(G)}
  256. x^{(k)}_{uv}$. However, now $x^{(k+1)}_{uv}$ might be infeasible!
  257. To ensure feasibility, we project $x^{(k+1)}_{uv}$.
  258. 3. **Projection to the Feasible Set**: Compute
  259. $b^{(k+1)}_u = \sum_{v: \{u,v\} \in E(G)} x^{(k)}_{uv}$ for all
  260. nodes $u$. Define $z^{(k+1)}_{uv} = x^{(k+1)}_{uv} - 2 \alpha
  261. b^{(k+1)}_u$. Update $x^{(k+1)}_{uv} =
  262. CLAMP((z^{(k+1)}_{uv} - z^{(k+1)}_{vu} + 1.0) / 2.0)$, where
  263. $CLAMP(x) = \max(0, \min(1, x))$.
  264. With a learning rate of $\alpha=1/\Delta(G)$, where $\Delta(G)$ is
  265. the maximum degree, the algorithm converges to the optimum solution of
  266. the convex program.
  267. **Fractional Peeling:**
  268. To obtain a **discrete** subgraph, we use fractional peeling, an
  269. adaptation of the standard peeling algorithm which peels the minimum
  270. degree vertex in each iteration, and returns the densest subgraph found
  271. along the way. Here, we instead peel the vertex with the smallest
  272. induced load $b_u$:
  273. 1. Compute $b_u$ and $x_{uv}$.
  274. 2. Iteratively remove the vertex with the smallest $b_u$, updating its
  275. neighbors' load by $x_{vu}$.
  276. Fractional peeling transforms the approximately optimal fractional
  277. values $b_u, x_{uv}$ into a discrete subgraph. Unlike traditional
  278. peeling, which removes the lowest-degree node, this method accounts for
  279. fractional edge contributions from the convex program.
  280. This approach is both scalable and theoretically sound, ensuring a quick
  281. approximation of the densest subgraph while leveraging fractional load
  282. balancing.
  283. References
  284. ----------
  285. .. [1] Charikar, Moses. "Greedy approximation algorithms for finding dense
  286. components in a graph." In International workshop on approximation
  287. algorithms for combinatorial optimization, pp. 84-95. Berlin, Heidelberg:
  288. Springer Berlin Heidelberg, 2000.
  289. .. [2] Boob, Digvijay, Yu Gao, Richard Peng, Saurabh Sawlani, Charalampos
  290. Tsourakakis, Di Wang, and Junxing Wang. "Flowless: Extracting densest
  291. subgraphs without flow computations." In Proceedings of The Web Conference
  292. 2020, pp. 573-583. 2020.
  293. .. [3] Harb, Elfarouk, Kent Quanrud, and Chandra Chekuri. "Faster and scalable
  294. algorithms for densest subgraph and decomposition." Advances in Neural
  295. Information Processing Systems 35 (2022): 26966-26979.
  296. .. [4] Harb, Elfarouk, Kent Quanrud, and Chandra Chekuri. "Convergence to
  297. lexicographically optimal base in a (contra) polymatroid and applications
  298. to densest subgraph and tree packing." arXiv preprint arXiv:2305.02987
  299. (2023).
  300. .. [5] Chekuri, Chandra, Kent Quanrud, and Manuel R. Torres. "Densest
  301. subgraph: Supermodularity, iterative peeling, and flow." In Proceedings of
  302. the 2022 Annual ACM-SIAM Symposium on Discrete Algorithms (SODA), pp.
  303. 1531-1555. Society for Industrial and Applied Mathematics, 2022.
  304. """
  305. try:
  306. algo = ALGORITHMS[method]
  307. except KeyError as e:
  308. raise ValueError(f"{method} is not a valid choice for an algorithm.") from e
  309. return algo(G, iterations)