debug.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. import gc
  2. import os
  3. import re
  4. import time
  5. import tracemalloc
  6. from collections import defaultdict, namedtuple
  7. from typing import Callable, List, Optional
  8. from ray.util.annotations import DeveloperAPI
  9. _logged = set()
  10. _disabled = False
  11. _periodic_log = False
  12. _last_logged = 0.0
  13. @DeveloperAPI
  14. def log_once(key):
  15. """Returns True if this is the "first" call for a given key.
  16. Various logging settings can adjust the definition of "first".
  17. Example:
  18. .. testcode::
  19. import logging
  20. from ray.util.debug import log_once
  21. logger = logging.getLogger(__name__)
  22. if log_once("some_key"):
  23. logger.info("Some verbose logging statement")
  24. """
  25. global _last_logged
  26. if _disabled:
  27. return False
  28. elif key not in _logged:
  29. _logged.add(key)
  30. _last_logged = time.time()
  31. return True
  32. elif _periodic_log and time.time() - _last_logged > 60.0:
  33. _logged.clear()
  34. _last_logged = time.time()
  35. return False
  36. else:
  37. return False
  38. @DeveloperAPI
  39. def disable_log_once_globally():
  40. """Make log_once() return False in this process."""
  41. global _disabled
  42. _disabled = True
  43. @DeveloperAPI
  44. def enable_periodic_logging():
  45. """Make log_once() periodically return True in this process."""
  46. global _periodic_log
  47. _periodic_log = True
  48. @DeveloperAPI
  49. def reset_log_once(key: Optional[str] = None):
  50. """Resets log_once for the provided key.
  51. If you don't provide a key, resets log_once for all keys.
  52. """
  53. if key is None:
  54. _logged.clear()
  55. else:
  56. _logged.discard(key)
  57. # A suspicious memory-allocating stack-trace that we should re-test
  58. # to make sure it's not a false positive.
  59. Suspect = DeveloperAPI(
  60. namedtuple(
  61. "Suspect",
  62. [
  63. # The stack trace of the allocation, going back n frames, depending
  64. # on the tracemalloc.start(n) call.
  65. "traceback",
  66. # The amount of memory taken by this particular stack trace
  67. # over the course of the experiment.
  68. "memory_increase",
  69. # The slope of the scipy linear regression (x=iteration; y=memory size).
  70. "slope",
  71. # The rvalue of the scipy linear regression.
  72. "rvalue",
  73. # The memory size history (list of all memory sizes over all iterations).
  74. "hist",
  75. ],
  76. )
  77. )
  78. def _test_some_code_for_memory_leaks(
  79. desc: str,
  80. init: Optional[Callable[[], None]],
  81. code: Callable[[], None],
  82. repeats: int,
  83. max_num_trials: int = 1,
  84. ) -> List[Suspect]:
  85. """Runs given code (and init code) n times and checks for memory leaks.
  86. Args:
  87. desc: A descriptor of the test.
  88. init: Optional code to be executed initially.
  89. code: The actual code to be checked for producing memory leaks.
  90. repeats: How many times to repeatedly execute `code`.
  91. max_num_trials: The maximum number of trials to run. A new trial is only
  92. run, if the previous one produced a memory leak. For all non-1st trials,
  93. `repeats` calculates as: actual_repeats = `repeats` * (trial + 1), where
  94. the first trial is 0.
  95. Returns:
  96. A list of Suspect objects, describing possible memory leaks. If list
  97. is empty, no leaks have been found.
  98. """
  99. def _i_print(i):
  100. if (i + 1) % 10 == 0:
  101. print(".", end="" if (i + 1) % 100 else f" {i + 1}\n", flush=True)
  102. # Do n trials to make sure a found leak is really one.
  103. suspicious = set()
  104. suspicious_stats = []
  105. for trial in range(max_num_trials):
  106. # Store up to n frames of each call stack.
  107. tracemalloc.start(20)
  108. table = defaultdict(list)
  109. # Repeat running code for n times.
  110. # Increase repeat value with each trial to make sure stats are more
  111. # solid each time (avoiding false positives).
  112. actual_repeats = repeats * (trial + 1)
  113. print(f"{desc} {actual_repeats} times.")
  114. # Initialize if necessary.
  115. if init is not None:
  116. init()
  117. # Run `code` n times, each time taking a memory snapshot.
  118. for i in range(actual_repeats):
  119. _i_print(i)
  120. # Manually trigger garbage collection before and after code runs in order to
  121. # make tracemalloc snapshots as accurate as possible.
  122. gc.collect()
  123. code()
  124. gc.collect()
  125. _take_snapshot(table, suspicious)
  126. print("\n")
  127. # Check, which traces have moved up in their memory consumption
  128. # constantly over time.
  129. suspicious.clear()
  130. suspicious_stats.clear()
  131. # Suspicious memory allocation found?
  132. suspects = _find_memory_leaks_in_table(table)
  133. for suspect in sorted(suspects, key=lambda s: s.memory_increase, reverse=True):
  134. # Only print out the biggest offender:
  135. if len(suspicious) == 0:
  136. _pprint_suspect(suspect)
  137. print("-> added to retry list")
  138. suspicious.add(suspect.traceback)
  139. suspicious_stats.append(suspect)
  140. tracemalloc.stop()
  141. # Some suspicious memory allocations found.
  142. if len(suspicious) > 0:
  143. print(f"{len(suspicious)} suspects found. Top-ten:")
  144. for i, s in enumerate(suspicious_stats):
  145. if i > 10:
  146. break
  147. print(
  148. f"{i}) line={s.traceback[-1]} mem-increase={s.memory_increase}B "
  149. f"slope={s.slope}B/detection rval={s.rvalue}"
  150. )
  151. # Nothing suspicious found -> Exit trial loop and return.
  152. else:
  153. print("No remaining suspects found -> returning")
  154. break
  155. # Print out final top offender.
  156. if len(suspicious_stats) > 0:
  157. _pprint_suspect(suspicious_stats[0])
  158. return suspicious_stats
  159. def _take_snapshot(table, suspicious=None):
  160. # Take a memory snapshot.
  161. snapshot = tracemalloc.take_snapshot()
  162. # Group all memory allocations by their stacktrace (going n frames
  163. # deep as defined above in tracemalloc.start(n)).
  164. # Then sort groups by size, then count, then trace.
  165. top_stats = snapshot.statistics("traceback")
  166. # For the first m largest increases, keep only, if a) first trial or b) those
  167. # that are already in the `suspicious` set.
  168. for stat in top_stats[:100]:
  169. if not suspicious or stat.traceback in suspicious:
  170. table[stat.traceback].append(stat.size)
  171. def _find_memory_leaks_in_table(table):
  172. import numpy as np
  173. import scipy.stats
  174. suspects = []
  175. for traceback, hist in table.items():
  176. # Do a quick mem increase check.
  177. memory_increase = hist[-1] - hist[0]
  178. # Only if memory increased, do we check further.
  179. if memory_increase <= 0.0:
  180. continue
  181. # Ignore this very module here (we are collecting lots of data
  182. # so an increase is expected).
  183. top_stack = str(traceback[-1])
  184. drive_separator = "\\\\" if os.name == "nt" else "/"
  185. if any(
  186. s in top_stack
  187. for s in [
  188. "tracemalloc",
  189. "pycharm",
  190. "thirdparty_files/psutil",
  191. re.sub("\\.", drive_separator, __name__) + ".py",
  192. ]
  193. ):
  194. continue
  195. # Do a linear regression to get the slope and R-value.
  196. line = scipy.stats.linregress(x=np.arange(len(hist)), y=np.array(hist))
  197. # - If weak positive slope and some confidence and
  198. # increase > n bytes -> error.
  199. # - If stronger positive slope -> error.
  200. if memory_increase > 1000 and (
  201. (line.slope > 60.0 and line.rvalue > 0.875)
  202. or (line.slope > 20.0 and line.rvalue > 0.9)
  203. or (line.slope > 10.0 and line.rvalue > 0.95)
  204. ):
  205. suspects.append(
  206. Suspect(
  207. traceback=traceback,
  208. memory_increase=memory_increase,
  209. slope=line.slope,
  210. rvalue=line.rvalue,
  211. hist=hist,
  212. )
  213. )
  214. return suspects
  215. def _pprint_suspect(suspect):
  216. print(
  217. "Most suspicious memory allocation in traceback "
  218. "(only printing out this one, but all (less suspicious)"
  219. " suspects will be investigated as well):"
  220. )
  221. print("\n".join(suspect.traceback.format()))
  222. print(f"Increase total={suspect.memory_increase}B")
  223. print(f"Slope={suspect.slope} B/detection")
  224. print(f"Rval={suspect.rvalue}")