sentry.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. from __future__ import annotations
  2. import atexit
  3. import contextlib
  4. import functools
  5. import os
  6. import pathlib
  7. import sys
  8. import threading
  9. from types import TracebackType
  10. from typing import Any, Callable, Literal, TypeVar
  11. from urllib.parse import quote
  12. from typing_extensions import Concatenate, Never, ParamSpec
  13. _P = ParamSpec("_P")
  14. _T = TypeVar("_T")
  15. SENTRY_DEFAULT_DSN = (
  16. "https://2592b1968ea94cca9b5ef5e348e094a7@o151352.ingest.sentry.io/4504800232407040"
  17. )
  18. SessionStatus = Literal["ok", "exited", "crashed", "abnormal"]
  19. def _guard(
  20. method: Callable[Concatenate[Sentry, _P], _T],
  21. ) -> Callable[Concatenate[Sentry, _P], _T | None]:
  22. """Make a Sentry method safe, lazy, and non-raising.
  23. The wrapped method becomes a no-op if Sentry is disabled,
  24. this instance belongs to a different PID, or lazy boot fails
  25. """
  26. @functools.wraps(method)
  27. def wrapper(
  28. self: Sentry,
  29. *args: _P.args,
  30. **kwargs: _P.kwargs,
  31. ) -> _T | None:
  32. if not self._enabled:
  33. return None
  34. # If this instance belongs to a different process (fork happened),
  35. # do nothing; get_sentry() will create a fresh instance for the child.
  36. if self._pid != os.getpid():
  37. return None
  38. if not self._booted and not self._boot():
  39. return None
  40. try:
  41. return method(self, *args, **kwargs)
  42. except Exception as e:
  43. if method.__name__ != "exception":
  44. # Best-effort logging of wrapper-level failures.
  45. with contextlib.suppress(Exception):
  46. self.exception(f"Error in {method.__name__}: {e}")
  47. return None
  48. return wrapper
  49. class Sentry:
  50. def __init__(self, *, pid: int) -> None:
  51. from wandb import env as _env
  52. self._pid: int = pid
  53. self._enabled: bool = bool(_env.error_reporting_enabled())
  54. self._booted: bool = False
  55. self._boot_lock = threading.Lock()
  56. self._atexit_registered: bool = False
  57. self._sent_messages: set[str] = set()
  58. self._sdk: Any | None = None # will hold the sentry_sdk module after boot
  59. self.scope: Any | None = None
  60. self.dsn: str | None = os.environ.get(_env.SENTRY_DSN, SENTRY_DEFAULT_DSN)
  61. @property
  62. def environment(self) -> str:
  63. is_git = pathlib.Path(__file__).parent.parent.parent.joinpath(".git").exists()
  64. return "development" if is_git else "production"
  65. def _boot(self) -> bool:
  66. """Import sentry_sdk and set up client/scope."""
  67. from wandb import __version__
  68. with self._boot_lock:
  69. if not self._enabled:
  70. return False
  71. if self._booted:
  72. return True
  73. try:
  74. import sentry_sdk # type: ignore
  75. import sentry_sdk.scope # type: ignore
  76. import sentry_sdk.utils # type: ignore
  77. self._sdk = sentry_sdk
  78. client = self._sdk.Client(
  79. dsn=self.dsn,
  80. default_integrations=False,
  81. environment=self.environment,
  82. release=__version__,
  83. )
  84. scope = self._sdk.get_global_scope().fork()
  85. scope.clear()
  86. scope.set_client(client)
  87. self.scope = scope
  88. self._booted = True
  89. if not self._atexit_registered:
  90. atexit.register(self.end_session)
  91. self._atexit_registered = True
  92. except Exception:
  93. # Disable on any failure.
  94. self._enabled = False
  95. self._booted = False
  96. self._sdk = None
  97. self.scope = None
  98. return False
  99. return True
  100. @_guard
  101. def message(
  102. self,
  103. message: str,
  104. repeat: bool = True,
  105. level: str = "info",
  106. ) -> str | None:
  107. if not repeat and message in self._sent_messages:
  108. return None
  109. self._sent_messages.add(message)
  110. with self._sdk.scope.use_isolation_scope(self.scope): # type: ignore
  111. return self._sdk.capture_message(message, level=level) # type: ignore
  112. @_guard
  113. def exception(
  114. self,
  115. exc: str
  116. | BaseException
  117. | tuple[
  118. type[BaseException] | None,
  119. BaseException | None,
  120. TracebackType | None,
  121. ]
  122. | None,
  123. handled: bool = False,
  124. status: SessionStatus | None = None,
  125. ) -> str | None:
  126. if isinstance(exc, str):
  127. exc_info = self._sdk.utils.exc_info_from_error(Exception(exc)) # type: ignore
  128. elif isinstance(exc, BaseException):
  129. exc_info = self._sdk.utils.exc_info_from_error(exc) # type: ignore
  130. else:
  131. exc_info = sys.exc_info()
  132. event, _ = self._sdk.utils.event_from_exception( # type: ignore
  133. exc_info,
  134. client_options=self.scope.get_client().options, # type: ignore
  135. mechanism={"type": "generic", "handled": handled},
  136. )
  137. event_id = None
  138. with contextlib.suppress(Exception):
  139. with self._sdk.scope.use_isolation_scope(self.scope): # type: ignore
  140. event_id = self._sdk.capture_event(event) # type: ignore
  141. status = status or ("crashed" if not handled else "errored") # type: ignore
  142. self.mark_session(status=status)
  143. client = self.scope.get_client() # type: ignore
  144. if client is not None:
  145. client.flush()
  146. return event_id
  147. def reraise(self, exc: Any) -> Never:
  148. """Re-raise after logging, preserving traceback. Safe if disabled."""
  149. try:
  150. self.exception(exc) # @_guard applies here
  151. finally:
  152. _, _, tb = sys.exc_info()
  153. if tb is not None and hasattr(exc, "with_traceback"):
  154. raise exc.with_traceback(tb)
  155. raise exc
  156. @_guard
  157. def start_session(self) -> None:
  158. if self.scope is None:
  159. return
  160. if self.scope._session is None:
  161. self.scope.start_session()
  162. @_guard
  163. def end_session(self) -> None:
  164. if self.scope is None:
  165. return
  166. client = self.scope.get_client()
  167. session = self.scope._session
  168. if session is not None and client is not None:
  169. self.scope.end_session()
  170. client.flush()
  171. @_guard
  172. def mark_session(self, status: SessionStatus | None = None) -> None:
  173. if self.scope is None:
  174. return
  175. session = self.scope._session
  176. if session is not None:
  177. session.update(status=status)
  178. @_guard
  179. def configure_scope(
  180. self,
  181. tags: dict[str, Any] | None = None,
  182. process_context: str | None = None,
  183. ) -> None:
  184. import wandb.util
  185. if self.scope is None:
  186. return
  187. settings_tags = (
  188. "entity",
  189. "project",
  190. "run_id",
  191. "run_url",
  192. "sweep_url",
  193. "sweep_id",
  194. "deployment",
  195. "launch",
  196. "_platform",
  197. )
  198. if process_context:
  199. self.scope.set_tag("process_context", process_context)
  200. if tags is None:
  201. return None
  202. for tag in settings_tags:
  203. val = tags.get(tag, None)
  204. if val not in (None, ""):
  205. self.scope.set_tag(tag, val)
  206. if tags.get("_colab", None):
  207. python_runtime = "colab"
  208. elif tags.get("_jupyter", None):
  209. python_runtime = "jupyter"
  210. elif tags.get("_ipython", None):
  211. python_runtime = "ipython"
  212. else:
  213. python_runtime = "python"
  214. self.scope.set_tag("python_runtime", python_runtime)
  215. # Construct run_url and sweep_url given run_id and sweep_id.
  216. for obj in ("run", "sweep"):
  217. obj_id, obj_url = f"{obj}_id", f"{obj}_url"
  218. if tags.get(obj_url, None):
  219. continue
  220. try:
  221. app_url = wandb.util.app_url(tags["base_url"]) # type: ignore[index]
  222. entity, project = (quote(tags[k]) for k in ("entity", "project")) # type: ignore[index]
  223. self.scope.set_tag(
  224. obj_url,
  225. f"{app_url}/{entity}/{project}/{obj}s/{tags[obj_id]}",
  226. )
  227. except Exception:
  228. pass
  229. email = tags.get("email")
  230. if email:
  231. self.scope.user = {"email": email}
  232. self.start_session()
  233. _singleton: Sentry | None = None
  234. _singleton_lock = threading.Lock()
  235. def get_sentry() -> Sentry:
  236. """Return the Sentry singleton for the current process (fork-aware).
  237. Creates a new instance in child processes after fork.
  238. Thread-safe within each process.
  239. """
  240. global _singleton
  241. pid = os.getpid()
  242. with _singleton_lock:
  243. if _singleton is not None and _singleton._pid == pid:
  244. return _singleton
  245. if _singleton is None or _singleton._pid != pid:
  246. _singleton = Sentry(pid=pid)
  247. return _singleton