restarter.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
"""A basic kernel monitor with autorestarting.

This watches a kernel's state using KernelManager.is_alive and auto
restarts the kernel if it dies.

It is an incomplete base class, and must be subclassed.
"""
  6. # Copyright (c) Jupyter Development Team.
  7. # Distributed under the terms of the Modified BSD License.
  8. from __future__ import annotations
  9. import time
  10. import typing as t
  11. from traitlets import Bool, Dict, Float, Instance, Integer, default
  12. from traitlets.config.configurable import LoggingConfigurable
  13. class KernelRestarter(LoggingConfigurable):
  14. """Monitor and autorestart a kernel."""
  15. kernel_manager = Instance("jupyter_client.KernelManager")
  16. debug = Bool(
  17. False,
  18. config=True,
  19. help="""Whether to include every poll event in debugging output.
  20. Has to be set explicitly, because there will be *a lot* of output.
  21. """,
  22. )
  23. time_to_dead = Float(3.0, config=True, help="""Kernel heartbeat interval in seconds.""")
  24. stable_start_time = Float(
  25. 10.0,
  26. config=True,
  27. help="""The time in seconds to consider the kernel to have completed a stable start up.""",
  28. )
  29. restart_limit = Integer(
  30. 5,
  31. config=True,
  32. help="""The number of consecutive autorestarts before the kernel is presumed dead.""",
  33. )
  34. random_ports_until_alive = Bool(
  35. True,
  36. config=True,
  37. help="""Whether to choose new random ports when restarting before the kernel is alive.""",
  38. )
  39. _restarting = Bool(False)
  40. _restart_count = Integer(0)
  41. _initial_startup = Bool(True)
  42. _last_dead = Float()
  43. @default("_last_dead")
  44. def _default_last_dead(self) -> float:
  45. return time.time()
  46. callbacks = Dict()
  47. def _callbacks_default(self) -> dict[str, list]:
  48. return {"restart": [], "dead": []}
  49. def start(self) -> None:
  50. """Start the polling of the kernel."""
  51. msg = "Must be implemented in a subclass"
  52. raise NotImplementedError(msg)
  53. def stop(self) -> None:
  54. """Stop the kernel polling."""
  55. msg = "Must be implemented in a subclass"
  56. raise NotImplementedError(msg)
  57. def add_callback(self, f: t.Callable[..., t.Any], event: str = "restart") -> None:
  58. """register a callback to fire on a particular event
  59. Possible values for event:
  60. 'restart' (default): kernel has died, and will be restarted.
  61. 'dead': restart has failed, kernel will be left dead.
  62. """
  63. self.callbacks[event].append(f)
  64. def remove_callback(self, f: t.Callable[..., t.Any], event: str = "restart") -> None:
  65. """unregister a callback to fire on a particular event
  66. Possible values for event:
  67. 'restart' (default): kernel has died, and will be restarted.
  68. 'dead': restart has failed, kernel will be left dead.
  69. """
  70. try:
  71. self.callbacks[event].remove(f)
  72. except ValueError:
  73. pass
  74. def _fire_callbacks(self, event: t.Any) -> None:
  75. """fire our callbacks for a particular event"""
  76. for callback in self.callbacks[event]:
  77. try:
  78. callback()
  79. except Exception:
  80. self.log.error(
  81. "KernelRestarter: %s callback %r failed",
  82. event,
  83. callback,
  84. exc_info=True,
  85. )
  86. def poll(self) -> None:
  87. if self.debug:
  88. self.log.debug("Polling kernel...")
  89. if self.kernel_manager.shutting_down:
  90. self.log.debug("Kernel shutdown in progress...")
  91. return
  92. now = time.time()
  93. if not self.kernel_manager.is_alive():
  94. self._last_dead = now
  95. if self._restarting:
  96. self._restart_count += 1
  97. else:
  98. self._restart_count = 1
  99. if self._restart_count > self.restart_limit:
  100. self.log.warning("KernelRestarter: restart failed")
  101. self._fire_callbacks("dead")
  102. self._restarting = False
  103. self._restart_count = 0
  104. self.stop()
  105. else:
  106. newports = self.random_ports_until_alive and self._initial_startup
  107. self.log.info(
  108. "KernelRestarter: restarting kernel (%i/%i), %s random ports",
  109. self._restart_count,
  110. self.restart_limit,
  111. "new" if newports else "keep",
  112. )
  113. self._fire_callbacks("restart")
  114. self.kernel_manager.restart_kernel(now=True, newports=newports)
  115. self._restarting = True
  116. else:
  117. # Since `is_alive` only tests that the kernel process is alive, it does not
  118. # indicate that the kernel has successfully completed startup. To solve this
  119. # correctly, we would need to wait for a kernel info reply, but it is not
  120. # necessarily appropriate to start a kernel client + channels in the
  121. # restarter. Therefore, we use "has been alive continuously for X time" as a
  122. # heuristic for a stable start up.
  123. # See https://github.com/jupyter/jupyter_client/pull/717 for details.
  124. stable_start_time = self.stable_start_time
  125. if self.kernel_manager.provisioner:
  126. stable_start_time = self.kernel_manager.provisioner.get_stable_start_time(
  127. recommended=stable_start_time
  128. )
  129. if self._initial_startup and now - self._last_dead >= stable_start_time:
  130. self._initial_startup = False
  131. if self._restarting and now - self._last_dead >= stable_start_time:
  132. self.log.debug("KernelRestarter: restart apparently succeeded")
  133. self._restarting = False