dashboard.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. import argparse
  2. import logging
  3. import os
  4. import platform
  5. import signal
  6. import sys
  7. import traceback
  8. from typing import Optional, Set
  9. import ray
  10. import ray._private.ray_constants as ray_constants
  11. import ray.dashboard.consts as dashboard_consts
  12. import ray.dashboard.head as dashboard_head
  13. import ray.dashboard.utils as dashboard_utils
  14. from ray._common.ray_constants import (
  15. LOGGING_ROTATE_BACKUP_COUNT,
  16. LOGGING_ROTATE_BYTES,
  17. )
  18. from ray._common.utils import get_or_create_event_loop
  19. from ray._private import logging_utils
  20. from ray._private.ray_logging import setup_component_logger
  21. from ray._private.utils import (
  22. format_error_message,
  23. publish_error_to_driver,
  24. )
  25. # Logger for this module. It should be configured at the entry point
  26. # into the program using Ray. Ray provides a default configuration at
  27. # entry/init points.
  28. logger = logging.getLogger(__name__)
  29. class Dashboard:
  30. """A dashboard process for monitoring Ray nodes.
  31. This dashboard is made up of a REST API which collates data published by
  32. Reporter processes on nodes into a json structure, and a webserver
  33. which polls said API for display purposes.
  34. Args:
  35. host: Host address of dashboard aiohttp server.
  36. port: Port number of dashboard aiohttp server.
  37. port_retries: The retry times to select a valid port.
  38. gcs_address: GCS address of the cluster.
  39. cluster_id_hex: Cluster ID hex string.
  40. node_ip_address: The IP address of the dashboard.
  41. serve_frontend: If configured, frontend HTML
  42. is not served from the dashboard.
  43. log_dir: Log directory of dashboard.
  44. logging_level: The logging level (e.g. logging.INFO, logging.DEBUG)
  45. logging_format: The format string for log messages
  46. logging_filename: The name of the log file
  47. logging_rotate_bytes: Max size in bytes before rotating log file
  48. logging_rotate_backup_count: Number of backup files to keep when rotating
  49. """
  50. def __init__(
  51. self,
  52. host: str,
  53. port: int,
  54. port_retries: int,
  55. gcs_address: str,
  56. cluster_id_hex: str,
  57. node_ip_address: str,
  58. log_dir: str,
  59. logging_level: int,
  60. logging_format: str,
  61. logging_filename: str,
  62. logging_rotate_bytes: int,
  63. logging_rotate_backup_count: int,
  64. temp_dir: str = None,
  65. session_dir: str = None,
  66. minimal: bool = False,
  67. serve_frontend: bool = True,
  68. modules_to_load: Optional[Set[str]] = None,
  69. ):
  70. self.dashboard_head = dashboard_head.DashboardHead(
  71. http_host=host,
  72. http_port=port,
  73. http_port_retries=port_retries,
  74. gcs_address=gcs_address,
  75. cluster_id_hex=cluster_id_hex,
  76. node_ip_address=node_ip_address,
  77. log_dir=log_dir,
  78. logging_level=logging_level,
  79. logging_format=logging_format,
  80. logging_filename=logging_filename,
  81. logging_rotate_bytes=logging_rotate_bytes,
  82. logging_rotate_backup_count=logging_rotate_backup_count,
  83. temp_dir=temp_dir,
  84. session_dir=session_dir,
  85. minimal=minimal,
  86. serve_frontend=serve_frontend,
  87. modules_to_load=modules_to_load,
  88. )
  89. async def run(self):
  90. await self.dashboard_head.run()
  91. if __name__ == "__main__":
  92. parser = argparse.ArgumentParser(description="Ray dashboard.")
  93. parser.add_argument(
  94. "--host", required=True, type=str, help="The host to use for the HTTP server."
  95. )
  96. parser.add_argument(
  97. "--port", required=True, type=int, help="The port to use for the HTTP server."
  98. )
  99. parser.add_argument(
  100. "--port-retries",
  101. required=False,
  102. type=int,
  103. default=0,
  104. help="The retry times to select a valid port.",
  105. )
  106. parser.add_argument(
  107. "--gcs-address", required=True, type=str, help="The address (ip:port) of GCS."
  108. )
  109. parser.add_argument(
  110. "--cluster-id-hex", required=True, type=str, help="The cluster ID in hex."
  111. )
  112. parser.add_argument(
  113. "--node-ip-address",
  114. required=True,
  115. type=str,
  116. help="The IP address of the node where this is running.",
  117. )
  118. parser.add_argument(
  119. "--logging-level",
  120. required=False,
  121. type=lambda s: logging.getLevelName(s.upper()),
  122. default=ray_constants.LOGGER_LEVEL,
  123. choices=ray_constants.LOGGER_LEVEL_CHOICES,
  124. help=ray_constants.LOGGER_LEVEL_HELP,
  125. )
  126. parser.add_argument(
  127. "--logging-format",
  128. required=False,
  129. type=str,
  130. default=ray_constants.LOGGER_FORMAT,
  131. help=ray_constants.LOGGER_FORMAT_HELP,
  132. )
  133. parser.add_argument(
  134. "--logging-filename",
  135. required=False,
  136. type=str,
  137. default=dashboard_consts.DASHBOARD_LOG_FILENAME,
  138. help="Specify the name of log file, "
  139. 'log to stdout if set empty, default is "{}"'.format(
  140. dashboard_consts.DASHBOARD_LOG_FILENAME
  141. ),
  142. )
  143. parser.add_argument(
  144. "--logging-rotate-bytes",
  145. required=False,
  146. type=int,
  147. default=LOGGING_ROTATE_BYTES,
  148. help="Specify the max bytes for rotating "
  149. "log file, default is {} bytes.".format(LOGGING_ROTATE_BYTES),
  150. )
  151. parser.add_argument(
  152. "--logging-rotate-backup-count",
  153. required=False,
  154. type=int,
  155. default=LOGGING_ROTATE_BACKUP_COUNT,
  156. help="Specify the backup count of rotated log file, default is {}.".format(
  157. LOGGING_ROTATE_BACKUP_COUNT
  158. ),
  159. )
  160. parser.add_argument(
  161. "--log-dir",
  162. required=True,
  163. type=str,
  164. default=None,
  165. help="Specify the path of log directory.",
  166. )
  167. parser.add_argument(
  168. "--temp-dir",
  169. required=True,
  170. type=str,
  171. default=None,
  172. help="Specify the path of the temporary directory use by Ray process.",
  173. )
  174. parser.add_argument(
  175. "--session-dir",
  176. required=True,
  177. type=str,
  178. default=None,
  179. help="Specify the path of the session directory of the cluster.",
  180. )
  181. parser.add_argument(
  182. "--minimal",
  183. action="store_true",
  184. help=(
  185. "Minimal dashboard only contains a subset of features that don't "
  186. "require additional dependencies installed when ray is installed "
  187. "by `pip install ray[default]`."
  188. ),
  189. )
  190. parser.add_argument(
  191. "--modules-to-load",
  192. required=False,
  193. default=None,
  194. help=(
  195. "Specify the list of module names in [module_1],[module_2] format."
  196. "E.g., JobHead,StateHead... "
  197. "If nothing is specified, all modules are loaded."
  198. ),
  199. )
  200. parser.add_argument(
  201. "--disable-frontend",
  202. action="store_true",
  203. help=("If configured, frontend html is not served from the server."),
  204. )
  205. parser.add_argument(
  206. "--stdout-filepath",
  207. required=False,
  208. type=str,
  209. default="",
  210. help="The filepath to dump dashboard stdout.",
  211. )
  212. parser.add_argument(
  213. "--stderr-filepath",
  214. required=False,
  215. type=str,
  216. default="",
  217. help="The filepath to dump dashboard stderr.",
  218. )
  219. args = parser.parse_args()
  220. try:
  221. # Disable log rotation for windows platform.
  222. logging_rotation_bytes = (
  223. args.logging_rotate_bytes if sys.platform != "win32" else 0
  224. )
  225. logging_rotation_backup_count = (
  226. args.logging_rotate_backup_count if sys.platform != "win32" else 1
  227. )
  228. setup_component_logger(
  229. logging_level=args.logging_level,
  230. logging_format=args.logging_format,
  231. log_dir=args.log_dir,
  232. filename=args.logging_filename,
  233. max_bytes=logging_rotation_bytes,
  234. backup_count=logging_rotation_backup_count,
  235. )
  236. # Setup stdout/stderr redirect files if redirection enabled.
  237. logging_utils.redirect_stdout_stderr_if_needed(
  238. args.stdout_filepath,
  239. args.stderr_filepath,
  240. logging_rotation_bytes,
  241. logging_rotation_backup_count,
  242. )
  243. if args.modules_to_load:
  244. modules_to_load = set(args.modules_to_load.strip(" ,").split(","))
  245. else:
  246. # None == default.
  247. modules_to_load = None
  248. loop = get_or_create_event_loop()
  249. dashboard = Dashboard(
  250. host=args.host,
  251. port=args.port,
  252. port_retries=args.port_retries,
  253. gcs_address=args.gcs_address,
  254. cluster_id_hex=args.cluster_id_hex,
  255. node_ip_address=args.node_ip_address,
  256. log_dir=args.log_dir,
  257. logging_level=args.logging_level,
  258. logging_format=args.logging_format,
  259. logging_filename=args.logging_filename,
  260. logging_rotate_bytes=logging_rotation_bytes,
  261. logging_rotate_backup_count=logging_rotation_backup_count,
  262. temp_dir=args.temp_dir,
  263. session_dir=args.session_dir,
  264. minimal=args.minimal,
  265. serve_frontend=(not args.disable_frontend),
  266. modules_to_load=modules_to_load,
  267. )
  268. def sigterm_handler():
  269. logger.warning("Exiting with SIGTERM immediately...")
  270. os._exit(signal.SIGTERM)
  271. if sys.platform != "win32":
  272. # TODO(rickyyx): we currently do not have any logic for actual
  273. # graceful termination in the dashboard. Most of the underlying
  274. # async tasks run by the dashboard head doesn't handle CancelledError.
  275. # So a truly graceful shutdown is not trivial w/o much refactoring.
  276. # Re-open the issue: https://github.com/ray-project/ray/issues/25518
  277. # if a truly graceful shutdown is required.
  278. loop.add_signal_handler(signal.SIGTERM, sigterm_handler)
  279. loop.run_until_complete(dashboard.run())
  280. except Exception as e:
  281. traceback_str = format_error_message(traceback.format_exc())
  282. message = (
  283. f"The dashboard on node {platform.uname()[1]} "
  284. f"failed with the following "
  285. f"error:\n{traceback_str}"
  286. )
  287. if isinstance(e, dashboard_utils.FrontendNotFoundError):
  288. logger.warning(message)
  289. else:
  290. logger.error(message)
  291. raise e
  292. # Something went wrong, so push an error to all drivers.
  293. publish_error_to_driver(
  294. ray_constants.DASHBOARD_DIED_ERROR,
  295. message,
  296. gcs_client=ray._raylet.GcsClient(address=args.gcs_address),
  297. )