consts.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. import os
  2. from ray._private.ray_constants import env_bool, env_float, env_integer
  3. DASHBOARD_LOG_FILENAME = "dashboard.log"
  4. DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX = "DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX:"
  5. DASHBOARD_AGENT_ADDR_IP_PREFIX = "DASHBOARD_AGENT_ADDR_IP_PREFIX:"
  6. DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
  7. DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME = (
  8. "RAY_DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S" # noqa
  9. )
  10. DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S = env_integer(
  11. DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME, 0.4
  12. )
  13. # The maximum time that parent can be considered
  14. # as dead before agent kills itself.
  15. _PARENT_DEATH_THREASHOLD = 5
  16. RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME = "RAY_STATE_SERVER_MAX_HTTP_REQUEST"
  17. # Default number of in-progress requests to the state api server.
  18. RAY_STATE_SERVER_MAX_HTTP_REQUEST = env_integer(
  19. RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME, 100
  20. )
  21. # Max allowed number of in-progress requests could be configured.
  22. RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED = 1000
  23. RAY_DASHBOARD_STATS_PURGING_INTERVAL = env_integer(
  24. "RAY_DASHBOARD_STATS_PURGING_INTERVAL", 60 * 10
  25. )
  26. RAY_DASHBOARD_STATS_UPDATING_INTERVAL = env_integer(
  27. "RAY_DASHBOARD_STATS_UPDATING_INTERVAL", 15
  28. )
  29. GCS_SERVER_ADDRESS = "GcsServerAddress"
  30. # GCS check alive
  31. GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer("GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
  32. GCS_RPC_TIMEOUT_SECONDS = env_integer("RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS", 60)
  33. # aiohttp_cache
  34. AIOHTTP_CACHE_TTL_SECONDS = 2
  35. AIOHTTP_CACHE_MAX_SIZE = 128
  36. AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
  37. # Default value for datacenter (the default value in protobuf)
  38. DEFAULT_LANGUAGE = "PYTHON"
  39. DEFAULT_JOB_ID = "ffff"
  40. # Hook that is invoked on the dashboard `/api/component_activities` endpoint.
  41. # Environment variable stored here should be a callable that does not
  42. # take any arguments and should return a dictionary mapping
  43. # activity component type (str) to
  44. # ray.dashboard.modules.api.api_head.RayActivityResponse.
  45. # Example: "your.module.ray_cluster_activity_hook".
  46. RAY_CLUSTER_ACTIVITY_HOOK = "RAY_CLUSTER_ACTIVITY_HOOK"
  47. # The number of candidate agents
  48. CANDIDATE_AGENT_NUMBER = max(env_integer("CANDIDATE_AGENT_NUMBER", 1), 1)
  49. # when head receive JobSubmitRequest, maybe not any agent is available,
  50. # we need to wait for agents in other node start
  51. WAIT_AVAILABLE_AGENT_TIMEOUT = 10
  52. TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS = 0.5
  53. RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR = "RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES"
  54. RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR = (
  55. "RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG"
  56. )
  57. # The max time to wait for the JobSupervisor to start before failing the job.
  58. DEFAULT_JOB_START_TIMEOUT_SECONDS = 60 * 15
  59. RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR = "RAY_JOB_START_TIMEOUT_SECONDS"
  60. # Port that dashboard prometheus metrics will be exported to
  61. DASHBOARD_METRIC_PORT = env_integer("DASHBOARD_METRIC_PORT", 44227)
  62. # We use RayNodeType to mark head/worker nodes. IsHeadNode is retained
  63. # for backward compatibility for user-customized dashboards that might rely on it
  64. NODE_TAG_KEYS = ["ip", "Version", "SessionName", "IsHeadNode", "RayNodeType"]
  65. GPU_TAG_KEYS = NODE_TAG_KEYS + ["GpuDeviceName", "GpuIndex"]
  66. # TpuDeviceName and TpuIndex are expected to be equal to the number of TPU
  67. # chips in the cluster. TpuType and TpuTopology are proportional to the number
  68. # of node pools.
  69. TPU_TAG_KEYS = NODE_TAG_KEYS + ["TpuDeviceName", "TpuIndex", "TpuType", "TpuTopology"]
  70. CLUSTER_TAG_KEYS = ["node_type", "Version", "SessionName"]
  71. COMPONENT_METRICS_TAG_KEYS = ["ip", "pid", "Version", "Component", "SessionName"]
  72. COMPONENT_GPU_TAG_KEYS = GPU_TAG_KEYS + COMPONENT_METRICS_TAG_KEYS
  73. # Dashboard metrics are tracked separately at the dashboard. TODO(sang): Support GCS.
  74. # Note that for dashboard subprocess module, the component name is "dashboard_[module_name]".
  75. AVAILABLE_COMPONENT_NAMES_FOR_METRICS = {
  76. "workers",
  77. "raylet",
  78. "agent",
  79. "dashboard",
  80. "gcs",
  81. }
  82. METRICS_INPUT_ROOT = os.path.join(
  83. os.path.dirname(__file__), "modules", "metrics", "export"
  84. )
  85. METRICS_RECORD_INTERVAL_S = env_integer("METRICS_RECORD_INTERVAL_S", 5)
  86. PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
  87. METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
  88. )
  89. PARENT_HEALTH_CHECK_BY_PIPE = env_bool(
  90. "RAY_enable_pipe_based_agent_to_parent_health_check", False
  91. )
  92. # Maximum time to wait for the subprocess module to be ready.
  93. SUBPROCESS_MODULE_WAIT_READY_TIMEOUT = env_float(
  94. "RAY_DASHBOARD_SUBPROCESS_MODULE_WAIT_READY_TIMEOUT", 30.0
  95. )
  96. # Timeout for graceful shutdown of subprocess module.
  97. SUBPROCESS_MODULE_GRACEFUL_SHUTDOWN_TIMEOUT = env_float(
  98. "RAY_DASHBOARD_SUBPROCESS_MODULE_GRACEFUL_SHUTDOWN_TIMEOUT", 5.0
  99. )
  100. # Timeout to wait for subprocess to join (after force kill or when already dead).
  101. SUBPROCESS_MODULE_JOIN_TIMEOUT = env_float(
  102. "RAY_DASHBOARD_SUBPROCESS_MODULE_JOIN_TIMEOUT", 2.0
  103. )