"""Constants and environment-driven settings for the Ray dashboard and its agent."""
import os

from ray._private.ray_constants import env_bool, env_float, env_integer

DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX = "DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX:"
DASHBOARD_AGENT_ADDR_IP_PREFIX = "DASHBOARD_AGENT_ADDR_IP_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME = (
    "RAY_DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S"  # noqa
)
# The default is fractional, so the override is read with env_float (the
# helper this module uses for its other fractional-second settings) rather
# than env_integer.
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S = env_float(
    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME, 0.4
)
# The maximum time that parent can be considered
# as dead before agent kills itself.
# NOTE: the "THREASHOLD" spelling is kept as-is; renaming the constant would
# break any code that already refers to it by this name.
_PARENT_DEATH_THREASHOLD = 5

RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME = "RAY_STATE_SERVER_MAX_HTTP_REQUEST"
# Default number of in-progress requests to the state api server.
RAY_STATE_SERVER_MAX_HTTP_REQUEST = env_integer(
    RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME, 100
)
# Upper bound on the number of in-progress requests that may be configured.
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED = 1000

RAY_DASHBOARD_STATS_PURGING_INTERVAL = env_integer(
    "RAY_DASHBOARD_STATS_PURGING_INTERVAL", 60 * 10
)
RAY_DASHBOARD_STATS_UPDATING_INTERVAL = env_integer(
    "RAY_DASHBOARD_STATS_UPDATING_INTERVAL", 15
)

GCS_SERVER_ADDRESS = "GcsServerAddress"
# GCS check alive
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer("GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
GCS_RPC_TIMEOUT_SECONDS = env_integer("RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS", 60)

# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
AIOHTTP_CACHE_MAX_SIZE = 128
AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"

# Default values for language and job id (the default values in protobuf)
DEFAULT_LANGUAGE = "PYTHON"
DEFAULT_JOB_ID = "ffff"

# Hook that is invoked on the dashboard `/api/component_activities` endpoint.
# Environment variable stored here should be a callable that does not
# take any arguments and should return a dictionary mapping
# activity component type (str) to
# ray.dashboard.modules.api.api_head.RayActivityResponse.
# Example: "your.module.ray_cluster_activity_hook".
RAY_CLUSTER_ACTIVITY_HOOK = "RAY_CLUSTER_ACTIVITY_HOOK"

# The number of candidate agents (an override below 1 is clamped to 1).
CANDIDATE_AGENT_NUMBER = max(env_integer("CANDIDATE_AGENT_NUMBER", 1), 1)

# When the head receives a JobSubmitRequest, there may be no agent available
# yet, so we need to wait for agents on other nodes to start.
WAIT_AVAILABLE_AGENT_TIMEOUT = 10
TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS = 0.5
RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR = "RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES"
RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR = (
    "RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG"
)

# The max time to wait for the JobSupervisor to start before failing the job.
DEFAULT_JOB_START_TIMEOUT_SECONDS = 60 * 15
RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR = "RAY_JOB_START_TIMEOUT_SECONDS"

# Port that dashboard prometheus metrics will be exported to
DASHBOARD_METRIC_PORT = env_integer("DASHBOARD_METRIC_PORT", 44227)

# We use RayNodeType to mark head/worker nodes. IsHeadNode is retained
# for backward compatibility for user-customized dashboards that might rely on it
NODE_TAG_KEYS = ["ip", "Version", "SessionName", "IsHeadNode", "RayNodeType"]
GPU_TAG_KEYS = NODE_TAG_KEYS + ["GpuDeviceName", "GpuIndex"]

# TpuDeviceName and TpuIndex are expected to be equal to the number of TPU
# chips in the cluster. TpuType and TpuTopology are proportional to the number
# of node pools.
TPU_TAG_KEYS = NODE_TAG_KEYS + ["TpuDeviceName", "TpuIndex", "TpuType", "TpuTopology"]
CLUSTER_TAG_KEYS = ["node_type", "Version", "SessionName"]
COMPONENT_METRICS_TAG_KEYS = ["ip", "pid", "Version", "Component", "SessionName"]
COMPONENT_GPU_TAG_KEYS = GPU_TAG_KEYS + COMPONENT_METRICS_TAG_KEYS

# Dashboard metrics are tracked separately at the dashboard. TODO(sang): Support GCS.
# Note that for dashboard subprocess module, the component name is
# "dashboard_[module_name]".
AVAILABLE_COMPONENT_NAMES_FOR_METRICS = {
    "workers",
    "raylet",
    "agent",
    "dashboard",
    "gcs",
}

METRICS_INPUT_ROOT = os.path.join(
    os.path.dirname(__file__), "modules", "metrics", "export"
)
METRICS_RECORD_INTERVAL_S = env_integer("METRICS_RECORD_INTERVAL_S", 5)
PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
    METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
)

PARENT_HEALTH_CHECK_BY_PIPE = env_bool(
    "RAY_enable_pipe_based_agent_to_parent_health_check", False
)

# Maximum time to wait for the subprocess module to be ready.
SUBPROCESS_MODULE_WAIT_READY_TIMEOUT = env_float(
    "RAY_DASHBOARD_SUBPROCESS_MODULE_WAIT_READY_TIMEOUT", 30.0
)

# Timeout for graceful shutdown of subprocess module.
SUBPROCESS_MODULE_GRACEFUL_SHUTDOWN_TIMEOUT = env_float(
    "RAY_DASHBOARD_SUBPROCESS_MODULE_GRACEFUL_SHUTDOWN_TIMEOUT", 5.0
)

# Timeout to wait for subprocess to join (after force kill or when already dead).
SUBPROCESS_MODULE_JOIN_TIMEOUT = env_float(
    "RAY_DASHBOARD_SUBPROCESS_MODULE_JOIN_TIMEOUT", 2.0
)