ray_constants.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. """Ray constants used in the Python code."""
  2. import json
  3. import logging
  4. import os
  5. import sys
  6. logger = logging.getLogger(__name__)
  7. def env_integer(key, default):
  8. if key in os.environ:
  9. value = os.environ[key]
  10. if value.isdigit():
  11. return int(os.environ[key])
  12. logger.debug(
  13. f"Found {key} in environment, but value must "
  14. f"be an integer. Got: {value}. Returning "
  15. f"provided default {default}."
  16. )
  17. return default
  18. return default
  19. def env_float(key, default):
  20. if key in os.environ:
  21. value = os.environ[key]
  22. try:
  23. return float(value)
  24. except ValueError:
  25. logger.debug(
  26. f"Found {key} in environment, but value must "
  27. f"be a float. Got: {value}. Returning "
  28. f"provided default {default}."
  29. )
  30. return default
  31. return default
  32. def env_bool(key, default):
  33. if key in os.environ:
  34. return (
  35. True
  36. if os.environ[key].lower() == "true" or os.environ[key] == "1"
  37. else False
  38. )
  39. return default
  40. def env_set_by_user(key):
  41. return key in os.environ
  42. # Whether event logging to driver is enabled. Set to 0 to disable.
  43. AUTOSCALER_EVENTS = env_integer("RAY_SCHEDULER_EVENTS", 1)
  44. # Whether to disable the C++ failure signal handler that provides stack traces
  45. # on crashes. Disabling this is necessary when using Java libraries
  46. # because Ray's signal handler conflicts with the JVM's signal handling.
  47. RAY_DISABLE_FAILURE_SIGNAL_HANDLER = env_bool(
  48. "RAY_DISABLE_FAILURE_SIGNAL_HANDLER", False
  49. )
  50. RAY_LOG_TO_DRIVER = env_bool("RAY_LOG_TO_DRIVER", True)
  51. # Filter level under which events will be filtered out, i.e. not printing to driver
  52. RAY_LOG_TO_DRIVER_EVENT_LEVEL = os.environ.get("RAY_LOG_TO_DRIVER_EVENT_LEVEL", "INFO")
  53. # Internal kv keys for storing monitor debug status.
  54. DEBUG_AUTOSCALING_ERROR = "__autoscaling_error"
  55. DEBUG_AUTOSCALING_STATUS = "__autoscaling_status"
  56. DEBUG_AUTOSCALING_STATUS_LEGACY = "__autoscaling_status_legacy"
  57. ID_SIZE = 28
  58. # The following constants are used to create default values for
  59. # resource isolation when it is enabled.
  60. # TODO(54703): Link to OSS documentation about the feature once it's available.
  61. DEFAULT_CGROUP_PATH = "/sys/fs/cgroup"
  62. # The default proportion of cpu cores to reserve for ray system processes.
  63. DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION = env_float(
  64. "RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION", 0.05
  65. )
  66. # The default minimum number of cpu cores to reserve for ray system processes.
  67. # This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION < this value.
  68. DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES = env_float(
  69. "RAY_DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES", 1.0
  70. )
  71. # The default maximum number of cpu cores to reserve for ray system processes.
  72. # This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION > this value.
  73. DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES = env_float(
  74. "RAY_DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES", 3.0
  75. )
  76. # The values for SYSTEM_RESERVED_MEMORY do not include the memory reserveed
  77. # for the object store.
  78. # The default proportion available memory to reserve for ray system processes.
  79. DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_float(
  80. "RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10
  81. )
  82. # The default minimum number of bytes to reserve for ray system processes.
  83. # This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION < this value.
  84. DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
  85. "RAY_DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES", (500) * (1024**2)
  86. )
  87. # The default maximum number of bytes to reserve for ray system processes.
  88. # This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION > this value.
  89. DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
  90. "RAY_DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES", (10) * (1024**3)
  91. )
  92. # The default maximum number of bytes to allocate to the object store unless
  93. # overridden by the user.
  94. DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = env_integer(
  95. "RAY_DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES", (200) * (10**9) # 200 GB
  96. )
  97. # The default proportion of available memory allocated to the object store
  98. DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = env_float(
  99. "RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION",
  100. 0.3,
  101. )
  102. # The smallest cap on the memory used by the object store that we allow.
  103. # This must be greater than MEMORY_RESOURCE_UNIT_BYTES
  104. OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
  105. # Each ObjectRef currently uses about 3KB of caller memory.
  106. CALLER_MEMORY_USAGE_PER_OBJECT_REF = 3000
  107. # Above this number of bytes, raise an error by default unless the user sets
  108. # RAY_ALLOW_SLOW_STORAGE=1. This avoids swapping with large object stores.
  109. REQUIRE_SHM_SIZE_THRESHOLD = 10**10
  110. # Mac with 16GB memory has degraded performance when the object store size is
  111. # greater than 2GB.
  112. # (see https://github.com/ray-project/ray/issues/20388 for details)
  113. # The workaround here is to limit capacity to 2GB for Mac by default,
  114. # and raise error if the capacity is overwritten by user.
  115. MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT = (2) * (2**30)
  116. # If a user does not specify a port for the primary Ray service,
  117. # we attempt to start the service running at this port.
  118. DEFAULT_PORT = 6379
  119. RAY_ADDRESS_ENVIRONMENT_VARIABLE = "RAY_ADDRESS"
  120. RAY_API_SERVER_ADDRESS_ENVIRONMENT_VARIABLE = "RAY_API_SERVER_ADDRESS"
  121. RAY_NAMESPACE_ENVIRONMENT_VARIABLE = "RAY_NAMESPACE"
  122. RAY_RUNTIME_ENV_ENVIRONMENT_VARIABLE = "RAY_RUNTIME_ENV"
  123. RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR = (
  124. "RAY_RUNTIME_ENV_TEMPORARY_REFERENCE_EXPIRATION_S"
  125. )
  126. # Ray populates this env var to the working dir in the creation of a runtime env.
  127. # For example, `pip` and `conda` users can use this environment variable to locate the
  128. # `requirements.txt` file.
  129. RAY_RUNTIME_ENV_CREATE_WORKING_DIR_ENV_VAR = "RAY_RUNTIME_ENV_CREATE_WORKING_DIR"
  130. # Defaults to 10 minutes. This should be longer than the total time it takes for
  131. # the local working_dir and py_modules to be uploaded, or these files might get
  132. # garbage collected before the job starts.
  133. RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT = 10 * 60
  134. # If set to 1, then `.gitignore` files will not be parsed and loaded into "excludes"
  135. # when using a local working_dir or py_modules.
  136. RAY_RUNTIME_ENV_IGNORE_GITIGNORE = "RAY_RUNTIME_ENV_IGNORE_GITIGNORE"
  137. # Default directories to exclude when packaging working_dir.
  138. # Override by setting the RAY_OVERRIDE_RUNTIME_ENV_DEFAULT_EXCLUDES
  139. # (comma-separated) environment variable. Set to an empty string to disable.
  140. # `.git` is necessary since it is never in .gitignore.
  141. RAY_RUNTIME_ENV_DEFAULT_EXCLUDES = ".git,.venv,venv,__pycache__"
  142. def get_runtime_env_default_excludes() -> list[str]:
  143. """Get default excludes for working_dir, overridable via RAY_OVERRIDE_RUNTIME_ENV_DEFAULT_EXCLUDES environment variable."""
  144. val = os.environ.get(
  145. "RAY_OVERRIDE_RUNTIME_ENV_DEFAULT_EXCLUDES", RAY_RUNTIME_ENV_DEFAULT_EXCLUDES
  146. )
  147. return [x.strip() for x in val.split(",") if x.strip()]
  148. # Hook for running a user-specified runtime-env hook. This hook will be called
  149. # unconditionally given the runtime_env dict passed for ray.init. It must return
  150. # a rewritten runtime_env dict. Example: "your.module.runtime_env_hook".
  151. RAY_RUNTIME_ENV_HOOK = "RAY_RUNTIME_ENV_HOOK"
  152. # Hook that is invoked on `ray start`. It will be given the cluster parameters and
  153. # whether we are the head node as arguments. The function can modify the params class,
  154. # but otherwise returns void. Example: "your.module.ray_start_hook".
  155. RAY_START_HOOK = "RAY_START_HOOK"
  156. # Hook that is invoked on `ray job submit`. It will be given all the same args as the
  157. # job.cli.submit() function gets, passed as kwargs to this function.
  158. RAY_JOB_SUBMIT_HOOK = "RAY_JOB_SUBMIT_HOOK"
  159. # Headers to pass when using the Job CLI. It will be given to
  160. # instantiate a Job SubmissionClient.
  161. RAY_JOB_HEADERS = "RAY_JOB_HEADERS"
  162. # Timeout waiting for the dashboard to come alive during node startup.
  163. RAY_DASHBOARD_STARTUP_TIMEOUT_S = env_integer("RAY_DASHBOARD_STARTUP_TIMEOUT_S", 60)
  164. DEFAULT_DASHBOARD_IP = "127.0.0.1"
  165. DEFAULT_DASHBOARD_PORT = 8265
  166. DASHBOARD_ADDRESS = "dashboard"
  167. DASHBOARD_CLIENT_MAX_SIZE = 100 * 1024**2
  168. PROMETHEUS_SERVICE_DISCOVERY_FILE = "prom_metrics_service_discovery.json"
  169. DEFAULT_DASHBOARD_AGENT_LISTEN_PORT = 52365
  170. # Default resource requirements for actors when no resource requirements are
  171. # specified.
  172. DEFAULT_ACTOR_METHOD_CPU_SIMPLE = 1
  173. DEFAULT_ACTOR_CREATION_CPU_SIMPLE = 0
  174. # Default resource requirements for actors when some resource requirements are
  175. # specified in .
  176. DEFAULT_ACTOR_METHOD_CPU_SPECIFIED = 0
  177. DEFAULT_ACTOR_CREATION_CPU_SPECIFIED = 1
  178. # Default number of return values for each actor method.
  179. DEFAULT_ACTOR_METHOD_NUM_RETURN_VALS = 1
  180. # Wait 30 seconds for client to reconnect after unexpected disconnection
  181. DEFAULT_CLIENT_RECONNECT_GRACE_PERIOD = 30
  182. # If a remote function or actor (or some other export) has serialized size
  183. # greater than this quantity, print an warning.
  184. FUNCTION_SIZE_WARN_THRESHOLD = 10**7
  185. FUNCTION_SIZE_ERROR_THRESHOLD = env_integer("FUNCTION_SIZE_ERROR_THRESHOLD", (10**8))
  186. # If remote functions with the same source are imported this many times, then
  187. # print a warning.
  188. DUPLICATE_REMOTE_FUNCTION_THRESHOLD = 100
  189. # The maximum resource quantity that is allowed. TODO(rkn): This could be
  190. # relaxed, but the current implementation of the node manager will be slower
  191. # for large resource quantities due to bookkeeping of specific resource IDs.
  192. MAX_RESOURCE_QUANTITY = 100e12
  193. # Number of units 1 resource can be subdivided into.
  194. MIN_RESOURCE_GRANULARITY = 0.0001
  195. # Set this environment variable to populate the dashboard URL with
  196. # an external hosted Ray dashboard URL (e.g. because the
  197. # dashboard is behind a proxy or load balancer). This only overrides
  198. # the dashboard URL when returning or printing to a user through a public
  199. # API, but not in the internal KV store.
  200. RAY_OVERRIDE_DASHBOARD_URL = "RAY_OVERRIDE_DASHBOARD_URL"
  201. # Different types of Ray errors that can be pushed to the driver.
  202. # TODO(rkn): These should be defined in flatbuffers and must be synced with
  203. # the existing C++ definitions.
  204. PICKLING_LARGE_OBJECT_PUSH_ERROR = "pickling_large_object"
  205. WAIT_FOR_FUNCTION_PUSH_ERROR = "wait_for_function"
  206. VERSION_MISMATCH_PUSH_ERROR = "version_mismatch"
  207. WORKER_CRASH_PUSH_ERROR = "worker_crash"
  208. WORKER_DIED_PUSH_ERROR = "worker_died"
  209. WORKER_POOL_LARGE_ERROR = "worker_pool_large"
  210. PUT_RECONSTRUCTION_PUSH_ERROR = "put_reconstruction"
  211. RESOURCE_DEADLOCK_ERROR = "resource_deadlock"
  212. REMOVED_NODE_ERROR = "node_removed"
  213. MONITOR_DIED_ERROR = "monitor_died"
  214. LOG_MONITOR_DIED_ERROR = "log_monitor_died"
  215. DASHBOARD_AGENT_DIED_ERROR = "dashboard_agent_died"
  216. DASHBOARD_DIED_ERROR = "dashboard_died"
  217. RAYLET_DIED_ERROR = "raylet_died"
  218. DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR = "detached_actor_anonymous_namespace"
  219. EXCESS_QUEUEING_WARNING = "excess_queueing_warning"
  220. # Used by autoscaler to set the node custom resources and labels
  221. # from cluster.yaml.
  222. RESOURCES_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_RESOURCES"
  223. LABELS_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_LABELS"
  224. # Temporary flag to disable log processing in the dashboard. This is useful
  225. # if the dashboard is overloaded by logs and failing to process other
  226. # dashboard API requests (e.g. Job Submission).
  227. DISABLE_DASHBOARD_LOG_INFO = env_integer("RAY_DISABLE_DASHBOARD_LOG_INFO", 0)
  228. LOGGER_FORMAT = "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
  229. LOGGER_FORMAT_ESCAPE = json.dumps(LOGGER_FORMAT.replace("%", "%%"))
  230. LOGGER_FORMAT_HELP = f"The logging format. default={LOGGER_FORMAT_ESCAPE}"
  231. # Configure the default logging levels for various Ray components.
  232. # TODO (kevin85421): Currently, I don't encourage Ray users to configure
  233. # `RAY_LOGGER_LEVEL` until its scope and expected behavior are clear and
  234. # easy to understand. Now, only Ray developers should use it.
  235. LOGGER_LEVEL = os.environ.get("RAY_LOGGER_LEVEL", "info")
  236. LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"]
  237. LOGGER_LEVEL_HELP = (
  238. "The logging level threshold, choices=['debug', 'info',"
  239. " 'warning', 'error', 'critical'], default='info'"
  240. )
  241. LOGGING_REDIRECT_STDERR_ENVIRONMENT_VARIABLE = "RAY_LOG_TO_STDERR"
  242. # Logging format when logging stderr. This should be formatted with the
  243. # component before setting the formatter, e.g. via
  244. # format = LOGGER_FORMAT_STDERR.format(component="dashboard")
  245. # handler.setFormatter(logging.Formatter(format))
  246. LOGGER_FORMAT_STDERR = (
  247. "%(asctime)s\t%(levelname)s ({component}) %(filename)s:%(lineno)s -- %(message)s"
  248. )
  249. # Constants used to define the different process types.
  250. PROCESS_TYPE_REAPER = "reaper"
  251. PROCESS_TYPE_MONITOR = "monitor"
  252. PROCESS_TYPE_RAY_CLIENT_SERVER = "ray_client_server"
  253. PROCESS_TYPE_LOG_MONITOR = "log_monitor"
  254. PROCESS_TYPE_DASHBOARD = "dashboard"
  255. PROCESS_TYPE_DASHBOARD_AGENT = "dashboard_agent"
  256. PROCESS_TYPE_RUNTIME_ENV_AGENT = "runtime_env_agent"
  257. PROCESS_TYPE_WORKER = "worker"
  258. PROCESS_TYPE_RAYLET = "raylet"
  259. PROCESS_TYPE_REDIS_SERVER = "redis_server"
  260. PROCESS_TYPE_GCS_SERVER = "gcs_server"
  261. PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER = "python-core-driver"
  262. PROCESS_TYPE_PYTHON_CORE_WORKER = "python-core-worker"
  263. # Log file names
  264. MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log"
  265. LOG_MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_LOG_MONITOR}.log"
  266. # Enable log deduplication.
  267. RAY_DEDUP_LOGS = env_bool("RAY_DEDUP_LOGS", True)
  268. # How many seconds of messages to buffer for log deduplication.
  269. RAY_DEDUP_LOGS_AGG_WINDOW_S = env_integer("RAY_DEDUP_LOGS_AGG_WINDOW_S", 5)
  270. # Regex for log messages to never deduplicate, or None. This takes precedence over
  271. # the skip regex below. A default pattern is set for testing.
  272. TESTING_NEVER_DEDUP_TOKEN = "__ray_testing_never_deduplicate__"
  273. RAY_DEDUP_LOGS_ALLOW_REGEX = os.environ.get(
  274. "RAY_DEDUP_LOGS_ALLOW_REGEX", TESTING_NEVER_DEDUP_TOKEN
  275. )
  276. # Regex for log messages to always skip / suppress, or None.
  277. RAY_DEDUP_LOGS_SKIP_REGEX = os.environ.get("RAY_DEDUP_LOGS_SKIP_REGEX")
  278. AGENT_PROCESS_TYPE_DASHBOARD_AGENT = "ray::DashboardAgent"
  279. AGENT_PROCESS_TYPE_RUNTIME_ENV_AGENT = "ray::RuntimeEnvAgent"
  280. AGENT_PROCESS_LIST = [
  281. AGENT_PROCESS_TYPE_DASHBOARD_AGENT,
  282. AGENT_PROCESS_TYPE_RUNTIME_ENV_AGENT,
  283. ]
  284. WORKER_PROCESS_TYPE_IDLE_WORKER = "ray::IDLE"
  285. WORKER_PROCESS_TYPE_SPILL_WORKER_NAME = "SpillWorker"
  286. WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME = "RestoreWorker"
  287. WORKER_PROCESS_TYPE_SPILL_WORKER_IDLE = (
  288. f"ray::IDLE_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
  289. )
  290. WORKER_PROCESS_TYPE_RESTORE_WORKER_IDLE = (
  291. f"ray::IDLE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
  292. )
  293. WORKER_PROCESS_TYPE_SPILL_WORKER = f"ray::SPILL_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
  294. WORKER_PROCESS_TYPE_RESTORE_WORKER = (
  295. f"ray::RESTORE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
  296. )
  297. WORKER_PROCESS_TYPE_SPILL_WORKER_DELETE = (
  298. f"ray::DELETE_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
  299. )
  300. WORKER_PROCESS_TYPE_RESTORE_WORKER_DELETE = (
  301. f"ray::DELETE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
  302. )
  303. # The number of files the log monitor will open. If more files exist, they will
  304. # be ignored.
  305. LOG_MONITOR_MAX_OPEN_FILES = int(
  306. os.environ.get("RAY_LOG_MONITOR_MAX_OPEN_FILES", "200")
  307. )
  308. # The maximum batch of lines to be read in a single iteration. We _always_ try
  309. # to read this number of lines even if there aren't any new lines.
  310. LOG_MONITOR_NUM_LINES_TO_READ = int(
  311. os.environ.get("RAY_LOG_MONITOR_NUM_LINES_TO_READ", "1000")
  312. )
  313. # Autoscaler events are denoted by the ":event_summary:" magic token.
  314. LOG_PREFIX_EVENT_SUMMARY = ":event_summary:"
  315. # Cluster-level info events are denoted by the ":info_message:" magic token. These may
  316. # be emitted in the stderr of Ray components.
  317. LOG_PREFIX_INFO_MESSAGE = ":info_message:"
  318. # Actor names are recorded in the logs with this magic token as a prefix.
  319. LOG_PREFIX_ACTOR_NAME = ":actor_name:"
  320. # Task names are recorded in the logs with this magic token as a prefix.
  321. LOG_PREFIX_TASK_NAME = ":task_name:"
  322. # Job ids are recorded in the logs with this magic token as a prefix.
  323. LOG_PREFIX_JOB_ID = ":job_id:"
  324. # The object metadata field uses the following format: It is a comma
  325. # separated list of fields. The first field is mandatory and is the
  326. # type of the object (see types below) or an integer, which is interpreted
  327. # as an error value. The second part is optional and if present has the
  328. # form DEBUG:<breakpoint_id>, it is used for implementing the debugger.
  329. # A constant used as object metadata to indicate the object is cross language.
  330. OBJECT_METADATA_TYPE_CROSS_LANGUAGE = b"XLANG"
  331. # A constant used as object metadata to indicate the object is python specific.
  332. OBJECT_METADATA_TYPE_PYTHON = b"PYTHON"
  333. # A constant used as object metadata to indicate the object is raw bytes.
  334. OBJECT_METADATA_TYPE_RAW = b"RAW"
  335. # A constant used as object metadata to indicate the object is an actor handle.
  336. # This value should be synchronized with the Java definition in
  337. # ObjectSerializer.java
  338. # TODO(fyrestone): Serialize the ActorHandle via the custom type feature
  339. # of XLANG.
  340. OBJECT_METADATA_TYPE_ACTOR_HANDLE = b"ACTOR_HANDLE"
  341. # A constant indicating the debugging part of the metadata (see above).
  342. OBJECT_METADATA_DEBUG_PREFIX = b"DEBUG:"
  343. AUTOSCALER_RESOURCE_REQUEST_CHANNEL = b"autoscaler_resource_request"
  344. REDIS_DEFAULT_USERNAME = ""
  345. REDIS_DEFAULT_PASSWORD = ""
  346. # The Mach kernel page size in bytes.
  347. MACH_PAGE_SIZE_BYTES = 4096
  348. # The max number of bytes for task execution error message.
  349. MAX_APPLICATION_ERROR_LENGTH = env_integer("RAY_MAX_APPLICATION_ERROR_LENGTH", 500)
  350. # Max 64 bit integer value, which is needed to ensure against overflow
  351. # in C++ when passing integer values cross-language.
  352. MAX_INT64_VALUE = 9223372036854775807
  353. # Object Spilling related constants
  354. DEFAULT_OBJECT_PREFIX = "ray_spilled_objects"
  355. GCS_PORT_ENVIRONMENT_VARIABLE = "RAY_GCS_SERVER_PORT"
  356. HEALTHCHECK_EXPIRATION_S = os.environ.get("RAY_HEALTHCHECK_EXPIRATION_S", 10)
  357. # Filename of "shim process" that sets up Python worker environment.
  358. # Should be kept in sync with kSetupWorkerFilename in
  359. # src/ray/common/constants.h.
  360. SETUP_WORKER_FILENAME = "setup_worker.py"
  361. # Directory name where runtime_env resources will be created & cached.
  362. DEFAULT_RUNTIME_ENV_DIR_NAME = "runtime_resources"
  363. # The timeout seconds for the creation of runtime env,
  364. # dafault timeout is 10 minutes
  365. DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS = 600
  366. # The timeout seconds for the GCS server request.
  367. # Try fetching from the cpp environment variable first.
  368. GCS_SERVER_REQUEST_TIMEOUT_SECONDS = int(
  369. os.environ.get("RAY_gcs_server_request_timeout_seconds", "60")
  370. )
  371. # Used to separate lines when formatting the call stack where an ObjectRef was
  372. # created.
  373. CALL_STACK_LINE_DELIMITER = " | "
  374. # The default gRPC max message size is 4 MiB, we use a larger number of 512 MiB
  375. # NOTE: This is equal to the C++ limit of (RAY_CONFIG::max_grpc_message_size)
  376. GRPC_CPP_MAX_MESSAGE_SIZE = 512 * 1024 * 1024
  377. # The gRPC send & receive max length for "dashboard agent" server.
  378. # NOTE: This is equal to the C++ limit of RayConfig::max_grpc_message_size
  379. # and HAVE TO STAY IN SYNC with it (ie, meaning that both of these values
  380. # have to be set at the same time)
  381. AGENT_GRPC_MAX_MESSAGE_LENGTH = env_integer(
  382. "AGENT_GRPC_MAX_MESSAGE_LENGTH", 20 * 1024 * 1024 # 20MB
  383. )
  384. # GRPC options
  385. GRPC_ENABLE_HTTP_PROXY = (
  386. 1
  387. if os.environ.get("RAY_grpc_enable_http_proxy", "0").lower() in ("1", "true")
  388. else 0
  389. )
  390. GLOBAL_GRPC_OPTIONS = (("grpc.enable_http_proxy", GRPC_ENABLE_HTTP_PROXY),)
  391. # Internal kv namespaces
  392. KV_NAMESPACE_DASHBOARD = b"dashboard"
  393. KV_NAMESPACE_SESSION = b"session"
  394. KV_NAMESPACE_TRACING = b"tracing"
  395. KV_NAMESPACE_PDB = b"ray_pdb"
  396. KV_NAMESPACE_HEALTHCHECK = b"healthcheck"
  397. KV_NAMESPACE_JOB = b"job"
  398. KV_NAMESPACE_CLUSTER = b"cluster"
  399. KV_HEAD_NODE_ID_KEY = b"head_node_id"
  400. # TODO: Set package for runtime env
  401. # We need to update ray client for this since runtime env use ray client
  402. # This might introduce some compatibility issues so leave it here for now.
  403. KV_NAMESPACE_PACKAGE = None
  404. KV_NAMESPACE_FUNCTION_TABLE = b"fun"
  405. LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]
  406. NEURON_CORES = "neuron_cores"
  407. GPU = "GPU"
  408. TPU = "TPU"
  409. NPU = "NPU"
  410. HPU = "HPU"
  411. RAY_WORKER_NICENESS = "RAY_worker_niceness"
  412. # Default max_retries option in @ray.remote for non-actor
  413. # tasks.
  414. DEFAULT_TASK_MAX_RETRIES = 3
  415. # Default max_concurrency option in @ray.remote for threaded actors.
  416. DEFAULT_MAX_CONCURRENCY_THREADED = 1
  417. # Ray internal flags. These flags should not be set by users, and we strip them on job
  418. # submission.
  419. # This should be consistent with src/ray/common/ray_internal_flag_def.h
  420. RAY_INTERNAL_FLAGS = [
  421. "RAY_JOB_ID",
  422. "RAY_RAYLET_PID",
  423. "RAY_OVERRIDE_NODE_ID_FOR_TESTING",
  424. ]
  425. DEFAULT_RESOURCES = {"CPU", "GPU", "memory", "object_store_memory"}
  426. # Supported Python versions for runtime env's "conda" field. Ray downloads
  427. # Ray wheels into the conda environment, so the Ray wheels for these Python
  428. # versions must be available online.
  429. RUNTIME_ENV_CONDA_PY_VERSIONS = [(3, 9), (3, 10), (3, 11), (3, 12)]
  430. # Whether to enable Ray clusters (in addition to local Ray).
  431. # Ray clusters are not explicitly supported for Windows and OSX.
  432. IS_WINDOWS_OR_OSX = sys.platform == "darwin" or sys.platform == "win32"
  433. ENABLE_RAY_CLUSTERS_ENV_VAR = "RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER"
  434. ENABLE_RAY_CLUSTER = env_bool(
  435. ENABLE_RAY_CLUSTERS_ENV_VAR,
  436. not IS_WINDOWS_OR_OSX,
  437. )
  438. SESSION_LATEST = "session_latest"
  439. NUM_PORT_RETRIES = 40
  440. NUM_REDIS_GET_RETRIES = int(os.environ.get("RAY_NUM_REDIS_GET_RETRIES", "20"))
  441. # Turn this on if actor task log's offsets are expected to be recorded.
  442. # With this enabled, actor tasks' log could be queried with task id.
  443. RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING = env_bool(
  444. "RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING", False
  445. )
  446. # RuntimeEnv env var to indicate it exports a function
  447. WORKER_PROCESS_SETUP_HOOK_ENV_VAR = "__RAY_WORKER_PROCESS_SETUP_HOOK_ENV_VAR"
  448. RAY_WORKER_PROCESS_SETUP_HOOK_LOAD_TIMEOUT_ENV_VAR = (
  449. "RAY_WORKER_PROCESS_SETUP_HOOK_LOAD_TIMEOUT" # noqa
  450. )
  451. RAY_DEFAULT_LABEL_KEYS_PREFIX = "ray.io/"
  452. RAY_TPU_MAX_CONCURRENT_CONNECTIONS_ENV_VAR = "RAY_TPU_MAX_CONCURRENT_ACTIVE_CONNECTIONS"
  453. RAY_NODE_IP_FILENAME = "node_ip_address.json"
  454. RAY_LOGGING_CONFIG_ENCODING = os.environ.get("RAY_LOGGING_CONFIG_ENCODING")
  455. RAY_BACKEND_LOG_JSON_ENV_VAR = "RAY_BACKEND_LOG_JSON"
  456. # Write export API event of all resource types to file if enabled.
  457. # RAY_enable_export_api_write_config will not be considered if
  458. # this is enabled.
  459. RAY_ENABLE_EXPORT_API_WRITE = env_bool("RAY_enable_export_api_write", False)
  460. # Comma separated string containing individual resource
  461. # to write export API events for. This configuration is only used if
  462. # RAY_enable_export_api_write is not enabled. Full list of valid
  463. # resource types in ExportEvent.SourceType enum in
  464. # src/ray/protobuf/export_api/export_event.proto
  465. # Example config:
  466. # `export RAY_enable_export_api_write_config='EXPORT_SUBMISSION_JOB,EXPORT_ACTOR'`
  467. RAY_ENABLE_EXPORT_API_WRITE_CONFIG_STR = os.environ.get(
  468. "RAY_enable_export_api_write_config", ""
  469. )
  470. RAY_ENABLE_EXPORT_API_WRITE_CONFIG = RAY_ENABLE_EXPORT_API_WRITE_CONFIG_STR.split(",")
  471. RAY_EXPORT_EVENT_MAX_FILE_SIZE_BYTES = env_bool(
  472. "RAY_EXPORT_EVENT_MAX_FILE_SIZE_BYTES", 100 * 1e6
  473. )
  474. RAY_EXPORT_EVENT_MAX_BACKUP_COUNT = env_bool("RAY_EXPORT_EVENT_MAX_BACKUP_COUNT", 20)
  475. # If this flag is set and you run the driver with `uv run`, Ray propagates the `uv run`
  476. # environment to all workers. Ray does this by setting the `py_executable` to the
  477. # `uv run`` command line and by propagating the working directory
  478. # via the `working_dir` plugin so uv finds the pyproject.toml.
  479. # If you enable RAY_ENABLE_UV_RUN_RUNTIME_ENV AND you run the driver
  480. # with `uv run`, Ray deactivates the regular RAY_RUNTIME_ENV_HOOK
  481. # because in most cases the hooks wouldn't work unless you specifically make the code
  482. # for the runtime env hook available in your uv environment and make sure your hook
  483. # is compatible with your uv runtime environment. If you want to combine a custom
  484. # RAY_RUNTIME_ENV_HOOK with `uv run`, you should flag off RAY_ENABLE_UV_RUN_RUNTIME_ENV
  485. # and call ray._private.runtime_env.uv_runtime_env_hook.hook manually in your hook or
  486. # manually set the py_executable in your runtime environment hook.
  487. RAY_ENABLE_UV_RUN_RUNTIME_ENV = env_bool("RAY_ENABLE_UV_RUN_RUNTIME_ENV", True)
  488. # Prometheus metric cardinality level setting, either "legacy" or "recommended".
  489. #
  490. # Legacy: report all metrics to prometheus with the set of labels that are reported by
  491. # the component, including WorkerId, (task or actor) Name, etc. This is the default.
  492. # Recommended: report only the node level metrics to prometheus. This means that the
  493. # WorkerId will be removed from all metrics.
  494. # Low: Same as recommended, but also drop the Name label for tasks and actors.
  495. RAY_METRIC_CARDINALITY_LEVEL = os.environ.get(
  496. "RAY_metric_cardinality_level", "recommended"
  497. )
  498. # Whether enable OpenTelemetry as the metrics collection backend. The default is
  499. # using OpenCensus.
  500. RAY_ENABLE_OPEN_TELEMETRY = env_bool("RAY_enable_open_telemetry", True)
  501. # How long to wait for a fetch for an RDT object to complete during ray.get before timing out and raising an exception to the user.
  502. #
  503. # NOTE: This is a tenth of `RayConfig::fetch_fail_timeout_milliseconds` by default as RDT transfers are expected to be much faster.
  504. RDT_FETCH_FAIL_TIMEOUT_SECONDS = (
  505. env_integer("RAY_rdt_fetch_fail_timeout_milliseconds", 60000) / 1000
  506. )
  507. # Whether to enable zero-copy serialization for PyTorch tensors.
  508. # When enabled, Ray serializes PyTorch tensors by converting them to NumPy arrays
  509. # and leveraging pickle5's zero-copy buffer sharing. This avoids copying the
  510. # underlying tensor data, which can improve performance when passing large tensors
  511. # across tasks or actors. Note that this is experimental and should be used with caution
  512. # as we won't copy and allow a write to shared memory. One process changing a tensor
  513. # after ray.get could be reflected in another process.
  514. #
  515. # This feature is experimental and works best under the following conditions:
  516. # - The tensor has `requires_grad=False` (i.e., is detached from the autograd graph).
  517. # - The tensor is contiguous in memory
  518. # - Performance benefits from this are larger if the tensor resides in CPU memory
  519. # - You are not using Ray Direct Transport
  520. #
  521. # Tensors on GPU or non-contiguous tensors are still supported: Ray will
  522. # automatically move them to CPU and/or make them contiguous as needed.
  523. # While this incurs an initial copy, subsequent serialization may still benefit
  524. # from reduced overhead compared to the default path.
  525. #
  526. # Use with caution and ensure tensors meet the above criteria before enabling.
  527. # Default: False.
  528. RAY_ENABLE_ZERO_COPY_TORCH_TENSORS = env_bool(
  529. "RAY_ENABLE_ZERO_COPY_TORCH_TENSORS", False
  530. )