constants.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659
  1. import os
  2. from typing import List
  3. from ray.serve._private.constants_utils import (
  4. get_env_bool,
  5. get_env_float,
  6. get_env_float_non_negative,
  7. get_env_float_positive,
  8. get_env_int,
  9. get_env_int_non_negative,
  10. get_env_int_positive,
  11. get_env_str,
  12. parse_latency_buckets,
  13. str_to_list,
  14. )
  15. #: Logger used by serve components
  16. SERVE_LOGGER_NAME = "ray.serve"
  17. #: Actor name used to register controller
  18. SERVE_CONTROLLER_NAME = "SERVE_CONTROLLER_ACTOR"
  19. #: Actor name used to register HTTP proxy actor
  20. SERVE_PROXY_NAME = "SERVE_PROXY_ACTOR"
  21. #: Ray namespace used for all Serve actors
  22. SERVE_NAMESPACE = "serve"
  23. #: HTTP Host
  24. DEFAULT_HTTP_HOST = "127.0.0.1"
  25. #: HTTP Port
  26. DEFAULT_HTTP_PORT = 8000
  27. #: Uvicorn timeout_keep_alive Config
  28. DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S = 90
  29. #: gRPC Port
  30. DEFAULT_GRPC_PORT = 9000
  31. #: Default Serve application name
  32. SERVE_DEFAULT_APP_NAME = "default"
  33. #: Max concurrency
  34. ASYNC_CONCURRENCY = int(1e6)
# How long to sleep between control loop cycles on the controller.
# Configured through RAY_SERVE_CONTROL_LOOP_INTERVAL_S (default 0.1).
CONTROL_LOOP_INTERVAL_S = get_env_float_non_negative(
    "RAY_SERVE_CONTROL_LOOP_INTERVAL_S", 0.1
)

#: Max time to wait for HTTP proxy in `serve.start()`.
HTTP_PROXY_TIMEOUT = 60

# Max retry on deployment constructor is
# min(num_replicas * MAX_PER_REPLICA_RETRY_COUNT, max_constructor_retry_count)
# Configured through RAY_SERVE_MAX_PER_REPLICA_RETRY_COUNT (default 3).
MAX_PER_REPLICA_RETRY_COUNT = get_env_int("RAY_SERVE_MAX_PER_REPLICA_RETRY_COUNT", 3)
  44. # If you are wondering why we are using histogram buckets, please refer to
  45. # https://prometheus.io/docs/practices/histograms/
  46. # short answer is that its cheaper to calculate percentiles on the histogram
  47. # than to calculate them on raw data, both in terms of time and space.
  48. #: Default histogram buckets for latency tracker.
  49. DEFAULT_LATENCY_BUCKET_MS = [
  50. 1,
  51. 2,
  52. 5,
  53. 10,
  54. 20,
  55. 50,
  56. 100,
  57. 200,
  58. 300,
  59. 400,
  60. 500,
  61. 1000,
  62. 2000,
  63. # 5 seconds
  64. 5000,
  65. # 10 seconds
  66. 10000,
  67. # 60 seconds
  68. 60000,
  69. # 2min
  70. 120000,
  71. # 5 min
  72. 300000,
  73. # 10 min
  74. 600000,
  75. ]
# Example usage:
# RAY_SERVE_REQUEST_LATENCY_BUCKETS_MS="1,2,3,4"
# RAY_SERVE_MODEL_LOAD_LATENCY_BUCKETS_MS="1,2,3,4"
#
# For each setting, the un-prefixed variable name (REQUEST_LATENCY_BUCKETS_MS /
# MODEL_LOAD_LATENCY_BUCKETS_MS) is passed as the default of the
# RAY_SERVE_-prefixed lookup. DEFAULT_LATENCY_BUCKET_MS is handed to
# parse_latency_buckets as its second argument.

#: Histogram buckets for request latency.
REQUEST_LATENCY_BUCKETS_MS = parse_latency_buckets(
    get_env_str(
        "RAY_SERVE_REQUEST_LATENCY_BUCKETS_MS",
        get_env_str("REQUEST_LATENCY_BUCKETS_MS", ""),
    ),
    DEFAULT_LATENCY_BUCKET_MS,
)

#: Histogram buckets for model load/unload latency.
MODEL_LOAD_LATENCY_BUCKETS_MS = parse_latency_buckets(
    get_env_str(
        "RAY_SERVE_MODEL_LOAD_LATENCY_BUCKETS_MS",
        get_env_str("MODEL_LOAD_LATENCY_BUCKETS_MS", ""),
    ),
    DEFAULT_LATENCY_BUCKET_MS,
)
  95. #: Histogram buckets for replica startup and reconfigure latency.
  96. #: These are longer operations (constructor, model loading) so buckets start higher.
  97. DEFAULT_REPLICA_STARTUP_SHUTDOWN_LATENCY_BUCKETS_MS = [
  98. 5,
  99. 20,
  100. 50,
  101. 100,
  102. 250,
  103. 500,
  104. 1000,
  105. 2000,
  106. 5000,
  107. 10000,
  108. 20000,
  109. 30000,
  110. 60000,
  111. 120000,
  112. 240000,
  113. ]
#: Histogram buckets built from
#: RAY_SERVE_REPLICA_STARTUP_SHUTDOWN_LATENCY_BUCKETS_MS, with
#: DEFAULT_REPLICA_STARTUP_SHUTDOWN_LATENCY_BUCKETS_MS passed as the default.
REPLICA_STARTUP_SHUTDOWN_LATENCY_BUCKETS_MS = parse_latency_buckets(
    get_env_str("RAY_SERVE_REPLICA_STARTUP_SHUTDOWN_LATENCY_BUCKETS_MS", ""),
    DEFAULT_REPLICA_STARTUP_SHUTDOWN_LATENCY_BUCKETS_MS,
)

#: Histogram buckets for batch execution time in milliseconds.
#: Same object as REQUEST_LATENCY_BUCKETS_MS.
BATCH_EXECUTION_TIME_BUCKETS_MS = REQUEST_LATENCY_BUCKETS_MS

#: Histogram buckets for batch wait time in milliseconds.
#: Same object as REQUEST_LATENCY_BUCKETS_MS.
BATCH_WAIT_TIME_BUCKETS_MS = REQUEST_LATENCY_BUCKETS_MS
  122. #: Histogram buckets for batch utilization percentage.
  123. DEFAULT_BATCH_UTILIZATION_BUCKETS_PERCENT = [
  124. 5,
  125. 10,
  126. 20,
  127. 30,
  128. 40,
  129. 50,
  130. 60,
  131. 70,
  132. 80,
  133. 90,
  134. 95,
  135. 99,
  136. 100,
  137. ]
#: Histogram buckets for batch utilization percentage, built from
#: RAY_SERVE_BATCH_UTILIZATION_BUCKETS_PERCENT with
#: DEFAULT_BATCH_UTILIZATION_BUCKETS_PERCENT passed as the default.
# Note: parse_latency_buckets is reused here although the values are
# percentages rather than latencies.
BATCH_UTILIZATION_BUCKETS_PERCENT = parse_latency_buckets(
    get_env_str(
        "RAY_SERVE_BATCH_UTILIZATION_BUCKETS_PERCENT",
        "",
    ),
    DEFAULT_BATCH_UTILIZATION_BUCKETS_PERCENT,
)
  145. #: Histogram buckets for actual batch size.
  146. DEFAULT_BATCH_SIZE_BUCKETS = [
  147. 1,
  148. 2,
  149. 4,
  150. 8,
  151. 16,
  152. 32,
  153. 64,
  154. 128,
  155. 256,
  156. 512,
  157. 1024,
  158. ]
#: Histogram buckets for actual batch size, built from
#: RAY_SERVE_BATCH_SIZE_BUCKETS with DEFAULT_BATCH_SIZE_BUCKETS passed as
#: the default.
# Note: parse_latency_buckets is reused here although the values are batch
# sizes rather than latencies.
BATCH_SIZE_BUCKETS = parse_latency_buckets(
    get_env_str(
        "RAY_SERVE_BATCH_SIZE_BUCKETS",
        "",
    ),
    DEFAULT_BATCH_SIZE_BUCKETS,
)
  166. #: Name of deployment health check method implemented by user.
  167. HEALTH_CHECK_METHOD = "check_health"
  168. #: Name of deployment reconfiguration method implemented by user.
  169. RECONFIGURE_METHOD = "reconfigure"
#: Limit the number of cached handles because each handle has long poll
#: overhead. See https://github.com/ray-project/ray/issues/18980
#: Configured through RAY_SERVE_MAX_CACHED_HANDLES (default 100).
MAX_CACHED_HANDLES = get_env_int_positive("RAY_SERVE_MAX_CACHED_HANDLES", 100)

#: Because ServeController will accept one long poll request per handle, its
#: concurrency needs to scale as O(num_handles).
#: Configured through RAY_SERVE_CONTROLLER_MAX_CONCURRENCY (default 15,000).
CONTROLLER_MAX_CONCURRENCY = get_env_int_positive(
    "RAY_SERVE_CONTROLLER_MAX_CONCURRENCY", 15_000
)
  178. DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_S = 20
  179. DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S = 2
  180. DEFAULT_HEALTH_CHECK_PERIOD_S = 10
  181. DEFAULT_HEALTH_CHECK_TIMEOUT_S = 30
  182. DEFAULT_MAX_ONGOING_REQUESTS = 5
  183. DEFAULT_TARGET_ONGOING_REQUESTS = 2
  184. DEFAULT_CONSUMER_CONCURRENCY = DEFAULT_MAX_ONGOING_REQUESTS
  185. DEFAULT_CONSTRUCTOR_RETRY_COUNT = 20
# HTTP Proxy health check configs. Each value is read from the environment
# variable named in the call, with the number given as its default.
PROXY_HEALTH_CHECK_TIMEOUT_S = get_env_float_positive(
    "RAY_SERVE_PROXY_HEALTH_CHECK_TIMEOUT_S", 10.0
)
PROXY_HEALTH_CHECK_PERIOD_S = get_env_float_positive(
    "RAY_SERVE_PROXY_HEALTH_CHECK_PERIOD_S", 10.0
)
PROXY_READY_CHECK_TIMEOUT_S = get_env_float_positive(
    "RAY_SERVE_PROXY_READY_CHECK_TIMEOUT_S", 5.0
)

# Number of times in a row that an HTTP proxy must fail the health check
# before being marked unhealthy.
PROXY_HEALTH_CHECK_UNHEALTHY_THRESHOLD = 3

# The minimum drain period for an HTTP proxy.
PROXY_MIN_DRAINING_PERIOD_S = get_env_float_positive(
    "RAY_SERVE_PROXY_MIN_DRAINING_PERIOD_S", 30.0
)

# The time in seconds that the HTTP proxy state waits before
# rechecking whether the proxy actor is drained or not.
PROXY_DRAIN_CHECK_PERIOD_S = 5
  206. #: Number of times in a row that a replica must fail the health check before
  207. #: being marked unhealthy.
  208. REPLICA_HEALTH_CHECK_UNHEALTHY_THRESHOLD = 3
  209. # The time in seconds that the Serve client waits before rechecking deployment state
  210. CLIENT_POLLING_INTERVAL_S = 1.0
  211. # The time in seconds that the Serve client waits before checking if
  212. # deployment has been created
  213. CLIENT_CHECK_CREATION_POLLING_INTERVAL_S = 0.1
# Timeout for GCS internal KV service.
# Read from RAY_SERVE_KV_TIMEOUT_S; None is passed as the default.
RAY_SERVE_KV_TIMEOUT_S = get_env_float_positive("RAY_SERVE_KV_TIMEOUT_S", None)
  216. # Timeout for GCS RPC request
  217. RAY_GCS_RPC_TIMEOUT_S = 3.0
  218. # Maximum duration to wait until broadcasting a long poll update if there are
  219. # still replicas in the RECOVERING state.
  220. RECOVERING_LONG_POLL_BROADCAST_TIMEOUT_S = 10.0
  221. # Minimum duration to wait until broadcasting model IDs.
  222. PUSH_MULTIPLEXED_MODEL_IDS_INTERVAL_S = 0.1
  223. # Deprecation message for V1 migrations.
  224. MIGRATION_MESSAGE = (
  225. "See https://docs.ray.io/en/latest/serve/index.html for more information."
  226. )
  227. # Environment variable name for to specify the encoding of the log messages
  228. RAY_SERVE_LOG_ENCODING = "TEXT"
# Setting RAY_SERVE_LOG_TO_STDERR=0 disables logging to stdout and stderr
# and redirects both to Serve's log files. The default passed here is "1".
RAY_SERVE_LOG_TO_STDERR = get_env_bool("RAY_SERVE_LOG_TO_STDERR", "1")
  232. # Logging format attributes
  233. SERVE_LOG_REQUEST_ID = "request_id"
  234. SERVE_LOG_ROUTE = "route"
  235. SERVE_LOG_APPLICATION = "application"
  236. SERVE_LOG_DEPLOYMENT = "deployment"
  237. SERVE_LOG_REPLICA = "replica"
  238. SERVE_LOG_COMPONENT = "component_name"
  239. SERVE_LOG_COMPONENT_ID = "component_id"
  240. SERVE_LOG_MESSAGE = "message"
  241. # This is a reserved for python logging module attribute, it should not be changed.
  242. SERVE_LOG_LEVEL_NAME = "levelname"
  243. SERVE_LOG_TIME = "asctime"
  244. # Logging format with record key to format string dict
  245. SERVE_LOG_RECORD_FORMAT = {
  246. SERVE_LOG_REQUEST_ID: "%(request_id)s",
  247. SERVE_LOG_APPLICATION: "%(application)s",
  248. SERVE_LOG_MESSAGE: "-- %(message)s",
  249. SERVE_LOG_LEVEL_NAME: "%(levelname)s",
  250. SERVE_LOG_TIME: "%(asctime)s",
  251. }
  252. # There are some attributes that we only use internally or don't provide values to the
  253. # users. Adding to this set will remove them from structured logs.
  254. SERVE_LOG_UNWANTED_ATTRS = {
  255. "serve_access_log",
  256. "task_id",
  257. "job_id",
  258. "skip_context_filter",
  259. }
# HTTP keep-alive timeout in seconds, read from
# RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S (default 0).
RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S = get_env_int_non_negative(
    "RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S", 0
)
  263. RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S = 0.0
  264. SERVE_LOG_EXTRA_FIELDS = "ray_serve_extra_fields"
  265. # Serve HTTP request header key for routing requests.
  266. SERVE_MULTIPLEXED_MODEL_ID = "serve_multiplexed_model_id"
  267. # HTTP request ID
  268. SERVE_HTTP_REQUEST_ID_HEADER = "x-request-id"
# Feature flag to turn on node locality routing for proxies. On by default.
RAY_SERVE_PROXY_PREFER_LOCAL_NODE_ROUTING = get_env_bool(
    "RAY_SERVE_PROXY_PREFER_LOCAL_NODE_ROUTING", "1"
)

# Feature flag to turn on AZ locality routing for proxies. On by default.
RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING = get_env_bool(
    "RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING", "1"
)

# Serve HTTP proxy callback import path. None is passed as the default.
RAY_SERVE_HTTP_PROXY_CALLBACK_IMPORT_PATH = get_env_str(
    "RAY_SERVE_HTTP_PROXY_CALLBACK_IMPORT_PATH", None
)

# Serve controller callback import path. None is passed as the default.
RAY_SERVE_CONTROLLER_CALLBACK_IMPORT_PATH = get_env_str(
    "RAY_SERVE_CONTROLLER_CALLBACK_IMPORT_PATH", None
)
# Maximum timeout allowed for record_autoscaling_stats to run.
RAY_SERVE_RECORD_AUTOSCALING_STATS_TIMEOUT_S = get_env_float(
    "RAY_SERVE_RECORD_AUTOSCALING_STATS_TIMEOUT_S", 10.0
)

# How often autoscaling metrics are recorded on Serve replicas.
RAY_SERVE_REPLICA_AUTOSCALING_METRIC_RECORD_INTERVAL_S = get_env_float(
    "RAY_SERVE_REPLICA_AUTOSCALING_METRIC_RECORD_INTERVAL_S", 0.5
)

# Replica autoscaling metrics push interval.
RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S = get_env_float(
    "RAY_SERVE_REPLICA_AUTOSCALING_METRIC_PUSH_INTERVAL_S", 10.0
)

# How often autoscaling metrics are recorded on Serve handles.
RAY_SERVE_HANDLE_AUTOSCALING_METRIC_RECORD_INTERVAL_S = get_env_float(
    "RAY_SERVE_HANDLE_AUTOSCALING_METRIC_RECORD_INTERVAL_S", 0.5
)

# Handle autoscaling metrics push interval.
# (This interval affects the cold start time period.)
RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S = get_env_float(
    "RAY_SERVE_HANDLE_AUTOSCALING_METRIC_PUSH_INTERVAL_S",
    10.0,
)
# Serve multiplexed matching timeout.
# This is the timeout, in seconds, for the matching process of multiplexed
# requests. To avoid a thundering herd problem, the timeout is randomized
# between this value and twice this value.
# If the matching process takes longer than the timeout, the request falls
# back to the default routing strategy.
RAY_SERVE_MULTIPLEXED_MODEL_ID_MATCHING_TIMEOUT_S = get_env_float_non_negative(
    "RAY_SERVE_MULTIPLEXED_MODEL_ID_MATCHING_TIMEOUT_S", 1.0
)

# Enable memray in all Serve actors. The default passed here is "0".
RAY_SERVE_ENABLE_MEMORY_PROFILING = get_env_bool(
    "RAY_SERVE_ENABLE_MEMORY_PROFILING", "0"
)
  319. # Max value allowed for max_replicas_per_node option.
  320. # TODO(jjyao) the <= 100 limitation is an artificial one
  321. # and is due to the fact that Ray core only supports resource
  322. # precision up to 0.0001.
  323. # This limitation should be lifted in the long term.
  324. MAX_REPLICAS_PER_NODE_MAX_VALUE = 100
  325. # Argument name for passing in the gRPC context into a replica.
  326. GRPC_CONTEXT_ARG_NAME = "grpc_context"
# Whether or not to forcefully kill replicas that fail health checks.
# The default passed here is "0".
RAY_SERVE_FORCE_STOP_UNHEALTHY_REPLICAS = get_env_bool(
    "RAY_SERVE_FORCE_STOP_UNHEALTHY_REPLICAS", "0"
)

# Initial deadline for queue length responses in the router.
RAY_SERVE_QUEUE_LENGTH_RESPONSE_DEADLINE_S = get_env_float(
    "RAY_SERVE_QUEUE_LENGTH_RESPONSE_DEADLINE_S", 0.1
)

# Maximum deadline for queue length responses in the router (in backoff).
RAY_SERVE_MAX_QUEUE_LENGTH_RESPONSE_DEADLINE_S = get_env_float(
    "RAY_SERVE_MAX_QUEUE_LENGTH_RESPONSE_DEADLINE_S", 1.0
)

# Length of time to respect entries in the queue length cache when routing
# requests.
RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S = get_env_float_non_negative(
    "RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S", 10.0
)

# Minimum interval between router queue length gauge updates per replica.
# Throttling reduces metrics overhead on the hot path. Set to 0 to disable
# throttling.
RAY_SERVE_ROUTER_QUEUE_LEN_GAUGE_THROTTLE_S = get_env_float_non_negative(
    "RAY_SERVE_ROUTER_QUEUE_LEN_GAUGE_THROTTLE_S", 0.1
)
# Backoff, in seconds, used when choosing a replica in the router fails.
# The backoff time is calculated as
# initial_backoff_s * backoff_multiplier ** attempt.
# The default backoff time is [0, 0.025, 0.05, 0.1, 0.2, 0.4, 0.5, 0.5 ... ].
RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S = get_env_float(
    "RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S", 0.025
)
# Read with get_env_int, so the multiplier is an integer.
RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER = get_env_int(
    "RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER", 2
)
RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S = get_env_float(
    "RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S", 0.5
)
  360. # The default autoscaling policy to use if none is specified.
  361. DEFAULT_AUTOSCALING_POLICY_NAME = (
  362. "ray.serve.autoscaling_policy:default_autoscaling_policy"
  363. )
# Feature flag to enable collecting all queued and ongoing request
# metrics at handles instead of replicas. ON by default.
RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE = get_env_bool(
    "RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE", "1"
)

# Read from RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S (default 10.0).
# NOTE(review): presumably a lower bound, in seconds, on the handle metrics
# timeout; confirm against the code that consumes it.
RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S = get_env_float_non_negative(
    "RAY_SERVE_MIN_HANDLE_METRICS_TIMEOUT_S", 10.0
)
# Default is 2 GiB - 1 (2**31 - 1), the max for a signed 32-bit int.
RAY_SERVE_GRPC_MAX_MESSAGE_SIZE = get_env_int(
    "RAY_SERVE_GRPC_MAX_MESSAGE_SIZE", (2 * 1024 * 1024 * 1024) - 1
)

RAY_SERVE_REPLICA_GRPC_MAX_MESSAGE_LENGTH = get_env_int(
    # Default max message length in gRPC is 4MB, we keep that default
    "RAY_SERVE_REPLICA_GRPC_MAX_MESSAGE_LENGTH",
    4 * 1024 * 1024,
)

# Default options passed when constructing gRPC servers. Both the send and
# the receive limit use RAY_SERVE_GRPC_MAX_MESSAGE_SIZE.
DEFAULT_GRPC_SERVER_OPTIONS = [
    ("grpc.max_send_message_length", RAY_SERVE_GRPC_MAX_MESSAGE_SIZE),
    ("grpc.max_receive_message_length", RAY_SERVE_GRPC_MAX_MESSAGE_SIZE),
]
# Timeout for gracefully shutting down metrics pusher, e.g. in routers or
# replicas.
METRICS_PUSHER_GRACEFUL_SHUTDOWN_TIMEOUT_S = 10

# Feature flag to set `enable_task_events=True` on Serve-managed actors.
RAY_SERVE_ENABLE_TASK_EVENTS = get_env_bool("RAY_SERVE_ENABLE_TASK_EVENTS", "0")

# This is deprecated and will be removed in the future.
RAY_SERVE_USE_COMPACT_SCHEDULING_STRATEGY = get_env_bool(
    "RAY_SERVE_USE_COMPACT_SCHEDULING_STRATEGY", "0"
)

# Use pack instead of spread scheduling strategy.
# The raw value of the deprecated RAY_SERVE_USE_COMPACT_SCHEDULING_STRATEGY
# variable ("0" if unset) is passed as the default for this flag.
RAY_SERVE_USE_PACK_SCHEDULING_STRATEGY = get_env_bool(
    "RAY_SERVE_USE_PACK_SCHEDULING_STRATEGY",
    os.environ.get("RAY_SERVE_USE_COMPACT_SCHEDULING_STRATEGY", "0"),
)

# Comma-separated list of custom resources prioritized in scheduling.
# Sorted from highest to lowest priority.
# Example: "customx,customy"
RAY_SERVE_HIGH_PRIORITY_CUSTOM_RESOURCES: List[str] = str_to_list(
    get_env_str("RAY_SERVE_HIGH_PRIORITY_CUSTOM_RESOURCES", "")
)

# Feature flag to always override local_testing_mode to True in serve.run.
# This is used for internal testing to avoid passing the flag to every
# invocation.
RAY_SERVE_FORCE_LOCAL_TESTING_MODE = get_env_bool(
    "RAY_SERVE_FORCE_LOCAL_TESTING_MODE", "0"
)
# Feature flag to run sync methods defined in the replica in a thread pool.
# The default passed here is "0"; the warning below describes the planned
# change of default.
RAY_SERVE_RUN_SYNC_IN_THREADPOOL = get_env_bool("RAY_SERVE_RUN_SYNC_IN_THREADPOOL", "0")

# Warning template; `{method_name}` is a format placeholder to be filled in
# by the caller.
RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING = (
    "Calling sync method '{method_name}' directly on the "
    "asyncio loop. In a future version, sync methods will be run in a "
    "threadpool by default. Ensure your sync methods are thread safe "
    "or keep the existing behavior by making them `async def`. Opt "
    "into the new behavior by setting "
    "RAY_SERVE_RUN_SYNC_IN_THREADPOOL=1."
)
# Feature flag to turn off GC optimizations in the proxy (in case there is a
# memory leak or negative performance impact). The default passed is "1".
RAY_SERVE_ENABLE_PROXY_GC_OPTIMIZATIONS = get_env_bool(
    "RAY_SERVE_ENABLE_PROXY_GC_OPTIMIZATIONS", "1"
)

# Used for gc.set_threshold() when proxy GC optimizations are enabled.
RAY_SERVE_PROXY_GC_THRESHOLD = get_env_int("RAY_SERVE_PROXY_GC_THRESHOLD", 700)

# Interval, in milliseconds, at which cached metrics will be exported using
# the Ray metric API. Set to `0` to disable caching entirely.
RAY_SERVE_METRICS_EXPORT_INTERVAL_MS = get_env_int(
    "RAY_SERVE_METRICS_EXPORT_INTERVAL_MS", 100
)
  431. # The default request router class to use if none is specified.
  432. DEFAULT_REQUEST_ROUTER_PATH = (
  433. "ray.serve._private.request_router:PowerOfTwoChoicesRequestRouter"
  434. )
  435. # The default request routing period to use if none is specified.
  436. DEFAULT_REQUEST_ROUTING_STATS_PERIOD_S = 10
  437. # The default request routing timeout to use if none is specified.
  438. DEFAULT_REQUEST_ROUTING_STATS_TIMEOUT_S = 30
  439. # Name of deployment request routing stats method implemented by user.
  440. REQUEST_ROUTING_STATS_METHOD = "record_routing_stats"
# By default, we run user code in a separate event loop.
# This flag can be set to 0 to run user code in the same event loop as the
# replica's main event loop.
RAY_SERVE_RUN_USER_CODE_IN_SEPARATE_THREAD = get_env_bool(
    "RAY_SERVE_RUN_USER_CODE_IN_SEPARATE_THREAD", "1"
)

# By default, we run the router in a separate event loop.
# This flag can be set to 0 to run the router in the same event loop as the
# replica's main event loop.
RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP = get_env_bool(
    "RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP", "1"
)
  453. # For now, this is used only for testing. In the suite of tests that
  454. # use gRPC to send requests, we flip this flag on.
  455. RAY_SERVE_USE_GRPC_BY_DEFAULT = (
  456. os.environ.get("RAY_SERVE_USE_GRPC_BY_DEFAULT", "0") == "1"
  457. )
  458. RAY_SERVE_PROXY_USE_GRPC = os.environ.get("RAY_SERVE_PROXY_USE_GRPC") == "1" or (
  459. not os.environ.get("RAY_SERVE_PROXY_USE_GRPC") == "0"
  460. and RAY_SERVE_USE_GRPC_BY_DEFAULT
  461. )
# The default buffer size for request path logs. Setting to 1 will ensure
# logs are flushed to file handler immediately, otherwise it will be buffered
# and flushed to file handler when the buffer is full or when there is a log
# line with level ERROR.
RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE = get_env_int(
    "RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE", 1
)

# Feature flag to fail the deployment if the rank is not set.
# TODO (abrar): Remove this flag after the feature is stable.
RAY_SERVE_FAIL_ON_RANK_ERROR = get_env_bool("RAY_SERVE_FAIL_ON_RANK_ERROR", "0")

# The message to return when the replica is healthy.
HEALTHY_MESSAGE = "success"
  474. # Feature flag to enable a limited form of direct ingress where ingress applications
  475. # listen on port 8000 (HTTP) and 9000 (gRPC). No proxies will be started.
  476. RAY_SERVE_ENABLE_DIRECT_INGRESS = (
  477. os.environ.get("RAY_SERVE_ENABLE_DIRECT_INGRESS", "0") == "1"
  478. )
  479. RAY_SERVE_DIRECT_INGRESS_MIN_HTTP_PORT = int(
  480. os.environ.get("RAY_SERVE_DIRECT_INGRESS_MIN_HTTP_PORT", "30000")
  481. )
  482. RAY_SERVE_DIRECT_INGRESS_MIN_GRPC_PORT = int(
  483. os.environ.get("RAY_SERVE_DIRECT_INGRESS_MIN_GRPC_PORT", "40000")
  484. )
  485. RAY_SERVE_DIRECT_INGRESS_MAX_HTTP_PORT = int(
  486. os.environ.get("RAY_SERVE_DIRECT_INGRESS_MAX_HTTP_PORT", "31000")
  487. )
  488. RAY_SERVE_DIRECT_INGRESS_MAX_GRPC_PORT = int(
  489. os.environ.get("RAY_SERVE_DIRECT_INGRESS_MAX_GRPC_PORT", "41000")
  490. )
  491. RAY_SERVE_DIRECT_INGRESS_PORT_RETRY_COUNT = int(
  492. os.environ.get("RAY_SERVE_DIRECT_INGRESS_PORT_RETRY_COUNT", "100")
  493. )
  494. # The minimum drain period for a HTTP proxy.
  495. # If RAY_SERVE_FORCE_STOP_UNHEALTHY_REPLICAS is set to 1,
  496. # then the minimum draining period is 0.
  497. RAY_SERVE_DIRECT_INGRESS_MIN_DRAINING_PERIOD_S = float(
  498. os.environ.get("RAY_SERVE_DIRECT_INGRESS_MIN_DRAINING_PERIOD_S", "30")
  499. )
  500. # HTTP request timeout
  501. SERVE_HTTP_REQUEST_TIMEOUT_S_HEADER = "x-request-timeout-seconds"
  502. # HTTP request disconnect disabled
  503. SERVE_HTTP_REQUEST_DISCONNECT_DISABLED_HEADER = "x-request-disconnect-disabled"
# If throughput optimized Ray Serve is enabled, set the following constants.
# This block reassigns constants defined earlier in this module, so it must
# run after those original definitions.
RAY_SERVE_THROUGHPUT_OPTIMIZED = get_env_bool("RAY_SERVE_THROUGHPUT_OPTIMIZED", "0")
if RAY_SERVE_THROUGHPUT_OPTIMIZED:
    # Each constant is read again from its own environment variable; only the
    # default passed to the helper differs from the original definition.
    # Default flips from "1" to "0".
    RAY_SERVE_RUN_USER_CODE_IN_SEPARATE_THREAD = get_env_bool(
        "RAY_SERVE_RUN_USER_CODE_IN_SEPARATE_THREAD", "0"
    )
    # Default grows from 1 to 1000.
    RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE = get_env_int(
        "RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE", 1000
    )
    # Default flips from "1" to "0".
    RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP = get_env_bool(
        "RAY_SERVE_RUN_ROUTER_IN_SEPARATE_LOOP", "0"
    )
    # Default flips from "1" to "0".
    RAY_SERVE_LOG_TO_STDERR = get_env_bool("RAY_SERVE_LOG_TO_STDERR", "0")
# The maximum allowed RPC latency in milliseconds.
# This is used to detect and warn about long RPC latencies
# between the controller and the replicas.
RAY_SERVE_RPC_LATENCY_WARNING_THRESHOLD_MS = 2000

# Feature flag to aggregate metrics at the controller instead of the replicas
# or handles. The default passed here is "0".
RAY_SERVE_AGGREGATE_METRICS_AT_CONTROLLER = get_env_bool(
    "RAY_SERVE_AGGREGATE_METRICS_AT_CONTROLLER", "0"
)

# Key for the decision counters in default autoscaling policy state.
SERVE_AUTOSCALING_DECISION_COUNTERS_KEY = "__decision_counters"
# Event loop monitoring interval in seconds.
# This is how often the event loop lag is measured.
# Read from RAY_SERVE_EVENT_LOOP_MONITORING_INTERVAL_S (default 5.0).
RAY_SERVE_EVENT_LOOP_MONITORING_INTERVAL_S = get_env_float_positive(
    "RAY_SERVE_EVENT_LOOP_MONITORING_INTERVAL_S", 5.0
)
  533. # Histogram buckets for event loop scheduling latency in milliseconds.
  534. # These are tuned for detecting event loop blocking:
  535. # - < 10ms: healthy
  536. # - 10-50ms: acceptable under load
  537. # - 50-100ms: concerning, investigate
  538. # - 100-500ms: problematic, likely blocking code
  539. # - > 500ms: severe, definitely blocking
  540. # - > 5s: catastrophic
  541. SERVE_EVENT_LOOP_LATENCY_HISTOGRAM_BOUNDARIES_MS = [
  542. 1, # 1ms
  543. 5, # 5ms
  544. 10, # 10ms
  545. 25, # 25ms
  546. 50, # 50ms
  547. 100, # 100ms
  548. 250, # 250ms
  549. 500, # 500ms
  550. 1000, # 1s
  551. 2500, # 2.5s
  552. 5000, # 5s
  553. 10000, # 10s
  554. ]