# constants.py (3.4 KB)

import os
  2. ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT = int(
  3. os.getenv("RAYLLM_ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT", "1")
  4. )
  5. # Timeout before download in multiplex deployment fails. <=0 means no timeout.
  6. DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S = float(
  7. os.getenv("DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S", "30")
  8. )
  9. if DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S <= 0:
  10. DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S = None
  11. # Number of retries for downloading a model in multiplex deployment.
  12. DEFAULT_MULTIPLEX_DOWNLOAD_TRIES = int(
  13. os.getenv("DEFAULT_MULTIPLEX_DOWNLOAD_RETRIES", "3")
  14. )
  15. # If true, a default runtime_env will be injected to import rayllm on worker startup.
  16. # This is a startup time optimization to avoid the latency penalty of sequentially
  17. # importing rayllm in multiple layers of worker processes.
  18. ENABLE_WORKER_PROCESS_SETUP_HOOK = (
  19. os.environ.get("RAYLLM_ENABLE_WORKER_PROCESS_SETUP_HOOK", "1") == "1"
  20. )
  21. CLOUD_OBJECT_MISSING_EXPIRE_S = 30
  22. CLOUD_OBJECT_EXISTS_EXPIRE_S = 60 * 60
  23. # Sentinel object used to indicate that a LoRA adapter config file is missing.
  24. LORA_ADAPTER_CONFIG_NAME = "adapter_config.json"
  25. DEFAULT_HEALTH_CHECK_PERIOD_S = int(
  26. os.getenv("RAY_SERVE_LLM_DEFAULT_HEALTH_CHECK_PERIOD_S", "10")
  27. )
  28. DEFAULT_HEALTH_CHECK_TIMEOUT_S = int(
  29. os.getenv("RAY_SERVE_LLM_DEFAULT_HEALTH_CHECK_TIMEOUT_S", "10")
  30. )
  31. DEFAULT_MAX_ONGOING_REQUESTS = int(
  32. os.getenv("RAY_SERVE_LLM_DEFAULT_MAX_ONGOING_REQUESTS", str(int(1e9)))
  33. )
  34. DEFAULT_MAX_REPLICAS = int(os.getenv("RAY_SERVE_LLM_DEFAULT_MAX_REPLICAS", "10"))
  35. DEFAULT_MAX_TARGET_ONGOING_REQUESTS = int(
  36. os.getenv("RAY_SERVE_LLM_DEFAULT_MAX_TARGET_ONGOING_REQUESTS", str(int(1e9)))
  37. )
  38. ENGINE_START_TIMEOUT_S = int(os.getenv("RAYLLM_ENGINE_START_TIMEOUT_S", str(60 * 60)))
  39. MIN_NUM_TOPLOGPROBS_ALLOWED = 0
  40. MAX_NUM_TOPLOGPROBS_ALLOWED = 5
  41. MODEL_RESPONSE_BATCH_TIMEOUT_MS = float(
  42. os.getenv("RAYLLM_MODEL_RESPONSE_BATCH_TIMEOUT_MS", "50")
  43. )
  44. RAYLLM_ENABLE_REQUEST_PROMPT_LOGS = (
  45. os.environ.get("RAYLLM_ENABLE_REQUEST_PROMPT_LOGS", "1") == "1"
  46. )
  47. RAYLLM_GUIDED_DECODING_BACKEND = os.environ.get(
  48. "RAYLLM_GUIDED_DECODING_BACKEND", "xgrammar"
  49. )
  50. MAX_NUM_STOPPING_SEQUENCES = int(os.getenv("RAYLLM_MAX_NUM_STOPPING_SEQUENCES", "8"))
  51. ENV_VARS_TO_PROPAGATE = {
  52. "HUGGING_FACE_HUB_TOKEN",
  53. "HF_TOKEN",
  54. }
  55. # timeout in 10 minutes. Streaming can take longer than 3 min
  56. DEFAULT_LLM_ROUTER_HTTP_TIMEOUT = float(
  57. os.environ.get("RAY_SERVE_LLM_ROUTER_HTTP_TIMEOUT", 600)
  58. )
  59. ENABLE_VERBOSE_TELEMETRY = bool(int(os.getenv("RAYLLM_ENABLE_VERBOSE_TELEMETRY", "0")))
  60. RAYLLM_VLLM_ENGINE_CLS_ENV = "RAYLLM_VLLM_ENGINE_CLS"
  61. # The ratio of number of router replicas to number of model replicas.
  62. # Default to 2 meaning that there are 2 router replicas for every model replica.
  63. DEFAULT_ROUTER_TO_MODEL_REPLICA_RATIO = float(
  64. os.getenv("RAY_SERVE_LLM_ROUTER_TO_MODEL_REPLICA_RATIO", "2")
  65. )
  66. DEFAULT_LLM_ROUTER_MIN_REPLICAS = int(
  67. os.environ.get("RAY_SERVE_LLM_ROUTER_MIN_REPLICAS", 2)
  68. )
  69. DEFAULT_LLM_ROUTER_INITIAL_REPLICAS = int(
  70. os.environ.get("RAY_SERVE_LLM_ROUTER_INITIAL_REPLICAS", 2)
  71. )
  72. DEFAULT_LLM_ROUTER_MAX_REPLICAS = int(
  73. os.environ.get("RAY_SERVE_LLM_ROUTER_MAX_REPLICAS", 1000)
  74. )
  75. DEFAULT_LLM_ROUTER_TARGET_ONGOING_REQUESTS = int(
  76. os.environ.get(
  77. "RAY_SERVE_LLM_ROUTER_TARGET_ONGOING_REQUESTS",
  78. DEFAULT_MAX_TARGET_ONGOING_REQUESTS,
  79. )
  80. )
  81. # HOME DIR
  82. RAYLLM_HOME_DIR = os.environ.get("RAYLLM_HOME_DIR", os.path.expanduser("~/.ray/llm"))