| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
import os
# Integer flag read from RAYLLM_ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT
# (default 1). The value is parsed with int(), so the variable must hold an
# integer literal such as "0" or "1"; anything else raises ValueError at import
# time. The constant stays an int (not a bool) to keep the existing interface.
# NOTE(review): presumably gates whether a deployment may create new placement
# groups -- confirm against the call sites.
ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT = int(
    os.getenv("RAYLLM_ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT", "1")
)
# Timeout before a download in a multiplex deployment fails (the `_S` suffix
# suggests seconds). Read from DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S, default 30.
# A value <= 0 means "no timeout" and is normalized to None below, so consumers
# see either a positive float or None.
DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S = float(
    os.getenv("DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S", "30")
)
if DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S <= 0:
    DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S = None

# Number of tries for downloading a model in a multiplex deployment (default 3).
# NOTE: the constant is named *_TRIES while the environment variable is named
# *_RETRIES. The variable name is kept as-is so existing deployments that set
# DEFAULT_MULTIPLEX_DOWNLOAD_RETRIES keep working.
# NOTE(review): whether the value counts total attempts or retries after the
# first attempt is decided by the consumer -- confirm there.
DEFAULT_MULTIPLEX_DOWNLOAD_TRIES = int(
    os.getenv("DEFAULT_MULTIPLEX_DOWNLOAD_RETRIES", "3")
)
# If true, a default runtime_env will be injected to import rayllm on worker
# startup. This is a startup-time optimization to avoid the latency penalty of
# sequentially importing rayllm in multiple layers of worker processes.
# Only the exact string "1" enables the hook; any other value (including
# "true") disables it. Enabled by default.
ENABLE_WORKER_PROCESS_SETUP_HOOK = (
    os.environ.get("RAYLLM_ENABLE_WORKER_PROCESS_SETUP_HOOK", "1") == "1"
)
# Expiry values for cloud-object lookups: 30 for "missing" results and
# 60 * 60 (3600) for "exists" results.
# NOTE(review): presumably cache TTLs in seconds (per the `_S` suffix) --
# confirm against the code that consumes them.
CLOUD_OBJECT_MISSING_EXPIRE_S = 30
CLOUD_OBJECT_EXISTS_EXPIRE_S = 60 * 60

# File name of a LoRA adapter's config file. (The previous comment here
# described a "sentinel object"; this constant is a plain file-name string.)
LORA_ADAPTER_CONFIG_NAME = "adapter_config.json"
# Defaults for Serve deployment options, each overridable through the
# RAY_SERVE_LLM_DEFAULT_* environment variable named in the call. All values
# are parsed with int(), so a non-integer value raises ValueError at import.

# Health check period and timeout (default 10 each; `_S` suggests seconds).
DEFAULT_HEALTH_CHECK_PERIOD_S = int(
    os.getenv("RAY_SERVE_LLM_DEFAULT_HEALTH_CHECK_PERIOD_S", "10")
)
DEFAULT_HEALTH_CHECK_TIMEOUT_S = int(
    os.getenv("RAY_SERVE_LLM_DEFAULT_HEALTH_CHECK_TIMEOUT_S", "10")
)

# Default is 1e9, i.e. effectively unbounded.
DEFAULT_MAX_ONGOING_REQUESTS = int(
    os.getenv("RAY_SERVE_LLM_DEFAULT_MAX_ONGOING_REQUESTS", "1000000000")
)

DEFAULT_MAX_REPLICAS = int(os.getenv("RAY_SERVE_LLM_DEFAULT_MAX_REPLICAS", "10"))

# Default is 1e9, i.e. effectively unbounded.
DEFAULT_MAX_TARGET_ONGOING_REQUESTS = int(
    os.getenv("RAY_SERVE_LLM_DEFAULT_MAX_TARGET_ONGOING_REQUESTS", "1000000000")
)
# Engine start timeout; defaults to 60 * 60 (3600; `_S` suggests seconds).
ENGINE_START_TIMEOUT_S = int(os.getenv("RAYLLM_ENGINE_START_TIMEOUT_S", str(60 * 60)))

# Inclusive bounds on the number of top logprobs.
# NOTE(review): presumably applied to the request's top-logprobs parameter --
# confirm against the request validation code.
MIN_NUM_TOPLOGPROBS_ALLOWED = 0
MAX_NUM_TOPLOGPROBS_ALLOWED = 5

# Batch timeout for model responses (default 50; `_MS` suggests milliseconds).
# Parsed as a float, so fractional values are accepted.
MODEL_RESPONSE_BATCH_TIMEOUT_MS = float(
    os.getenv("RAYLLM_MODEL_RESPONSE_BATCH_TIMEOUT_MS", "50")
)

# Only the exact string "1" enables request prompt logs. Enabled by default.
RAYLLM_ENABLE_REQUEST_PROMPT_LOGS = (
    os.environ.get("RAYLLM_ENABLE_REQUEST_PROMPT_LOGS", "1") == "1"
)

# Name of the guided decoding backend; the raw string is passed through
# unvalidated. Defaults to "xgrammar".
RAYLLM_GUIDED_DECODING_BACKEND = os.environ.get(
    "RAYLLM_GUIDED_DECODING_BACKEND", "xgrammar"
)

# Maximum number of stopping sequences (default 8).
MAX_NUM_STOPPING_SEQUENCES = int(os.getenv("RAYLLM_MAX_NUM_STOPPING_SEQUENCES", "8"))
# Names of environment variables to propagate (both are Hugging Face token
# variables).
# NOTE(review): the propagation target (e.g. worker runtime environments) is
# not visible here -- confirm against the code that reads this set.
ENV_VARS_TO_PROPAGATE = {
    "HUGGING_FACE_HUB_TOKEN",
    "HF_TOKEN",
}
# HTTP timeout for the LLM router. The default of 600 corresponds to
# 10 minutes when read as seconds; it is generous because streaming responses
# can take longer than 3 minutes. Overridable via
# RAY_SERVE_LLM_ROUTER_HTTP_TIMEOUT; parsed as a float.
DEFAULT_LLM_ROUTER_HTTP_TIMEOUT = float(
    os.environ.get("RAY_SERVE_LLM_ROUTER_HTTP_TIMEOUT", "600")
)

# Verbose telemetry toggle. The variable must hold an integer literal: "0"
# disables it, any non-zero integer enables it, and a non-integer value raises
# ValueError at import time. Disabled by default.
ENABLE_VERBOSE_TELEMETRY = bool(int(os.getenv("RAYLLM_ENABLE_VERBOSE_TELEMETRY", "0")))

# Name of an environment variable (not its value); it is not read here.
# NOTE(review): presumably selects the vLLM engine class -- confirm at the
# lookup site.
RAYLLM_VLLM_ENGINE_CLS_ENV = "RAYLLM_VLLM_ENGINE_CLS"
# The ratio of the number of router replicas to the number of model replicas.
# Defaults to 2, meaning there are 2 router replicas for every model replica.
# Parsed as a float, so fractional ratios are accepted.
DEFAULT_ROUTER_TO_MODEL_REPLICA_RATIO = float(
    os.getenv("RAY_SERVE_LLM_ROUTER_TO_MODEL_REPLICA_RATIO", "2")
)

# Router replica counts (minimum, initial, maximum), each overridable through
# the RAY_SERVE_LLM_ROUTER_* environment variable named in the call and parsed
# with int().
DEFAULT_LLM_ROUTER_MIN_REPLICAS = int(
    os.environ.get("RAY_SERVE_LLM_ROUTER_MIN_REPLICAS", "2")
)
DEFAULT_LLM_ROUTER_INITIAL_REPLICAS = int(
    os.environ.get("RAY_SERVE_LLM_ROUTER_INITIAL_REPLICAS", "2")
)
DEFAULT_LLM_ROUTER_MAX_REPLICAS = int(
    os.environ.get("RAY_SERVE_LLM_ROUTER_MAX_REPLICAS", "1000")
)
# Target number of ongoing requests for the router. When
# RAY_SERVE_LLM_ROUTER_TARGET_ONGOING_REQUESTS is unset, this falls back to
# DEFAULT_MAX_TARGET_ONGOING_REQUESTS, which is defined earlier in this module
# (the fallback is already an int; int() is a no-op on it).
DEFAULT_LLM_ROUTER_TARGET_ONGOING_REQUESTS = int(
    os.environ.get(
        "RAY_SERVE_LLM_ROUTER_TARGET_ONGOING_REQUESTS",
        DEFAULT_MAX_TARGET_ONGOING_REQUESTS,
    )
)
# Home directory path. Taken from RAYLLM_HOME_DIR when set; otherwise
# "~/.ray/llm" with "~" expanded to the current user's home directory.
# A value supplied through the environment is used verbatim (no "~" expansion).
RAYLLM_HOME_DIR = os.environ.get("RAYLLM_HOME_DIR", os.path.expanduser("~/.ray/llm"))
|