"""Ray constants used in the Python code."""

import json
import logging
import os
import sys

logger = logging.getLogger(__name__)


def env_integer(key, default):
    if key in os.environ:
        value = os.environ[key]
        if value.isdigit():
            return int(os.environ[key])

        logger.debug(
            f"Found {key} in environment, but value must "
            f"be an integer. Got: {value}. Returning "
            f"provided default {default}."
        )
        return default
    return default


def env_float(key, default):
    if key in os.environ:
        value = os.environ[key]
        try:
            return float(value)
        except ValueError:
            logger.debug(
                f"Found {key} in environment, but value must "
                f"be a float. Got: {value}. Returning "
                f"provided default {default}."
            )
            return default
    return default


def env_bool(key, default):
    if key in os.environ:
        return (
            True
            if os.environ[key].lower() == "true" or os.environ[key] == "1"
            else False
        )
    return default


def env_set_by_user(key):
    return key in os.environ


# Whether event logging to driver is enabled. Set to 0 to disable.
AUTOSCALER_EVENTS = env_integer("RAY_SCHEDULER_EVENTS", 1)

# Whether to disable the C++ failure signal handler that provides stack traces
# on crashes. Disabling this is necessary when using Java libraries
# because Ray's signal handler conflicts with the JVM's signal handling.
RAY_DISABLE_FAILURE_SIGNAL_HANDLER = env_bool(
    "RAY_DISABLE_FAILURE_SIGNAL_HANDLER", False
)

RAY_LOG_TO_DRIVER = env_bool("RAY_LOG_TO_DRIVER", True)

# Filter level under which events will be filtered out, i.e. not printing to driver
RAY_LOG_TO_DRIVER_EVENT_LEVEL = os.environ.get("RAY_LOG_TO_DRIVER_EVENT_LEVEL", "INFO")

# Internal kv keys for storing monitor debug status.
DEBUG_AUTOSCALING_ERROR = "__autoscaling_error"
DEBUG_AUTOSCALING_STATUS = "__autoscaling_status"
DEBUG_AUTOSCALING_STATUS_LEGACY = "__autoscaling_status_legacy"

ID_SIZE = 28

# The following constants are used to create default values for
# resource isolation when it is enabled.
# TODO(54703): Link to OSS documentation about the feature once it's available.
DEFAULT_CGROUP_PATH = "/sys/fs/cgroup"
# The default proportion of cpu cores to reserve for ray system processes.
DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION = env_float(
    "RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION", 0.05
)
# The default minimum number of cpu cores to reserve for ray system processes.
# This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION < this value.
DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES = env_float(
    "RAY_DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES", 1.0
)
# The default maximum number of cpu cores to reserve for ray system processes.
# This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION > this value.
DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES = env_float(
    "RAY_DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES", 3.0
)
# The values for SYSTEM_RESERVED_MEMORY do not include the memory reserveed
# for the object store.
# The default proportion available memory to reserve for ray system processes.
DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_float(
    "RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10
)
# The default minimum number of bytes to reserve for ray system processes.
# This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION < this value.
DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
    "RAY_DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES", (500) * (1024**2)
)
# The default maximum number of bytes to reserve for ray system processes.
# This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION > this value.
DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
    "RAY_DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES", (10) * (1024**3)
)

# The default maximum number of bytes to allocate to the object store unless
# overridden by the user.
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = env_integer(
    "RAY_DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES", (200) * (10**9)  # 200 GB
)
# The default proportion of available memory allocated to the object store
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = env_float(
    "RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION",
    0.3,
)
# The smallest cap on the memory used by the object store that we allow.
# This must be greater than MEMORY_RESOURCE_UNIT_BYTES
OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
# Each ObjectRef currently uses about 3KB of caller memory.
CALLER_MEMORY_USAGE_PER_OBJECT_REF = 3000
# Above this number of bytes, raise an error by default unless the user sets
# RAY_ALLOW_SLOW_STORAGE=1. This avoids swapping with large object stores.
REQUIRE_SHM_SIZE_THRESHOLD = 10**10
# Mac with 16GB memory has degraded performance when the object store size is
# greater than 2GB.
# (see https://github.com/ray-project/ray/issues/20388 for details)
# The workaround here is to limit capacity to 2GB for Mac by default,
# and raise error if the capacity is overwritten by user.
MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT = (2) * (2**30)
# If a user does not specify a port for the primary Ray service,
# we attempt to start the service running at this port.
DEFAULT_PORT = 6379

RAY_ADDRESS_ENVIRONMENT_VARIABLE = "RAY_ADDRESS"
RAY_API_SERVER_ADDRESS_ENVIRONMENT_VARIABLE = "RAY_API_SERVER_ADDRESS"
RAY_NAMESPACE_ENVIRONMENT_VARIABLE = "RAY_NAMESPACE"
RAY_RUNTIME_ENV_ENVIRONMENT_VARIABLE = "RAY_RUNTIME_ENV"
RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR = (
    "RAY_RUNTIME_ENV_TEMPORARY_REFERENCE_EXPIRATION_S"
)
# Ray populates this env var to the working dir in the creation of a runtime env.
# For example, `pip` and `conda` users can use this environment variable to locate the
# `requirements.txt` file.
RAY_RUNTIME_ENV_CREATE_WORKING_DIR_ENV_VAR = "RAY_RUNTIME_ENV_CREATE_WORKING_DIR"
# Defaults to 10 minutes. This should be longer than the total time it takes for
# the local working_dir and py_modules to be uploaded, or these files might get
# garbage collected before the job starts.
RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT = 10 * 60
# If set to 1, then `.gitignore` files will not be parsed and loaded into "excludes"
# when using a local working_dir or py_modules.
RAY_RUNTIME_ENV_IGNORE_GITIGNORE = "RAY_RUNTIME_ENV_IGNORE_GITIGNORE"
# Default directories to exclude when packaging working_dir.
# Override by setting the RAY_OVERRIDE_RUNTIME_ENV_DEFAULT_EXCLUDES
# (comma-separated) environment variable. Set to an empty string to disable.
# `.git` is necessary since it is never in .gitignore.
RAY_RUNTIME_ENV_DEFAULT_EXCLUDES = ".git,.venv,venv,__pycache__"


def get_runtime_env_default_excludes() -> list[str]:
    """Get default excludes for working_dir, overridable via RAY_OVERRIDE_RUNTIME_ENV_DEFAULT_EXCLUDES environment variable."""
    val = os.environ.get(
        "RAY_OVERRIDE_RUNTIME_ENV_DEFAULT_EXCLUDES", RAY_RUNTIME_ENV_DEFAULT_EXCLUDES
    )
    return [x.strip() for x in val.split(",") if x.strip()]


# Hook for running a user-specified runtime-env hook. This hook will be called
# unconditionally given the runtime_env dict passed for ray.init. It must return
# a rewritten runtime_env dict. Example: "your.module.runtime_env_hook".
RAY_RUNTIME_ENV_HOOK = "RAY_RUNTIME_ENV_HOOK"
# Hook that is invoked on `ray start`. It will be given the cluster parameters and
# whether we are the head node as arguments. The function can modify the params class,
# but otherwise returns void. Example: "your.module.ray_start_hook".
RAY_START_HOOK = "RAY_START_HOOK"
# Hook that is invoked on `ray job submit`. It will be given all the same args as the
# job.cli.submit() function gets, passed as kwargs to this function.
RAY_JOB_SUBMIT_HOOK = "RAY_JOB_SUBMIT_HOOK"
# Headers to pass when using the Job CLI. It will be given to
# instantiate a Job SubmissionClient.
RAY_JOB_HEADERS = "RAY_JOB_HEADERS"

# Timeout waiting for the dashboard to come alive during node startup.
RAY_DASHBOARD_STARTUP_TIMEOUT_S = env_integer("RAY_DASHBOARD_STARTUP_TIMEOUT_S", 60)

DEFAULT_DASHBOARD_IP = "127.0.0.1"
DEFAULT_DASHBOARD_PORT = 8265
DASHBOARD_ADDRESS = "dashboard"
DASHBOARD_CLIENT_MAX_SIZE = 100 * 1024**2
PROMETHEUS_SERVICE_DISCOVERY_FILE = "prom_metrics_service_discovery.json"
DEFAULT_DASHBOARD_AGENT_LISTEN_PORT = 52365
# Default resource requirements for actors when no resource requirements are
# specified.
DEFAULT_ACTOR_METHOD_CPU_SIMPLE = 1
DEFAULT_ACTOR_CREATION_CPU_SIMPLE = 0
# Default resource requirements for actors when some resource requirements are
# specified in .
DEFAULT_ACTOR_METHOD_CPU_SPECIFIED = 0
DEFAULT_ACTOR_CREATION_CPU_SPECIFIED = 1
# Default number of return values for each actor method.
DEFAULT_ACTOR_METHOD_NUM_RETURN_VALS = 1

# Wait 30 seconds for client to reconnect after unexpected disconnection
DEFAULT_CLIENT_RECONNECT_GRACE_PERIOD = 30

# If a remote function or actor (or some other export) has serialized size
# greater than this quantity, print an warning.
FUNCTION_SIZE_WARN_THRESHOLD = 10**7
FUNCTION_SIZE_ERROR_THRESHOLD = env_integer("FUNCTION_SIZE_ERROR_THRESHOLD", (10**8))

# If remote functions with the same source are imported this many times, then
# print a warning.
DUPLICATE_REMOTE_FUNCTION_THRESHOLD = 100

# The maximum resource quantity that is allowed. TODO(rkn): This could be
# relaxed, but the current implementation of the node manager will be slower
# for large resource quantities due to bookkeeping of specific resource IDs.
MAX_RESOURCE_QUANTITY = 100e12

# Number of units 1 resource can be subdivided into.
MIN_RESOURCE_GRANULARITY = 0.0001

# Set this environment variable to populate the dashboard URL with
# an external hosted Ray dashboard URL (e.g. because the
# dashboard is behind a proxy or load balancer). This only overrides
# the dashboard URL when returning or printing to a user through a public
# API, but not in the internal KV store.
RAY_OVERRIDE_DASHBOARD_URL = "RAY_OVERRIDE_DASHBOARD_URL"


# Different types of Ray errors that can be pushed to the driver.
# TODO(rkn): These should be defined in flatbuffers and must be synced with
# the existing C++ definitions.
PICKLING_LARGE_OBJECT_PUSH_ERROR = "pickling_large_object"
WAIT_FOR_FUNCTION_PUSH_ERROR = "wait_for_function"
VERSION_MISMATCH_PUSH_ERROR = "version_mismatch"
WORKER_CRASH_PUSH_ERROR = "worker_crash"
WORKER_DIED_PUSH_ERROR = "worker_died"
WORKER_POOL_LARGE_ERROR = "worker_pool_large"
PUT_RECONSTRUCTION_PUSH_ERROR = "put_reconstruction"
RESOURCE_DEADLOCK_ERROR = "resource_deadlock"
REMOVED_NODE_ERROR = "node_removed"
MONITOR_DIED_ERROR = "monitor_died"
LOG_MONITOR_DIED_ERROR = "log_monitor_died"
DASHBOARD_AGENT_DIED_ERROR = "dashboard_agent_died"
DASHBOARD_DIED_ERROR = "dashboard_died"
RAYLET_DIED_ERROR = "raylet_died"
DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR = "detached_actor_anonymous_namespace"
EXCESS_QUEUEING_WARNING = "excess_queueing_warning"

# Used by autoscaler to set the node custom resources and labels
# from cluster.yaml.
RESOURCES_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_RESOURCES"
LABELS_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_LABELS"

# Temporary flag to disable log processing in the dashboard.  This is useful
# if the dashboard is overloaded by logs and failing to process other
# dashboard API requests (e.g. Job Submission).
DISABLE_DASHBOARD_LOG_INFO = env_integer("RAY_DISABLE_DASHBOARD_LOG_INFO", 0)

LOGGER_FORMAT = "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
LOGGER_FORMAT_ESCAPE = json.dumps(LOGGER_FORMAT.replace("%", "%%"))
LOGGER_FORMAT_HELP = f"The logging format. default={LOGGER_FORMAT_ESCAPE}"
# Configure the default logging levels for various Ray components.
# TODO (kevin85421): Currently, I don't encourage Ray users to configure
# `RAY_LOGGER_LEVEL` until its scope and expected behavior are clear and
# easy to understand. Now, only Ray developers should use it.
LOGGER_LEVEL = os.environ.get("RAY_LOGGER_LEVEL", "info")
LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"]
LOGGER_LEVEL_HELP = (
    "The logging level threshold, choices=['debug', 'info',"
    " 'warning', 'error', 'critical'], default='info'"
)

LOGGING_REDIRECT_STDERR_ENVIRONMENT_VARIABLE = "RAY_LOG_TO_STDERR"
# Logging format when logging stderr. This should be formatted with the
# component before setting the formatter, e.g. via
#   format = LOGGER_FORMAT_STDERR.format(component="dashboard")
#   handler.setFormatter(logging.Formatter(format))
LOGGER_FORMAT_STDERR = (
    "%(asctime)s\t%(levelname)s ({component}) %(filename)s:%(lineno)s -- %(message)s"
)

# Constants used to define the different process types.
PROCESS_TYPE_REAPER = "reaper"
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_RAY_CLIENT_SERVER = "ray_client_server"
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
PROCESS_TYPE_DASHBOARD = "dashboard"
PROCESS_TYPE_DASHBOARD_AGENT = "dashboard_agent"
PROCESS_TYPE_RUNTIME_ENV_AGENT = "runtime_env_agent"
PROCESS_TYPE_WORKER = "worker"
PROCESS_TYPE_RAYLET = "raylet"
PROCESS_TYPE_REDIS_SERVER = "redis_server"
PROCESS_TYPE_GCS_SERVER = "gcs_server"
PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER = "python-core-driver"
PROCESS_TYPE_PYTHON_CORE_WORKER = "python-core-worker"

# Log file names
MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log"
LOG_MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_LOG_MONITOR}.log"

# Enable log deduplication.
RAY_DEDUP_LOGS = env_bool("RAY_DEDUP_LOGS", True)

# How many seconds of messages to buffer for log deduplication.
RAY_DEDUP_LOGS_AGG_WINDOW_S = env_integer("RAY_DEDUP_LOGS_AGG_WINDOW_S", 5)

# Regex for log messages to never deduplicate, or None. This takes precedence over
# the skip regex below. A default pattern is set for testing.
TESTING_NEVER_DEDUP_TOKEN = "__ray_testing_never_deduplicate__"
RAY_DEDUP_LOGS_ALLOW_REGEX = os.environ.get(
    "RAY_DEDUP_LOGS_ALLOW_REGEX", TESTING_NEVER_DEDUP_TOKEN
)

# Regex for log messages to always skip / suppress, or None.
RAY_DEDUP_LOGS_SKIP_REGEX = os.environ.get("RAY_DEDUP_LOGS_SKIP_REGEX")

AGENT_PROCESS_TYPE_DASHBOARD_AGENT = "ray::DashboardAgent"
AGENT_PROCESS_TYPE_RUNTIME_ENV_AGENT = "ray::RuntimeEnvAgent"

AGENT_PROCESS_LIST = [
    AGENT_PROCESS_TYPE_DASHBOARD_AGENT,
    AGENT_PROCESS_TYPE_RUNTIME_ENV_AGENT,
]

WORKER_PROCESS_TYPE_IDLE_WORKER = "ray::IDLE"
WORKER_PROCESS_TYPE_SPILL_WORKER_NAME = "SpillWorker"
WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME = "RestoreWorker"
WORKER_PROCESS_TYPE_SPILL_WORKER_IDLE = (
    f"ray::IDLE_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
)
WORKER_PROCESS_TYPE_RESTORE_WORKER_IDLE = (
    f"ray::IDLE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
)
WORKER_PROCESS_TYPE_SPILL_WORKER = f"ray::SPILL_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
WORKER_PROCESS_TYPE_RESTORE_WORKER = (
    f"ray::RESTORE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
)
WORKER_PROCESS_TYPE_SPILL_WORKER_DELETE = (
    f"ray::DELETE_{WORKER_PROCESS_TYPE_SPILL_WORKER_NAME}"
)
WORKER_PROCESS_TYPE_RESTORE_WORKER_DELETE = (
    f"ray::DELETE_{WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME}"
)

# The number of files the log monitor will open. If more files exist, they will
# be ignored.
LOG_MONITOR_MAX_OPEN_FILES = int(
    os.environ.get("RAY_LOG_MONITOR_MAX_OPEN_FILES", "200")
)

# The maximum batch of lines to be read in a single iteration. We _always_ try
# to read this number of lines even if there aren't any new lines.
LOG_MONITOR_NUM_LINES_TO_READ = int(
    os.environ.get("RAY_LOG_MONITOR_NUM_LINES_TO_READ", "1000")
)

# Autoscaler events are denoted by the ":event_summary:" magic token.
LOG_PREFIX_EVENT_SUMMARY = ":event_summary:"
# Cluster-level info events are denoted by the ":info_message:" magic token. These may
# be emitted in the stderr of Ray components.
LOG_PREFIX_INFO_MESSAGE = ":info_message:"
# Actor names are recorded in the logs with this magic token as a prefix.
LOG_PREFIX_ACTOR_NAME = ":actor_name:"
# Task names are recorded in the logs with this magic token as a prefix.
LOG_PREFIX_TASK_NAME = ":task_name:"
# Job ids are recorded in the logs with this magic token as a prefix.
LOG_PREFIX_JOB_ID = ":job_id:"

# The object metadata field uses the following format: It is a comma
# separated list of fields. The first field is mandatory and is the
# type of the object (see types below) or an integer, which is interpreted
# as an error value. The second part is optional and if present has the
# form DEBUG:<breakpoint_id>, it is used for implementing the debugger.

# A constant used as object metadata to indicate the object is cross language.
OBJECT_METADATA_TYPE_CROSS_LANGUAGE = b"XLANG"
# A constant used as object metadata to indicate the object is python specific.
OBJECT_METADATA_TYPE_PYTHON = b"PYTHON"
# A constant used as object metadata to indicate the object is raw bytes.
OBJECT_METADATA_TYPE_RAW = b"RAW"

# A constant used as object metadata to indicate the object is an actor handle.
# This value should be synchronized with the Java definition in
# ObjectSerializer.java
# TODO(fyrestone): Serialize the ActorHandle via the custom type feature
# of XLANG.
OBJECT_METADATA_TYPE_ACTOR_HANDLE = b"ACTOR_HANDLE"

# A constant indicating the debugging part of the metadata (see above).
OBJECT_METADATA_DEBUG_PREFIX = b"DEBUG:"

AUTOSCALER_RESOURCE_REQUEST_CHANNEL = b"autoscaler_resource_request"

REDIS_DEFAULT_USERNAME = ""

REDIS_DEFAULT_PASSWORD = ""

# The Mach kernel page size in bytes.
MACH_PAGE_SIZE_BYTES = 4096

# The max number of bytes for task execution error message.
MAX_APPLICATION_ERROR_LENGTH = env_integer("RAY_MAX_APPLICATION_ERROR_LENGTH", 500)

# Max 64 bit integer value, which is needed to ensure against overflow
# in C++ when passing integer values cross-language.
MAX_INT64_VALUE = 9223372036854775807

# Object Spilling related constants
DEFAULT_OBJECT_PREFIX = "ray_spilled_objects"

GCS_PORT_ENVIRONMENT_VARIABLE = "RAY_GCS_SERVER_PORT"

HEALTHCHECK_EXPIRATION_S = os.environ.get("RAY_HEALTHCHECK_EXPIRATION_S", 10)

# Filename of "shim process" that sets up Python worker environment.
# Should be kept in sync with kSetupWorkerFilename in
# src/ray/common/constants.h.
SETUP_WORKER_FILENAME = "setup_worker.py"

# Directory name where runtime_env resources will be created & cached.
DEFAULT_RUNTIME_ENV_DIR_NAME = "runtime_resources"

# The timeout seconds for the creation of runtime env,
# dafault timeout is 10 minutes
DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS = 600

# The timeout seconds for the GCS server request.
# Try fetching from the cpp environment variable first.
GCS_SERVER_REQUEST_TIMEOUT_SECONDS = int(
    os.environ.get("RAY_gcs_server_request_timeout_seconds", "60")
)

# Used to separate lines when formatting the call stack where an ObjectRef was
# created.
CALL_STACK_LINE_DELIMITER = " | "

# The default gRPC max message size is 4 MiB, we use a larger number of 512 MiB
# NOTE: This is equal to the C++ limit of (RAY_CONFIG::max_grpc_message_size)
GRPC_CPP_MAX_MESSAGE_SIZE = 512 * 1024 * 1024

# The gRPC send & receive max length for "dashboard agent" server.
# NOTE: This is equal to the C++ limit of RayConfig::max_grpc_message_size
#       and HAVE TO STAY IN SYNC with it (ie, meaning that both of these values
#       have to be set at the same time)
AGENT_GRPC_MAX_MESSAGE_LENGTH = env_integer(
    "AGENT_GRPC_MAX_MESSAGE_LENGTH", 20 * 1024 * 1024  # 20MB
)


# GRPC options
GRPC_ENABLE_HTTP_PROXY = (
    1
    if os.environ.get("RAY_grpc_enable_http_proxy", "0").lower() in ("1", "true")
    else 0
)
GLOBAL_GRPC_OPTIONS = (("grpc.enable_http_proxy", GRPC_ENABLE_HTTP_PROXY),)

# Internal kv namespaces
KV_NAMESPACE_DASHBOARD = b"dashboard"
KV_NAMESPACE_SESSION = b"session"
KV_NAMESPACE_TRACING = b"tracing"
KV_NAMESPACE_PDB = b"ray_pdb"
KV_NAMESPACE_HEALTHCHECK = b"healthcheck"
KV_NAMESPACE_JOB = b"job"
KV_NAMESPACE_CLUSTER = b"cluster"
KV_HEAD_NODE_ID_KEY = b"head_node_id"
# TODO: Set package for runtime env
# We need to update ray client for this since runtime env use ray client
# This might introduce some compatibility issues so leave it here for now.
KV_NAMESPACE_PACKAGE = None
KV_NAMESPACE_FUNCTION_TABLE = b"fun"

LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]

NEURON_CORES = "neuron_cores"
GPU = "GPU"
TPU = "TPU"
NPU = "NPU"
HPU = "HPU"


RAY_WORKER_NICENESS = "RAY_worker_niceness"

# Default max_retries option in @ray.remote for non-actor
# tasks.
DEFAULT_TASK_MAX_RETRIES = 3

# Default max_concurrency option in @ray.remote for threaded actors.
DEFAULT_MAX_CONCURRENCY_THREADED = 1

# Ray internal flags. These flags should not be set by users, and we strip them on job
# submission.
# This should be consistent with src/ray/common/ray_internal_flag_def.h
RAY_INTERNAL_FLAGS = [
    "RAY_JOB_ID",
    "RAY_RAYLET_PID",
    "RAY_OVERRIDE_NODE_ID_FOR_TESTING",
]

DEFAULT_RESOURCES = {"CPU", "GPU", "memory", "object_store_memory"}

# Supported Python versions for runtime env's "conda" field. Ray downloads
# Ray wheels into the conda environment, so the Ray wheels for these Python
# versions must be available online.
RUNTIME_ENV_CONDA_PY_VERSIONS = [(3, 9), (3, 10), (3, 11), (3, 12)]

# Whether to enable Ray clusters (in addition to local Ray).
# Ray clusters are not explicitly supported for Windows and OSX.
IS_WINDOWS_OR_OSX = sys.platform == "darwin" or sys.platform == "win32"
ENABLE_RAY_CLUSTERS_ENV_VAR = "RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER"
ENABLE_RAY_CLUSTER = env_bool(
    ENABLE_RAY_CLUSTERS_ENV_VAR,
    not IS_WINDOWS_OR_OSX,
)

SESSION_LATEST = "session_latest"
NUM_PORT_RETRIES = 40
NUM_REDIS_GET_RETRIES = int(os.environ.get("RAY_NUM_REDIS_GET_RETRIES", "20"))

# Turn this on if actor task log's offsets are expected to be recorded.
# With this enabled, actor tasks' log could be queried with task id.
RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING = env_bool(
    "RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING", False
)

# RuntimeEnv env var to indicate it exports a function
WORKER_PROCESS_SETUP_HOOK_ENV_VAR = "__RAY_WORKER_PROCESS_SETUP_HOOK_ENV_VAR"
RAY_WORKER_PROCESS_SETUP_HOOK_LOAD_TIMEOUT_ENV_VAR = (
    "RAY_WORKER_PROCESS_SETUP_HOOK_LOAD_TIMEOUT"  # noqa
)

RAY_DEFAULT_LABEL_KEYS_PREFIX = "ray.io/"

RAY_TPU_MAX_CONCURRENT_CONNECTIONS_ENV_VAR = "RAY_TPU_MAX_CONCURRENT_ACTIVE_CONNECTIONS"

RAY_NODE_IP_FILENAME = "node_ip_address.json"

RAY_LOGGING_CONFIG_ENCODING = os.environ.get("RAY_LOGGING_CONFIG_ENCODING")

RAY_BACKEND_LOG_JSON_ENV_VAR = "RAY_BACKEND_LOG_JSON"

# Write export API event of all resource types to file if enabled.
# RAY_enable_export_api_write_config will not be considered if
# this is enabled.
RAY_ENABLE_EXPORT_API_WRITE = env_bool("RAY_enable_export_api_write", False)

# Comma separated string containing individual resource
# to write export API events for. This configuration is only used if
# RAY_enable_export_api_write is not enabled. Full list of valid
# resource types in ExportEvent.SourceType enum in
# src/ray/protobuf/export_api/export_event.proto
# Example config:
# `export RAY_enable_export_api_write_config='EXPORT_SUBMISSION_JOB,EXPORT_ACTOR'`
RAY_ENABLE_EXPORT_API_WRITE_CONFIG_STR = os.environ.get(
    "RAY_enable_export_api_write_config", ""
)
RAY_ENABLE_EXPORT_API_WRITE_CONFIG = RAY_ENABLE_EXPORT_API_WRITE_CONFIG_STR.split(",")

RAY_EXPORT_EVENT_MAX_FILE_SIZE_BYTES = env_bool(
    "RAY_EXPORT_EVENT_MAX_FILE_SIZE_BYTES", 100 * 1e6
)

RAY_EXPORT_EVENT_MAX_BACKUP_COUNT = env_bool("RAY_EXPORT_EVENT_MAX_BACKUP_COUNT", 20)

# If this flag is set and you run the driver with `uv run`, Ray propagates the `uv run`
# environment to all workers. Ray does this by setting the `py_executable` to the
# `uv run`` command line and by propagating the working directory
# via the `working_dir` plugin so uv finds the pyproject.toml.
# If you enable RAY_ENABLE_UV_RUN_RUNTIME_ENV AND you run the driver
# with `uv run`, Ray deactivates the regular RAY_RUNTIME_ENV_HOOK
# because in most cases the hooks wouldn't work unless you specifically make the code
# for the runtime env hook available in your uv environment and make sure your hook
# is compatible with your uv runtime environment. If you want to combine a custom
# RAY_RUNTIME_ENV_HOOK with `uv run`, you should flag off RAY_ENABLE_UV_RUN_RUNTIME_ENV
# and call ray._private.runtime_env.uv_runtime_env_hook.hook manually in your hook or
# manually set the py_executable in your runtime environment hook.
RAY_ENABLE_UV_RUN_RUNTIME_ENV = env_bool("RAY_ENABLE_UV_RUN_RUNTIME_ENV", True)

# Prometheus metric cardinality level setting, either "legacy" or "recommended".
#
# Legacy: report all metrics to prometheus with the set of labels that are reported by
#   the component, including WorkerId, (task or actor) Name, etc. This is the default.
# Recommended: report only the node level metrics to prometheus. This means that the
#   WorkerId will be removed from all metrics.
# Low: Same as recommended, but also drop the Name label for tasks and actors.
RAY_METRIC_CARDINALITY_LEVEL = os.environ.get(
    "RAY_metric_cardinality_level", "recommended"
)

# Whether enable OpenTelemetry as the metrics collection backend. The default is
# using OpenCensus.
RAY_ENABLE_OPEN_TELEMETRY = env_bool("RAY_enable_open_telemetry", True)

# How long to wait for a fetch for an RDT object to complete during ray.get before timing out and raising an exception to the user.
#
# NOTE: This is a tenth of `RayConfig::fetch_fail_timeout_milliseconds` by default as RDT transfers are expected to be much faster.
RDT_FETCH_FAIL_TIMEOUT_SECONDS = (
    env_integer("RAY_rdt_fetch_fail_timeout_milliseconds", 60000) / 1000
)

# Whether to enable zero-copy serialization for PyTorch tensors.
# When enabled, Ray serializes PyTorch tensors by converting them to NumPy arrays
# and leveraging pickle5's zero-copy buffer sharing. This avoids copying the
# underlying tensor data, which can improve performance when passing large tensors
# across tasks or actors. Note that this is experimental and should be used with caution
# as we won't copy and allow a write to shared memory. One process changing a tensor
# after ray.get could be reflected in another process.
#
# This feature is experimental and works best under the following conditions:
# - The tensor has `requires_grad=False` (i.e., is detached from the autograd graph).
# - The tensor is contiguous in memory
# - Performance benefits from this are larger if the tensor resides in CPU memory
# - You are not using Ray Direct Transport
#
# Tensors on GPU or non-contiguous tensors are still supported: Ray will
# automatically move them to CPU and/or make them contiguous as needed.
# While this incurs an initial copy, subsequent serialization may still benefit
# from reduced overhead compared to the default path.
#
# Use with caution and ensure tensors meet the above criteria before enabling.
# Default: False.
RAY_ENABLE_ZERO_COPY_TORCH_TENSORS = env_bool(
    "RAY_ENABLE_ZERO_COPY_TORCH_TENSORS", False
)