# Source repository: yichael/image-match
from pathlib import Path
from typing import Any

import ray
from ray._private.ray_constants import env_bool
from ray.air.constants import (  # noqa: F401
    COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV,
    EVALUATION_DATASET_KEY,
    MODEL_KEY,
    PREPROCESSOR_KEY,
    TRAIN_DATASET_KEY,
)


def _get_ray_train_session_dir() -> str:
    """Return the ``artifacts`` path inside the current Ray session directory.

    Returns:
        The session dir path reported by Ray's global node, joined with
        ``"artifacts"`` and rendered as a POSIX-style string.

    Raises:
        AssertionError: If ``ray.is_initialized()`` is false (when assertions
            are enabled).
    """
    assert ray.is_initialized(), "Ray must be initialized to get the session dir."
    session_dir = ray._private.worker._global_node.get_session_dir_path()
    artifacts_dir = Path(session_dir) / "artifacts"
    return artifacts_dir.as_posix()


# Default storage path: the "ray_results" directory under the current user's
# home directory, rendered as a POSIX-style string.
DEFAULT_STORAGE_PATH = (Path("~") / "ray_results").expanduser().as_posix()

# Metric keys that ray.train.report() fills in automatically.
# Keep these keys consistent with Tune.
CHECKPOINT_DIR_NAME = "checkpoint_dir_name"
TIME_TOTAL_S = "_time_total_s"
WORKER_HOSTNAME = "_hostname"
WORKER_NODE_IP = "_node_ip"
WORKER_PID = "_pid"

# Autofilled keys that are reported only if the
# ENABLE_DETAILED_AUTOFILLED_METRICS_ENV env var is not 0.
DETAILED_AUTOFILLED_KEYS = {
    TIME_TOTAL_S,
    WORKER_HOSTNAME,
    WORKER_NODE_IP,
    WORKER_PID,
}

# File name used by the JSON logger by default.
RESULT_FILE_JSON = "results.json"

# Name of the subdirectory of the trainer run_dir in which checkpoints are stored.
TRAIN_CHECKPOINT_SUBDIR = "checkpoints"

# Key under which the checkpoint id is specified for Tune.
# It must be added to the checkpoint dictionary so that, if the Tune trial
# is restarted, the checkpoint_id can keep incrementing.
TUNE_CHECKPOINT_ID = "_current_checkpoint_id"

# Sentinel that deprecated configs can compare against to detect whether the
# user has set them. Annotated as Any so it can be assigned to any annotated
# parameter without causing type errors.
_DEPRECATED_VALUE: Any = "DEPRECATED"


# ==================================================
#               Train V2 constants
# ==================================================

# Env var controlling deprecation warnings for the V2 migration:
# set it to 1 to enable them, 0 to disable them.
ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR = "RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS"


# Message that points to the migration issue and explains how to turn the
# warnings off. The pieces are joined with single spaces.
V2_MIGRATION_GUIDE_MESSAGE = " ".join(
    (
        "See this issue for more context and migration options:",
        "https://github.com/ray-project/ray/issues/49454.",
        "Disable these warnings by setting the environment variable:",
        ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR + "=0",
    )
)


def _v2_migration_warnings_enabled() -> bool:
    """Report whether V2 migration deprecation warnings are enabled.

    Returns:
        The result of ``env_bool`` for the env var named by
        ``ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR``, with ``True`` as the default.
    """
    warnings_enabled = env_bool(ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR, True)
    return warnings_enabled


# ==================================================
#               Environment Variables
# ==================================================

# Name of the env var that gates reporting of DETAILED_AUTOFILLED_KEYS.
ENABLE_DETAILED_AUTOFILLED_METRICS_ENV = (
    "TRAIN_RESULT_ENABLE_DETAILED_AUTOFILLED_METRICS"
)

# Integer value which if set will override the value of
# Backend.share_cuda_visible_devices. 1 for True, 0 for False.
ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_CUDA_VISIBLE_DEVICES"

# Integer value that controls whether HIP accelerator visible devices are
# shared across workers. 1 for True (default), 0 for False.
ENABLE_SHARE_HIP_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_HIP_VISIBLE_DEVICES"

# Integer value that controls whether neuron-core accelerator visible cores
# are shared across workers. 1 for True (default), 0 for False.
ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV = (
    "TRAIN_ENABLE_SHARE_NEURON_CORES_ACCELERATOR"
)

# Integer value that controls whether npu visible devices are shared
# across workers. 1 for True (default), 0 for False.
# NOTE: the env var name says ASCEND_RT while the Python constant says NPU_RT;
# the string is kept as-is because it is what users set at runtime.
ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ASCEND_RT_VISIBLE_DEVICES"

# Integer value which indicates the number of seconds to wait when creating
# the worker placement group before timing out.
TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV = "TRAIN_PLACEMENT_GROUP_TIMEOUT_S"

# Integer value which if set will change the placement group strategy from
# PACK to SPREAD. 1 for True, 0 for False.
TRAIN_ENABLE_WORKER_SPREAD_ENV = "TRAIN_ENABLE_WORKER_SPREAD"

# Set this to 0 to disable changing the working directory of each Tune Trainable
# or Train worker to the trial directory. Defaults to 1.
RAY_CHDIR_TO_TRIAL_DIR = "RAY_CHDIR_TO_TRIAL_DIR"

# Set this to 1 to count preemption errors toward `FailureConfig(max_failures)`.
# Defaults to 0, which always retries on node preemption failures.
RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE = "RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE"

# Set this to 1 to start a StateActor and collect information about Train runs.
# Defaults to 0.
RAY_TRAIN_ENABLE_STATE_TRACKING = "RAY_TRAIN_ENABLE_STATE_TRACKING"

# Set this to 1 to only store the checkpoint score attribute with the Checkpoint
# in the CheckpointManager. The Result will only have the checkpoint score attribute
# but files written to disk like result.json will still have all the metrics.
# Defaults to 0.
# TODO: this is a temporary solution to avoid CheckpointManager OOM.
# See https://github.com/ray-project/ray/pull/54642#issue-3234029360 for more details.
TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE = (
    "TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE"
)

# Seconds to wait for torch process group to shut down.
# Shutting down a healthy torch process group, which we may want to do for reasons
# like restarting a group of workers if an async checkpoint upload fails, can hang.
# This is a workaround until we figure out how to avoid this hang.
TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = "TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S"
DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = 30

# Seconds to wait for JAX distributed shutdown.
JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S = "JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S"
DEFAULT_JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S = 30

# NOTE: When adding a new environment variable, please track it in this list.
# Every env var name defined in this section is listed here, including the
# HIP and NPU device-sharing toggles.
TRAIN_ENV_VARS = {
    ENABLE_DETAILED_AUTOFILLED_METRICS_ENV,
    ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
    ENABLE_SHARE_HIP_VISIBLE_DEVICES_ENV,
    ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
    ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV,
    TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV,
    TRAIN_ENABLE_WORKER_SPREAD_ENV,
    RAY_CHDIR_TO_TRIAL_DIR,
    RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
    RAY_TRAIN_ENABLE_STATE_TRACKING,
    TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE,
    TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
    JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S,
}

# Key for AIR Checkpoint metadata in TrainingResult metadata
CHECKPOINT_METADATA_KEY = "checkpoint_metadata"

# Key for AIR Checkpoint world rank in TrainingResult metadata
CHECKPOINT_RANK_KEY = "checkpoint_rank"