| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- from pathlib import Path
- from typing import Any
- import ray
- from ray._private.ray_constants import env_bool
- from ray.air.constants import ( # noqa: F401
- COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV,
- EVALUATION_DATASET_KEY,
- MODEL_KEY,
- PREPROCESSOR_KEY,
- TRAIN_DATASET_KEY,
- )
def _get_ray_train_session_dir() -> str:
    """Return the ``artifacts`` directory of the Ray session dir as a POSIX path.

    Asserts that Ray is initialized, then reads the session directory path
    from the global node and appends ``artifacts`` to it.
    """
    assert ray.is_initialized(), "Ray must be initialized to get the session dir."
    session_dir = ray._private.worker._global_node.get_session_dir_path()
    artifacts_dir = Path(session_dir) / "artifacts"
    return artifacts_dir.as_posix()
# Default storage path: ``~/ray_results`` with the home directory expanded,
# rendered as a POSIX-style string.
DEFAULT_STORAGE_PATH = Path("~/ray_results").expanduser().as_posix()

# Metric keys that ray.train.report() fills in automatically.
# These keys should stay consistent with Tune.
CHECKPOINT_DIR_NAME = "checkpoint_dir_name"
TIME_TOTAL_S = "_time_total_s"
WORKER_HOSTNAME = "_hostname"
WORKER_NODE_IP = "_node_ip"
WORKER_PID = "_pid"

# These keys are only reported when the
# ENABLE_DETAILED_AUTOFILLED_METRICS_ENV env var is not 0.
DETAILED_AUTOFILLED_KEYS = {
    TIME_TOTAL_S,
    WORKER_HOSTNAME,
    WORKER_NODE_IP,
    WORKER_PID,
}

# File name the JSON logger writes to by default.
RESULT_FILE_JSON = "results.json"

# Name of the subdirectory, inside the trainer run_dir, that holds checkpoints.
TRAIN_CHECKPOINT_SUBDIR = "checkpoints"

# Key used to specify the checkpoint id for Tune.
# It has to be added to the checkpoint dictionary so that the checkpoint_id
# can keep incrementing if the Tune trial is restarted.
TUNE_CHECKPOINT_ID = "_current_checkpoint_id"

# Sentinel for deprecated configs: comparing against this value detects
# whether the user has set the config. It is annotated as Any so that it can
# be assigned to any annotated parameter without causing type errors.
_DEPRECATED_VALUE: Any = "DEPRECATED"
# ==================================================
# Train V2 constants
# ==================================================

# Environment variable that toggles the deprecation warnings for V2 migration.
# Set it to 1 to enable the warnings, or to 0 to disable them.
ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR = "RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS"

# Message that links to the migration issue and explains how to turn the
# warnings off.
V2_MIGRATION_GUIDE_MESSAGE = (
    "See this issue for more context and migration options: "
    "https://github.com/ray-project/ray/issues/49454. "
    "Disable these warnings by setting the environment variable: "
    + ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR
    + "=0"
)
def _v2_migration_warnings_enabled() -> bool:
    """Return whether the V2 migration warnings are enabled.

    Looks up the ``ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR`` environment variable
    through ``env_bool``, passing ``True`` as the second argument
    (presumably the value used when the variable is unset).
    """
    warnings_enabled = env_bool(ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR, True)
    return warnings_enabled
# ==================================================
# Environment Variables
# ==================================================

ENABLE_DETAILED_AUTOFILLED_METRICS_ENV = (
    "TRAIN_RESULT_ENABLE_DETAILED_AUTOFILLED_METRICS"
)

# Integer value which if set will override the value of
# Backend.share_cuda_visible_devices. 1 for True, 0 for False.
ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_CUDA_VISIBLE_DEVICES"

# Integer value which if set will not share HIP accelerator visible devices
# across workers. 1 for True (default), 0 for False.
ENABLE_SHARE_HIP_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_HIP_VISIBLE_DEVICES"

# Integer value which if set will not share neuron-core accelerator visible cores
# across workers. 1 for True (default), 0 for False.
ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV = (
    "TRAIN_ENABLE_SHARE_NEURON_CORES_ACCELERATOR"
)

# Integer value which if set will not share npu visible devices
# across workers. 1 for True (default), 0 for False.
# NOTE: The environment variable name says ASCEND while the constant says NPU.
ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV = (
    "TRAIN_ENABLE_SHARE_ASCEND_RT_VISIBLE_DEVICES"
)

# Integer value which indicates the number of seconds to wait when creating
# the worker placement group before timing out.
TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV = "TRAIN_PLACEMENT_GROUP_TIMEOUT_S"

# Integer value which if set will change the placement group strategy from
# PACK to SPREAD. 1 for True, 0 for False.
TRAIN_ENABLE_WORKER_SPREAD_ENV = "TRAIN_ENABLE_WORKER_SPREAD"

# Set this to 0 to disable changing the working directory of each Tune Trainable
# or Train worker to the trial directory. Defaults to 1.
RAY_CHDIR_TO_TRIAL_DIR = "RAY_CHDIR_TO_TRIAL_DIR"

# Set this to 1 to count preemption errors toward `FailureConfig(max_failures)`.
# Defaults to 0, which always retries on node preemption failures.
RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE = "RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE"

# Set this to 1 to start a StateActor and collect information about Train Runs.
# Defaults to 0.
RAY_TRAIN_ENABLE_STATE_TRACKING = "RAY_TRAIN_ENABLE_STATE_TRACKING"

# Set this to 1 to only store the checkpoint score attribute with the Checkpoint
# in the CheckpointManager. The Result will only have the checkpoint score attribute
# but files written to disk like result.json will still have all the metrics.
# Defaults to 0.
# TODO: this is a temporary solution to avoid CheckpointManager OOM.
# See https://github.com/ray-project/ray/pull/54642#issue-3234029360 for more details.
TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE = (
    "TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE"
)

# Seconds to wait for torch process group to shut down.
# Shutting down a healthy torch process group, which we may want to do for reasons
# like restarting a group of workers if an async checkpoint upload fails, can hang.
# This is a workaround until we figure out how to avoid this hang.
TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = "TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S"
DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = 30

# Seconds to wait for JAX distributed shutdown.
JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S = "JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S"
DEFAULT_JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S = 30

# NOTE: When adding a new environment variable, please track it in this list.
# The HIP and NPU share-visible-devices variables are defined in this section
# and are therefore tracked here alongside their CUDA and neuron-core
# counterparts.
TRAIN_ENV_VARS = {
    ENABLE_DETAILED_AUTOFILLED_METRICS_ENV,
    ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
    ENABLE_SHARE_HIP_VISIBLE_DEVICES_ENV,
    ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
    ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV,
    TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV,
    TRAIN_ENABLE_WORKER_SPREAD_ENV,
    RAY_CHDIR_TO_TRIAL_DIR,
    RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
    RAY_TRAIN_ENABLE_STATE_TRACKING,
    TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE,
    TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
    JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S,
}
# TrainingResult metadata key for the AIR Checkpoint metadata.
CHECKPOINT_METADATA_KEY = "checkpoint_metadata"

# TrainingResult metadata key for the AIR Checkpoint world rank.
CHECKPOINT_RANK_KEY = "checkpoint_rank"
|