# Copyright 2020-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The Trainer class, to easily train a 🤗 Transformers model from scratch or fine-tune it on a new task.
"""
import contextlib
import functools
import glob
import inspect
import json
import math
import os
import random
import shutil
import sys
import tempfile
import time
import warnings
from collections.abc import Callable, Iterator, Mapping
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any
# Integrations must be imported before ML frameworks:
# ruff: isort: off
from .integrations import (
get_reporting_integration_callbacks,
)
# ruff: isort: on
import numpy as np
import safetensors.torch
import torch
import torch.distributed as dist
from huggingface_hub import CommitInfo, ModelCard, create_repo, upload_folder
from packaging import version
from torch import nn
from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler
from . import __version__
from .configuration_utils import PreTrainedConfig
from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
from .debug_utils import DebugOption, DebugUnderflowOverflow
from .feature_extraction_sequence_utils import SequenceFeatureExtractor
from .feature_extraction_utils import FeatureExtractionMixin
from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend
from .image_processing_utils import BaseImageProcessor
from .integrations.deepspeed import (
deepspeed_init,
deepspeed_load_checkpoint,
deepspeed_sp_compute_loss,
is_deepspeed_available,
propagate_args_to_deepspeed,
)
from .integrations.fsdp import get_fsdp_ckpt_kwargs, update_fsdp_plugin_peft
from .integrations.liger import apply_liger_kernel
from .integrations.neftune import activate_neftune, deactivate_neftune
from .integrations.peft import MIN_PEFT_VERSION
from .integrations.tpu import save_tpu_checkpoint, tpu_spmd_dataloader, wrap_model_xla_fsdp
from .modelcard import TrainingSummary
from .modeling_utils import PreTrainedModel, unwrap_model
from .models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_MAPPING_NAMES,
)
from .optimization import GreedyLR, get_scheduler
from .processing_utils import ProcessorMixin
from .tokenization_utils_base import PreTrainedTokenizerBase
from .trainer_callback import (
CallbackHandler,
DefaultFlowCallback,
ExportableState,
PrinterCallback,
ProgressCallback,
TrainerCallback,
TrainerControl,
TrainerState,
)
from .trainer_optimizer import (
_OPTIMIZER_HANDLERS,
OptimizerContext,
_parse_optim_args,
is_optimizer_factory,
)
from .trainer_pt_utils import (
EvalLoopContainer,
IterableDatasetShard,
LabelSmoother,
LengthGroupedSampler,
distributed_broadcast_scalars,
find_batch_size,
get_model_param_count,
get_parameter_names,
is_attention_mask_causal,
nested_detach,
nested_gather,
reissue_pt_warnings,
remove_dummy_checkpoint,
safe_globals,
set_rng_state_for_device,
)
from .trainer_utils import (
PREFIX_CHECKPOINT_DIR,
BestRun,
EvalLoopOutput,
EvalPrediction,
HPSearchBackend,
HubStrategy,
PredictionOutput,
RemoveColumnsCollator,
SaveStrategy,
TrainerMemoryTracker,
TrainOutput,
_is_peft_model,
align_special_tokens,
compare_trainer_and_checkpoint_args,
default_compute_objective,
denumpify_detensorize,
enable_full_determinism,
find_executable_batch_size,
get_last_checkpoint,
has_length,
load_sharded_checkpoint,
number_of_arguments,
rotate_checkpoints,
seed_worker,
set_seed,
sort_checkpoints,
speed_metrics,
suppress_progress_bars,
unwrap_peft_model,
validate_quantization_for_training,
)
from .training_args import OptimizerNames, ParallelMode, TrainingArguments
from .utils import (
ADAPTER_CONFIG_NAME,
ADAPTER_SAFE_WEIGHTS_NAME,
ADAPTER_WEIGHTS_NAME,
CONFIG_NAME,
GENERATION_CONFIG_NAME,
SAFE_WEIGHTS_INDEX_NAME,
SAFE_WEIGHTS_NAME,
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
XLA_FSDPV2_MIN_VERSION,
PushInProgress,
can_return_loss,
check_torch_load_is_safe,
find_labels,
is_accelerate_available,
is_datasets_available,
is_in_notebook,
is_peft_available,
is_sagemaker_dp_enabled,
is_sagemaker_mp_enabled,
is_torch_hpu_available,
is_torch_mlu_available,
is_torch_musa_available,
is_torch_npu_available,
is_torch_xla_available,
logging,
)
from .utils.import_utils import requires
from .utils.quantization_config import QuantizationMethod
# Callbacks that `Trainer.__init__` always installs, ahead of the reporting-integration
# callbacks and any user-supplied callbacks.
DEFAULT_CALLBACKS = [DefaultFlowCallback]
# Progress callback that `Trainer.__init__` adds when `args.disable_tqdm` is falsy
# (otherwise `PrinterCallback` is added instead).
DEFAULT_PROGRESS_CALLBACK = ProgressCallback

if is_in_notebook():
    # Inside a notebook, swap in the notebook-specific progress callback.
    from .utils.notebook import NotebookProgressCallback

    DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback
if is_datasets_available():
import datasets
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met
import torch_xla.runtime as xr
from torch_xla import __version__ as XLA_VERSION
IS_XLA_FSDPV2_POST_2_2 = version.parse(XLA_VERSION) >= version.parse(XLA_FSDPV2_MIN_VERSION)
if IS_XLA_FSDPV2_POST_2_2:
import torch_xla.distributed.spmd as xs
else:
IS_XLA_FSDPV2_POST_2_2 = False
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_nested_concat
if is_peft_available():
from peft import PeftModel
if is_accelerate_available():
from accelerate import Accelerator, skip_first_batches
from accelerate.state import AcceleratorState
from accelerate.utils import (
DataLoaderConfiguration,
DistributedDataParallelKwargs,
DistributedType,
GradientAccumulationPlugin,
load_fsdp_model,
load_fsdp_optimizer,
release_memory,
save_fsdp_model,
save_fsdp_optimizer,
)
from accelerate.utils.memory import clear_device_cache
if is_deepspeed_available():
from accelerate.utils import DeepSpeedSchedulerWrapper
if TYPE_CHECKING:
import optuna
logger = logging.get_logger(__name__)
# Names of the files used for checkpointing
TRAINING_ARGS_NAME = "training_args.bin"
TRAINER_STATE_NAME = "trainer_state.json"
OPTIMIZER_NAME = "optimizer.pt"
SCALER_NAME = "scaler.pt"
OPTIMIZER_NAME_BIN = "optimizer.bin"
SCHEDULER_NAME = "scheduler.pt"
FSDP_MODEL_NAME = "pytorch_model_fsdp"
@requires(
backends=(
"torch",
"accelerate",
)
)
class Trainer:
"""
Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
Args:
model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.
[`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use
your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers
models.
args ([`TrainingArguments`], *optional*):
The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the
`output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
data_collator (`DataCollator`, *optional*):
The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
default to [`default_data_collator`] if no `processing_class` is provided, an instance of
[`DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or tokenizer.
train_dataset (`torch.utils.data.Dataset` | `torch.utils.data.IterableDataset` | `datasets.Dataset`, *optional*):
The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
`model.forward()` method are automatically removed.
Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
`torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
sets the seed of the RNGs used.
eval_dataset (`torch.utils.data.Dataset` | dict[str, `torch.utils.data.Dataset`] | `datasets.Dataset`, *optional*):
The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
`model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
dataset prepending the dictionary key to the metric name.
processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
Processing class used to process the data. If provided, will be used to automatically process the inputs
for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
reuse the fine-tuned model.
model_init (`Callable[[], PreTrainedModel]`, *optional*):
A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
from a new instance of the model as given by this function.
The function may have zero arguments, or a single one containing the optuna/Ray Tune trial object, to
be able to choose different architectures according to hyperparameters (such as layer count, sizes of
inner layers, dropout probabilities etc).
compute_loss_func (`Callable`, *optional*):
A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618) used by [`Trainer`].
compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
a dictionary mapping metric names (strings) to metric values. *Note*: when passing [`TrainingArguments`] with
`batch_eval_metrics` set to `True`, your `compute_metrics` function must take a boolean `compute_result`
argument. This will be triggered after the last eval batch to signal that the function needs to calculate and
return the global summary statistics rather than accumulating the batch-level statistics.
callbacks (List of [`TrainerCallback`], *optional*):
A list of callbacks to customize the training loop. Will add those to the list of default callbacks
detailed [here](callback).
If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], dict[str, Any]]`, *optional*):
A tuple containing the optimizer class and keyword arguments to use.
Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument.
Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer.
preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
A function that preprocesses the logits right before caching them at each evaluation step. Must take two
tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
by this function will be reflected in the predictions received by `compute_metrics`.
Note that the labels (second parameter) will be `None` if the dataset does not have them.
Important attributes:
- **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`]
subclass.
- **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner
model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
- **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
data parallelism, this means some of the model layers are split on different GPUs).
- **place_model_on_device** -- Whether or not to automatically place the model on the device. Defaults to
`True` unless model parallel, DeepSpeed, FSDP, full fp16/bf16 eval, or SageMaker MP is active. Can be
overridden by subclassing `TrainingArguments` and overriding the `place_model_on_device` property.
- **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
in `train`)
"""
# Those methods are not used in Trainer itself but are available as methods for external use.
from .trainer_pt_utils import (
get_learning_rates,
get_num_trainable_parameters,
get_optimizer_group,
log_metrics,
metrics_format,
save_metrics,
save_state,
)
# ---- Initialization & Validation ----
def __init__(
    self,
    model: PreTrainedModel | nn.Module | None = None,
    args: TrainingArguments | None = None,
    data_collator: DataCollator | None = None,
    train_dataset: "Dataset | IterableDataset | datasets.Dataset | None" = None,
    eval_dataset: "Dataset | dict[str, Dataset] | datasets.Dataset | None" = None,
    processing_class: PreTrainedTokenizerBase
    | BaseImageProcessor
    | FeatureExtractionMixin
    | ProcessorMixin
    | None = None,
    model_init: Callable[..., PreTrainedModel] | None = None,
    compute_loss_func: Callable | None = None,
    compute_metrics: Callable[[EvalPrediction], dict] | None = None,
    callbacks: list[TrainerCallback] | None = None,
    optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
    optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None,
    preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
):
    """
    Build a trainer around `model` (or `model_init`) and `args`.

    The meaning of each argument is described in the class docstring. Construction creates the accelerator,
    resolves the model, decides on device placement, stores the data / callable / optimizer arguments,
    validates them through `_validate_args`, registers the callbacks and initializes the training state.

    Raises:
        RuntimeError: If neither `model` nor `model_init` is provided.
        ValueError: If both `model` and `model_init` are provided, if the model class name is listed in
            `MODEL_MAPPING_NAMES`, if FSDP is combined with DeepSpeed or requested outside distributed
            training, or if XLA FSDPv2 is requested while `IS_XLA_FSDPV2_POST_2_2` is false.

    Further errors may be raised by `_validate_args` and `create_accelerator_and_postprocess`.
    """
    # Init flow:
    #   1. Args & seed            – defaults, determinism
    #   2. Accelerator & logging  – accelerator, memory tracker, log level, device setup
    #   3. Model resolution       – model / model_init, Liger Kernel, quantization checks
    #   4. Distributed strategy   – model-parallel, FSDP, SageMaker MP flags
    #   5. Device placement       – move model to device, model wrapping
    #   6. Model introspection    – loss kwargs, label names, label smoother
    #   7. Store init arguments   – data, callables, optimizer, scheduler, validation
    #   8. Callbacks              – reporting integrations, JIT checkpoint, progress bar
    #   9. Hub & output           – repo init, output directory
    #  10. Training state         – TrainerState, TrainerControl, internal bookkeeping
    #  11. Finalize               – use_cache, XLA FSDPv2 mesh, memory tracker stop

    # ---- 1. Args & seed --------------------------------------------------------
    if args is None:
        output_dir = "tmp_trainer"
        logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
        args = TrainingArguments(output_dir=output_dir)
    self.args = args
    # Seed must be set before instantiating the model when using model_init
    if self.args.full_determinism:
        enable_full_determinism(self.args.seed)
    else:
        set_seed(self.args.seed)

    # ---- 2. Accelerator & logging ----------------------------------------------
    # `create_accelerator_and_postprocess` reads self.model and self.args,
    # and may set self.deepspeed — store temporary refs before calling it.
    self.deepspeed = None
    self.model = model
    self.create_accelerator_and_postprocess()

    self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
    self._memory_tracker.start()

    log_level = args.get_process_log_level()
    logging.set_verbosity(log_level)

    args._setup_devices  # force device and distributed setup init explicitly

    # ---- 3. Model resolution ----------------------------------------------------
    if model is None:
        if model_init is not None:
            self.model_init = model_init
            model = self.call_model_init()
        else:
            raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument")
    else:
        if model_init is not None:
            raise ValueError("`Trainer` requires either a `model` or `model_init` argument, but not both.")
        self.model_init = model_init

    if model.__class__.__name__ in MODEL_MAPPING_NAMES:
        raise ValueError(
            f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
            "computes hidden states and does not accept any labels. You should choose a model with a head "
            "suitable for your task like any of the `AutoModelForXxx` listed at "
            "https://huggingface.co/docs/transformers/model_doc/auto"
        )

    validate_quantization_for_training(model)

    # ---- 4. Distributed strategy ------------------------------------------------
    self.is_model_parallel = False
    if getattr(model, "hf_device_map", None) is not None:
        # Only accelerator devices count: "cpu" and "disk" entries are offload targets.
        devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]]
        if len(devices) > 1:
            self.is_model_parallel = True
        elif len(devices) == 1:
            self.is_model_parallel = self.args.device != torch.device(devices[0])

    self.is_fsdp_xla_enabled = args.fsdp_config["xla"]
    if len(args.fsdp) > 0:
        if self.is_deepspeed_enabled:
            raise ValueError(
                "Using --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags."
            )
        if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED:
            raise ValueError("Using fsdp only works in distributed training.")

    # Postpone switching model to cuda when MP, DeepSpeed, full bf16/fp16 eval, or FSDP
    if args.place_model_on_device is not None:
        self.place_model_on_device = args.place_model_on_device
    elif (
        self.is_model_parallel
        or self.is_deepspeed_enabled
        or (args.fp16_full_eval or args.bf16_full_eval)
        or self.is_fsdp_xla_enabled
        or self.is_fsdp_enabled
        or is_sagemaker_mp_enabled()
    ):
        self.place_model_on_device = False
    else:
        self.place_model_on_device = True

    # ---- 5. Device placement ----------------------------------------------------
    # Bnb Quantized models don't support `.to` operation.
    if (
        self.place_model_on_device
        and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
    ):
        self._move_model_to_device(model, args.device)

    # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
    if self.is_model_parallel:
        self.args._n_gpu = 1

    # `self.model is self.model_wrapped` is used later to check if it's wrapped
    self.model_wrapped = model
    self.model = model

    # ---- 6. Model introspection -------------------------------------------------
    unwrapped_model = unwrap_peft_model(self.accelerator.unwrap_model(model))
    if hasattr(unwrapped_model, "accepts_loss_kwargs"):
        self.model_accepts_loss_kwargs = unwrapped_model.accepts_loss_kwargs
    else:
        # Fall back to checking whether `forward` takes `**kwargs`.
        forward_params = inspect.signature(unwrapped_model.forward).parameters
        self.model_accepts_loss_kwargs = any(
            k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values()
        )

    # Sequence Parallelism computes its own good_tokens count
    pc = getattr(self.accelerator, "parallelism_config", None)
    if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_enabled:
        self.model_accepts_loss_kwargs = False

    model_to_inspect = unwrap_peft_model(self.model)
    default_label_names = find_labels(model_to_inspect.__class__)
    self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
    self.can_return_loss = can_return_loss(model_to_inspect.__class__)

    self.label_smoother = None
    if self.args.label_smoothing_factor != 0:
        # Custom `nn.Module` models may not expose a `config`; treat that as "no problem type"
        # instead of failing with an AttributeError.
        model_config = getattr(self.model, "config", None)
        if getattr(model_config, "problem_type", None) == "multi_label_classification":
            warnings.warn(
                "Label smoothing is not compatible with multi-label classification. "
                "Disabling label smoothing for this training run.",
                UserWarning,
            )
        else:
            self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)

    # ---- 7. Store init arguments ------------------------------------------------
    # Data
    default_collator = (
        DataCollatorWithPadding(processing_class)
        if processing_class is not None
        and isinstance(processing_class, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
        else default_data_collator
    )
    self.data_collator = data_collator if data_collator is not None else default_collator
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.processing_class = processing_class
    self.neftune_noise_alpha = args.neftune_noise_alpha

    # Callables
    self.compute_loss_func = compute_loss_func
    self.compute_metrics = compute_metrics
    self.preprocess_logits_for_metrics = preprocess_logits_for_metrics

    # Optimizer & scheduler
    self.optimizer, self.lr_scheduler = optimizers
    self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs

    self._validate_args()

    # ---- 8. Callbacks -----------------------------------------------------------
    default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
    if self.args.enable_jit_checkpoint:
        from .trainer_jit_checkpoint import JITCheckpointCallback

        jit_callback = JITCheckpointCallback()
        default_callbacks = default_callbacks + [jit_callback]
        jit_callback.set_trainer(self)
    callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
    self.callback_handler = CallbackHandler(
        callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
    )
    self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)

    # ---- 9. Hub & output ---------------------------------------------------------
    self.hub_model_id = None  # Set by init_hf_repo() when push_to_hub is enabled
    if self.args.push_to_hub:
        self.init_hf_repo()
    if self.args.should_save:
        os.makedirs(self.args.output_dir, exist_ok=True)

    # ---- 10. Training state -----------------------------------------------------
    self.control = TrainerControl()
    self.state = TrainerState(
        is_local_process_zero=self.is_local_process_zero(),
        is_world_process_zero=self.is_world_process_zero(),
        stateful_callbacks=[
            cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
        ],
    )
    self.is_in_train = False  # True between train() entry and exit
    self.hp_name = None  # Set by hyperparameter_search() to label the trial
    self.hp_search_backend = None  # Set by hyperparameter_search() (optuna / ray / wandb)
    # Per-process FLOP counter; accumulated into self.state.total_flos then reset
    self.current_flos = 0
    # Set True by _setup_loggers() on first call to self.log()
    self._loggers_initialized = False
    # Lazily filled by _set_signature_columns_if_needed(); caches model.forward param names
    self._signature_columns = None
    # Effective batch size; may be reduced by find_executable_batch_size
    self._train_batch_size = args.train_batch_size
    # Guards one-time LR scheduler creation in create_optimizer_and_scheduler
    self._created_lr_scheduler = False

    self.control = self.callback_handler.on_init_end(self.args, self.state, self.control)

    # ---- 11. Finalize -----------------------------------------------------------
    if getattr(self.model, "config", None) is not None:
        self.model.config.use_cache = self.args.use_cache

    self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False)
    if self.is_fsdp_xla_v2_enabled:
        if not IS_XLA_FSDPV2_POST_2_2:
            raise ValueError("FSDPv2 requires `torch_xla` 2.2 or higher.")
        # One-dimensional sharding: every device goes on the "fsdp" axis, "tensor" axis has size 1.
        num_devices = xr.global_runtime_device_count()
        xs.set_global_mesh(xs.Mesh(np.array(range(num_devices)), (num_devices, 1), axis_names=("fsdp", "tensor")))
    self.is_fsdp_xla_v1_enabled = self.is_fsdp_xla_enabled and not self.is_fsdp_xla_v2_enabled

    self._memory_tracker.stop_and_update_metrics()
def _validate_args(self) -> None:
    """
    Validate constructor arguments and fail fast on incompatible combinations.

    Reads `self.args`, `self.compute_metrics`, `self.eval_dataset`, `self.train_dataset`, `self.data_collator`,
    `self.model`, `self.model_init`, `self.optimizer`, `self.lr_scheduler`, `self.optimizer_cls_and_kwargs` and the
    FSDP flags, so it must run after those attributes have been set.

    Side effects:
        Under SageMaker Model Parallel, `args.fp16` is overwritten with `smp.state.cfg.fp16` when they differ.

    Raises:
        ValueError: On unsupported BF16 with SageMaker MP, a `compute_metrics` lacking `compute_result` while
            `batch_eval_metrics` is set, an evaluation strategy without `eval_dataset`, a missing
            `metric_for_best_model`, a model/optimizer device mismatch under XLA, or a length-less
            `train_dataset` without `max_steps`.
        RuntimeError: When `optimizers` is combined with `optimizer_cls_and_kwargs`, `model_init` or FSDP.
        TypeError: When `data_collator` is not callable but exposes a callable `collate_batch`.
    """
    args = self.args

    # --- SageMaker Model Parallel mixed-precision validation ---
    if is_sagemaker_mp_enabled():
        if args.bf16:
            raise ValueError("SageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead ")
        if args.fp16 != smp.state.cfg.fp16:
            logger.warning(
                f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, "
                f"but FP16 provided in trainer argument is {args.fp16}, "
                f"setting to {smp.state.cfg.fp16}"
            )
            args.fp16 = smp.state.cfg.fp16

    # --- Training-argument validations ---
    if args.batch_eval_metrics and self.compute_metrics is not None:
        if "compute_result" not in inspect.signature(self.compute_metrics).parameters:
            raise ValueError(
                "When using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result`"
                " boolean argument which will be triggered after the last batch of the eval set to signal that the"
                " summary statistics should be returned by the function."
            )
    if args.eval_strategy is not None and args.eval_strategy != "no" and self.eval_dataset is None:
        raise ValueError(
            f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. "
        )
    if args.save_strategy == SaveStrategy.BEST or args.load_best_model_at_end:
        if args.metric_for_best_model is None:
            raise ValueError(
                "`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`."
            )

    # --- Optimizer validations ---
    if self.optimizer_cls_and_kwargs is not None and self.optimizer is not None:
        raise RuntimeError("Passing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.")
    if self.model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None):
        raise RuntimeError(
            "Passing a `model_init` is incompatible with providing the `optimizers` argument. "
            "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
        )
    if is_torch_xla_available() and self.optimizer is not None:
        # Compare the device of the first model parameter with the device of the first optimizer
        # parameter. Either side may be empty (parameter-less model, empty param groups); in that
        # case there is nothing to compare and the check is skipped instead of failing on an
        # unbound local.
        first_model_param = next(iter(self.model.parameters()), None)
        model_device = None if first_model_param is None else first_model_param.device
        optimizer_device = next(
            (
                param_group["params"][0].device
                for param_group in self.optimizer.param_groups
                if len(param_group["params"]) > 0
            ),
            None,
        )
        if model_device is not None and optimizer_device is not None and model_device != optimizer_device:
            raise ValueError(
                "The model and the optimizer parameters are not on the same device, which probably means you"
                " created an optimizer around your model **before** putting on the device and passing it to the"
                " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and"
                " `model.to(xm.xla_device())` is performed before the optimizer creation in your script."
            )
    if (self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and (
        self.optimizer is not None or self.lr_scheduler is not None
    ):
        raise RuntimeError(
            "Passing `optimizers` is not allowed if PyTorch FSDP is enabled. "
            "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
        )

    # --- Dataset validations ---
    # An object exposing `collate_batch` without being callable itself is rejected outright.
    if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
        raise TypeError("The `data_collator` should be a simple callable (function, class with `__call__`).")
    if args.max_steps > 0 and args.num_train_epochs > 0:
        logger.info("max_steps is given, it will override any value given in num_train_epochs")
    if self.train_dataset is not None and not has_length(self.train_dataset) and args.max_steps <= 0:
        raise ValueError(
            "The train_dataset does not implement __len__, max_steps has to be specified. "
            "The number of steps needs to be known in advance for the learning rate scheduler."
        )
    if self.train_dataset is not None and isinstance(self.train_dataset, torch.utils.data.IterableDataset):
        logger.info(
            f"The `train_sampling_strategy='{args.train_sampling_strategy}'` option is ignored when using an `IterableDataset`. "
            "Samplers cannot be used with IterableDataset as they require indexed access to the dataset."
        )
def _build_accelerator_args(self, **kwargs) -> dict[str, Any]:
    """
    Assemble the keyword arguments used to construct the `Accelerator`.

    Args:
        **kwargs: Extra entries merged on top of the base arguments (`mixed_precision`, `deepspeed_plugin`);
            on a key collision the value from `kwargs` wins.

    Returns:
        A dict containing the base arguments, `kwargs`, a `kwargs_handlers` list holding one
        `DistributedDataParallelKwargs`, a `parallelism_config` when one is configured or required for tensor
        parallelism, and a `dynamo_plugin` when `accelerate>=1.2.0` is available.

    Raises:
        ImportError: If `args.parallelism_config` is set but the installed accelerate is too old.
        ValueError: If the model has `tp_size > 1`, no parallelism config was given, and the installed
            accelerate is too old to build one.
    """
    # Minimum accelerate release required for `ParallelismConfig`.
    min_accelerate_version = "1.12.0"

    args = {
        "mixed_precision": self.args.mixed_precision,
        "deepspeed_plugin": self.args.deepspeed_plugin,
    }
    args.update(kwargs)

    # --- DDP kwargs ---
    if self.args.ddp_find_unused_parameters is not None:
        find_unused = self.args.ddp_find_unused_parameters
    elif isinstance(self.model, PreTrainedModel):
        # find_unused_parameters breaks checkpointing as per
        # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021
        find_unused = not (self.model.is_gradient_checkpointing or self.args.gradient_checkpointing)
    else:
        find_unused = True

    ddp_kwargs = {"find_unused_parameters": find_unused}
    if self.args.ddp_bucket_cap_mb is not None:
        ddp_kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb
    if self.args.ddp_broadcast_buffers is not None:
        ddp_kwargs["broadcast_buffers"] = self.args.ddp_broadcast_buffers
    args["kwargs_handlers"] = [DistributedDataParallelKwargs(**ddp_kwargs)]

    # --- Parallelism config ---
    # We defer compatibility checks to accelerator
    if self.args.parallelism_config is not None:
        if not is_accelerate_available(min_accelerate_version):
            raise ImportError(
                f"ParallelismConfig requires accelerate>={min_accelerate_version}. Please upgrade accelerate to use this feature."
            )
        args["parallelism_config"] = self.args.parallelism_config

    # --- Tensor parallelism ---
    tp_size = getattr(self.model, "tp_size", None)
    if tp_size is not None and tp_size > 1:
        if self.args.parallelism_config is None:
            # No user-provided config: build one from the model's tensor-parallel size.
            if not is_accelerate_available(min_accelerate_version):
                raise ValueError(f"Requires accelerate>={min_accelerate_version} to use Tensor Parallelism.")
            from accelerate import ParallelismConfig

            args["parallelism_config"] = ParallelismConfig(tp_size=tp_size)
        elif args["parallelism_config"].tp_size != tp_size:
            # The model's tensor-parallel size takes precedence over the configured one.
            args["parallelism_config"].tp_size = tp_size

    # --- torch.compile / dynamo ---
    if is_accelerate_available("1.2.0"):
        # if we don't have the correct version, we will rely on env var instead that were set in TrainingArguments
        from accelerate.utils import TorchDynamoPlugin

        dynamo_plugin = TorchDynamoPlugin(
            backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode
        )
        args["dynamo_plugin"] = dynamo_plugin

    return args
def create_accelerator_and_postprocess(self) -> None:
"""Create the accelerator and perform post-creation setup (FSDP, DeepSpeed, etc.)."""
# We explicitly don't rely on the `Accelerator` to do gradient accumulation
grad_acc_kwargs = {}
if self.args.accelerator_config.gradient_accumulation_kwargs is not None:
grad_acc_kwargs = self.args.accelerator_config.gradient_accumulation_kwargs
# check if num_steps is attempted to be passed in gradient_accumulation_kwargs
if "num_steps" in grad_acc_kwargs:
if self.args.gradient_accumulation_steps > 1:
# raise because we do not know which setting is intended.
raise ValueError(
"The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`"
"If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`."
)
else:
self.args.gradient_accumulation_steps = grad_acc_kwargs["num_steps"]
# The Trainer handles GAS itself, so GAS=1 in Accelerate to avoid any double-division
grad_acc_kwargs["num_steps"] = 1
# Just making sure that gradient_state have the correct values passed.
# We don't rely on `accumulate` from accelerate to set sync_gradients in gradient_state.
# Rather, we do it ourselves by setting self.accelerator.gradient_state._set_sync_gradients.
gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)
accelerator_config = self.args.accelerator_config.to_dict()
# Extract dataloader config params from accelerator config
dataloader_params = ["split_batches", "dispatch_batches", "even_batches", "use_seedable_sampler"]
dataloader_config = DataLoaderConfiguration(
**{param: accelerator_config.pop(param) for param in dataloader_params}
)
dataloader_config.data_seed = self.args.data_seed
non_blocking = accelerator_config.pop("non_blocking")
if non_blocking and not self.args.dataloader_pin_memory:
logger.warning(
"`non_blocking` is enabled but `dataloader_pin_memory` is not. For the best performance, it's recommended to enable both."
)
dataloader_config.non_blocking = non_blocking
# this would have been updated above, no need for it anymore
accelerator_config.pop("gradient_accumulation_kwargs")
fsdp_plugin = None
if self.args.fsdp_plugin_args is not None:
from accelerate.utils import FullyShardedDataParallelPlugin
fsdp_plugin = FullyShardedDataParallelPlugin(**self.args.fsdp_plugin_args)
args = self._build_accelerator_args(
dataloader_config=dataloader_config,
fsdp_plugin=fsdp_plugin,
gradient_accumulation_plugin=gradient_accumulation_plugin,
)
# create accelerator object
self.accelerator = Accelerator(**args)
# some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag
self.gather_function = self.accelerator.gather_for_metrics
if "use_gather_object" in inspect.signature(self.gather_function).parameters:
self.gather_function = functools.partial(
self.gather_function, use_gather_object=self.args.eval_use_gather_object
)
# deepspeed and accelerate flags covering both trainer args and accelerate launcher
self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
# post accelerator creation setup
if self.is_fsdp_enabled:
fsdp_plugin = self.accelerator.state.fsdp_plugin
for param in ["limit_all_gathers", "activation_checkpointing"]:
setattr(fsdp_plugin, param, self.args.fsdp_config.get(param, getattr(fsdp_plugin, param)))
if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing:
raise ValueError(
"The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg "
"can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic "
"when using FSDP."
)
if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None:
propagate_args_to_deepspeed(self.accelerator, self.args)
# `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end`
if (
self.args.save_only_model
and (self.is_deepspeed_enabled or self.is_fsdp_enabled)
and self.args.load_best_model_at_end
):
wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.")
# `auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3
if (
self.is_deepspeed_enabled
and self.accelerator.state.deepspeed_plugin.zero_stage == 3
and self.args.auto_find_batch_size
):
raise ValueError(
"`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDP"
)
if (
self.args.save_only_model
and self.is_fsdp_enabled
and "SHARDED_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)
):
raise ValueError("save_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'")
# ---- Data Loading ----
def get_train_dataloader(self) -> DataLoader:
    """
    Build the [`~torch.utils.data.DataLoader`] used for training.

    The dataloader is created by `self._get_dataloader` with `self._train_batch_size` as batch size and
    `self._get_train_sampler` as sampler factory.

    Subclass and override this method if you want to inject some custom behavior.

    Raises:
        ValueError: If the trainer has no `train_dataset`.
    """
    train_dataset = self.train_dataset
    if train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")

    loader_kwargs = {
        "dataset": train_dataset,
        "description": "Training",
        "batch_size": self._train_batch_size,
        "sampler_fn": self._get_train_sampler,
        # Enables the training-only dataloader setup in `_get_dataloader`.
        "is_training": True,
    }
    return self._get_dataloader(**loader_kwargs)
def get_eval_dataloader(self, eval_dataset: str | Dataset | None = None) -> DataLoader:
"""
Returns the evaluation [`~torch.utils.data.DataLoader`].
Subclass and override this method if you want to inject some custom behavior.
Args:
eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*):
If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed.
"""
if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
# If we have persistent workers, don't do a fork bomb especially as eval datasets
# don't change during training
dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval"
if (
hasattr(self, "_eval_dataloaders")
and dataloader_key in self._eval_dataloaders
and self.args.dataloader_persistent_workers
):
return self._eval_dataloaders[dataloader_key]
eval_dataset = (
self.eval_dataset[eval_dataset]
if isinstance(eval_dataset, str)
else eval_dataset
if eval_dataset is not None
else self.eval_dataset
)
return self._get_dataloader(
dataset=eval_dataset,
description="Evaluation",
batch_size=self.args.eval_batch_size,
sampler_fn=self._get_eval_sampler,
dataloader_key=dataloader_key,
)
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
    """
    Build the [`~torch.utils.data.DataLoader`] used for prediction on a test set.

    Subclass and override this method if you want to inject some custom behavior.

    Args:
        test_dataset (`torch.utils.data.Dataset`):
            The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed. It must implement `__len__`.
    """
    # Test dataloaders share the evaluation batch size and sampler; nothing is cached for them
    # because no `dataloader_key` is passed.
    loader_kwargs = {
        "dataset": test_dataset,
        "description": "test",
        "batch_size": self.args.eval_batch_size,
        "sampler_fn": self._get_eval_sampler,
    }
    return self._get_dataloader(**loader_kwargs)
def num_examples(self, dataloader: DataLoader) -> int:
    """
    Return the number of samples behind a [`~torch.utils.data.DataLoader`].

    The count is taken from the dataloader's dataset when possible. If the dataset is missing or has no length,
    it is estimated as the number of batches times `per_device_train_batch_size`.
    """
    try:
        dataset = dataloader.dataset
        # An IterableDatasetShard wraps the real dataset one level deeper.
        sized = dataset.dataset if isinstance(dataset, IterableDatasetShard) else dataset
        return len(sized)
    except (NameError, AttributeError, TypeError):
        # No dataset or no length: fall back to an estimate from the dataloader length.
        return len(dataloader) * self.args.per_device_train_batch_size
def _get_dataloader(
    self,
    dataset: Dataset,
    description: str,
    batch_size: int,
    sampler_fn: Callable[[Dataset], torch.utils.data.Sampler] | None = None,
    is_training: bool = False,
    dataloader_key: str | None = None,
) -> DataLoader:
    """
    Create a [`~torch.utils.data.DataLoader`] from the given dataset and prepare it with the accelerator.

    Args:
        dataset: Dataset to wrap. For a `datasets.Dataset`, unused columns are removed from the dataset itself;
            otherwise the collator is wrapped to drop them.
        description: Name of the split, forwarded to the column-removal helpers.
        batch_size: Batch size given to the dataloader.
        sampler_fn: Optional factory called with the dataset to build a sampler (map-style datasets only).
        is_training: Whether to install the worker seeding function (map-style datasets only).
        dataloader_key: If set and persistent workers are enabled, the prepared dataloader is stored in
            `self._eval_dataloaders` under this key.
    """
    args = self.args

    collate_fn = self.data_collator
    if is_datasets_available() and isinstance(dataset, datasets.Dataset):
        dataset = self._remove_unused_columns(dataset, description=description)
    else:
        collate_fn = self._get_collator_with_removed_columns(self.data_collator, description=description)

    # MPS requires forking if multiple workers are specified
    use_fork = torch.backends.mps.is_available() and args.dataloader_num_workers > 1

    loader_kwargs = {
        "batch_size": batch_size,
        "collate_fn": collate_fn,
        "num_workers": args.dataloader_num_workers,
        "pin_memory": args.dataloader_pin_memory,
        "persistent_workers": args.dataloader_persistent_workers,
        "multiprocessing_context": "fork" if use_fork else None,
    }

    # Sampler, drop_last, prefetching and worker seeding are only configured for map-style datasets.
    if not isinstance(dataset, torch.utils.data.IterableDataset):
        if sampler_fn is not None:
            loader_kwargs["sampler"] = sampler_fn(dataset)
        loader_kwargs["drop_last"] = args.dataloader_drop_last
        loader_kwargs["prefetch_factor"] = args.dataloader_prefetch_factor
        if is_training:
            loader_kwargs["worker_init_fn"] = partial(
                seed_worker, num_workers=args.dataloader_num_workers, rank=args.process_index
            )

    prepared = self.accelerator.prepare(DataLoader(dataset, **loader_kwargs))

    # Store the prepared dataloader for subsequent evaluations if using persistent workers.
    if dataloader_key is not None and args.dataloader_persistent_workers:
        if not hasattr(self, "_eval_dataloaders"):
            self._eval_dataloaders = {}
        self._eval_dataloaders[dataloader_key] = prepared

    return prepared
def _get_train_sampler(self, train_dataset: Dataset | None = None) -> torch.utils.data.Sampler | None:
    """
    Pick the training sampler according to `args.train_sampling_strategy`.

    Falls back to `self.train_dataset` when no dataset is passed. Returns `None` when there is no dataset or it
    has no length; otherwise a length-grouped, sequential or random sampler.
    """
    dataset = self.train_dataset if train_dataset is None else train_dataset
    if dataset is None or not has_length(dataset):
        return None

    if self.args.train_sampling_strategy == "group_by_length":
        # Precomputed lengths are only read from a `datasets.Dataset` that has the length column.
        lengths = None
        if is_datasets_available() and isinstance(dataset, datasets.Dataset):
            if self.args.length_column_name in dataset.column_names:
                lengths = dataset[self.args.length_column_name]

        model_input_name = None
        if self.processing_class is not None:
            model_input_name = self.processing_class.model_input_names[0]

        return LengthGroupedSampler(
            self.args.train_batch_size * self.args.gradient_accumulation_steps,
            dataset=dataset,
            lengths=lengths,
            model_input_name=model_input_name,
        )

    if self.args.train_sampling_strategy == "sequential":
        return SequentialSampler(dataset)

    return RandomSampler(dataset)
def _get_eval_sampler(self, eval_dataset: Dataset) -> torch.utils.data.Sampler | None:
    """
    Pick the evaluation sampler.

    Returns `None` when there is no dataset or it has no length. With the `group_by_length` strategy a
    length-grouped sampler is used; otherwise a sequential sampler when `world_size <= 1`, and `None` when
    running on several processes.
    """
    if eval_dataset is None or not has_length(eval_dataset):
        return None

    if self.args.train_sampling_strategy == "group_by_length":
        # Precomputed lengths are only read from a `datasets.Dataset` that has the length column.
        lengths = None
        if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
            if self.args.length_column_name in eval_dataset.column_names:
                lengths = eval_dataset[self.args.length_column_name]

        model_input_name = None
        if self.processing_class is not None:
            model_input_name = self.processing_class.model_input_names[0]

        return LengthGroupedSampler(
            self.args.eval_batch_size,
            dataset=eval_dataset,
            lengths=lengths,
            model_input_name=model_input_name,
        )

    return SequentialSampler(eval_dataset) if self.args.world_size <= 1 else None
def _set_signature_columns_if_needed(self) -> None:
    """
    Fill `self._signature_columns` from the model's `forward` signature, once.

    Does nothing when the columns are already set. For PEFT models the base model's `forward` is inspected. The
    label columns (`label`, `label_ids` and `self.label_names`) are always appended.
    """
    if self._signature_columns is not None:
        return

    # Inspect model forward signature to keep only the arguments it accepts.
    target = self.model
    if _is_peft_model(self.model):
        if hasattr(self.model, "get_base_model"):
            target = self.model.get_base_model()
        else:
            # PeftMixedModel does not provide a `get_base_model` method
            target = self.model.base_model.model

    forward_params = inspect.signature(target.forward).parameters
    self._signature_columns = list(forward_params.keys())
    # Labels may be named label or label_ids, the default data collator handles that.
    self._signature_columns += list(set(["label", "label_ids"] + self.label_names))
def _remove_unused_columns(
    self, dataset: "datasets.Dataset", description: str | None = None
) -> "datasets.Dataset":
    """
    Drop the dataset columns that the model's `forward` does not accept.

    Returns the dataset untouched when `args.remove_unused_columns` is disabled. The ignored columns are logged.

    Raises:
        ValueError: If none of the dataset columns matches the model's forward signature.
    """
    if not self.args.remove_unused_columns:
        return dataset

    self._set_signature_columns_if_needed()
    accepted = self._signature_columns

    dropped = list(set(dataset.column_names) - set(accepted))
    if dropped:
        where = f"in the {description} set" if description is not None else ""
        model_cls = self.model.__class__.__name__
        dropped_str = ", ".join(dropped)
        logger.info(
            f"The following columns {where} don't have a corresponding argument in "
            f"`{model_cls}.forward` and have been ignored: {dropped_str}."
            f" If {dropped_str} are not expected by `{model_cls}.forward`, "
            " you can safely ignore this message."
        )

    kept = [name for name in accepted if name in dataset.column_names]
    if not kept:
        raise ValueError(
            f"No columns in the dataset match the model's forward method signature: ({', '.join(accepted)}). "
            f"The following columns have been ignored: [{', '.join(dropped)}]. "
            "Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`."
        )

    if version.parse(datasets.__version__) >= version.parse("1.4.0"):
        return dataset.remove_columns(dropped)

    # Older `datasets` releases: restrict the output format to the kept columns in place instead.
    dataset.set_format(type=dataset.format["type"], columns=kept, format_kwargs=dataset.format["format_kwargs"])
    return dataset
def _get_collator_with_removed_columns(self, data_collator: Callable, description: str | None = None) -> Callable:
    """
    Return a collator that drops the columns the model's `forward` does not accept.

    The collator is returned unchanged when `args.remove_unused_columns` is disabled; otherwise it is wrapped
    in a `RemoveColumnsCollator` built from the model's signature columns.
    """
    if self.args.remove_unused_columns:
        self._set_signature_columns_if_needed()
        return RemoveColumnsCollator(
            data_collator=data_collator,
            signature_columns=self._signature_columns,
            logger=logger,
            description=description,
            model_name=self.model.__class__.__name__,
        )
    return data_collator
# ---- Optimizer & Scheduler & Learning rate ----
def create_optimizer_and_scheduler(self, num_training_steps: int) -> None:
    """
    Create the optimizer, then the learning rate scheduler.

    We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
    Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or
    `create_scheduler`) in a subclass.

    Args:
        num_training_steps (`int`): Number of training steps, forwarded to `create_scheduler`.
    """
    # `create_scheduler` falls back to `self.optimizer`, so the optimizer has to be built first.
    self.create_optimizer()
    self.create_scheduler(
        num_training_steps=num_training_steps,
    )
def create_optimizer(self, model=None) -> torch.optim.Optimizer:
    """
    Setup the optimizer.

    We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
    Trainer's init through `optimizers`, or subclass and override this method in a subclass.

    Nothing is built when `self.optimizer` is already set; the existing optimizer is returned (wrapped for
    SageMaker model parallelism when that is enabled).

    Args:
        model (*optional*):
            Model whose parameters are optimized. Defaults to `self.model` when `None`.

    Returns:
        `torch.optim.Optimizer`: The optimizer instance.
    """
    opt_model = self.model if model is None else model
    if self.optimizer is None:
        decay_parameters = self.get_decay_parameter_names(opt_model)
        # Two groups of trainable parameters: those that get `args.weight_decay` and those that get none.
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
                ],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [
                    p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
                ],
                "weight_decay": 0.0,
            },
        ]
        # A class/kwargs pair stored on the trainer takes precedence over the one derived from the args.
        if self.optimizer_cls_and_kwargs is not None:
            optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
        else:
            optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model)
        # Check if this is a factory (for complex optimizers like Muon, Dion)
        # Factories are instantiated first, then called with (opt_model, **kwargs)
        if is_optimizer_factory(optimizer_cls):
            self.optimizer = optimizer_cls()(opt_model, **optimizer_kwargs)
        else:
            # Standard optimizer class instantiation
            # The three checks below run in order, so when several keys are present the last one wins.
            # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs`
            # e.g. for GaLore optimizer.
            if "params" in optimizer_kwargs:
                optimizer_grouped_parameters = optimizer_kwargs.pop("params")
            # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs`
            # e.g. for LOMO optimizer.
            if "model" in optimizer_kwargs:
                optimizer_grouped_parameters = optimizer_kwargs.pop("model")
            # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict`
            # to avoid arguments conflicts.
            if "optimizer_dict" in optimizer_kwargs:
                optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict")
            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
        # For 8-bit bitsandbytes optimizers, register a 32-bit override for every embedding weight.
        if "bitsandbytes" in str(optimizer_cls) and optimizer_kwargs.get("optim_bits", None) == 8:
            import bitsandbytes

            manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
            skipped = 0
            for module in opt_model.modules():
                if isinstance(module, nn.Embedding):
                    # Keying by `data_ptr` counts parameters that share storage only once per module.
                    skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                    # NOTE(review): `skipped` is the running total over all embeddings seen so far, not this
                    # module's own size, so this per-module line prints a cumulative figure — confirm intent.
                    logger.info(f"skipped {module}: {skipped / 2**20}M params")
                    manager.register_module_override(module, "weight", {"optim_bits": 32})
                    logger.debug(f"bitsandbytes: will optimize {module} in fp32")
            logger.info(f"skipped: {skipped / 2**20}M params")
    # Applied whether the optimizer was just created or already existed.
    if is_sagemaker_mp_enabled():
        self.optimizer = smp.DistributedOptimizer(self.optimizer)
    return self.optimizer
def create_scheduler(
self, num_training_steps: int, optimizer: torch.optim.Optimizer | None = None
) -> torch.optim.lr_scheduler.LRScheduler:
"""
Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
passed as an argument.
Args:
num_training_steps (int): The number of training steps to do.
Returns:
`torch.optim.lr_scheduler.LRScheduler`: The learning rate scheduler instance.
"""
if self.lr_scheduler is None:
if optimizer is None:
if is_sagemaker_mp_enabled() and smp.state.cfg.fp16:
# If fp16 is enabled, we unwrap the optimizer
optimizer = self.optimizer.optimizer
else:
optimizer = self.optimizer
self.lr_scheduler = get_scheduler(
self.args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
scheduler_specific_kwargs=self.args.lr_scheduler_kwargs,
)
self._created_lr_scheduler = True
return self.lr_scheduler
@staticmethod
def get_optimizer_cls_and_kwargs(args: TrainingArguments, model: PreTrainedModel | None = None) -> tuple[Any, Any]:
    """
    Resolve the optimizer class and its keyword arguments from the training arguments.

    Args:
        args (`transformers.training_args.TrainingArguments`):
            The training arguments for the training session.
        model (`PreTrainedModel`, *optional*):
            The model being trained. Required for some optimizers (GaLore, Apollo, LOMO).

    Returns:
        A tuple containing the optimizer class and a dictionary of optimizer keyword arguments.

    Raises:
        ValueError: If no handler is registered for `args.optim`.
    """
    base_kwargs = {"lr": args.learning_rate}
    adam_defaults = {
        "betas": (args.adam_beta1, args.adam_beta2),
        "eps": args.adam_epsilon,
    }
    context = OptimizerContext(
        args=args,
        model=model,
        optimizer_kwargs=base_kwargs,
        adam_kwargs=adam_defaults,
        optim_args=_parse_optim_args(args.optim_args),
    )

    # One handler per supported `args.optim` value; each handler returns the (class, kwargs) pair.
    build = _OPTIMIZER_HANDLERS.get(args.optim)
    if build is not None:
        return build(context)
    raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}")
def get_decay_parameter_names(self, model: nn.Module) -> list[str]:
    """
    Get all parameter names that weight decay will be applied to.

    `get_parameter_names` is called with two kinds of exclusions:
    1. By layer type (`nn.LayerNorm`)
    2. By parameter name patterns (containing 'bias', or variation of 'norm')
    """
    excluded_layer_types = [nn.LayerNorm]
    excluded_name_patterns = [
        r"bias",
        r"layernorm",
        r"rmsnorm",
        r"(?:^|\.)norm(?:$|\.)",
        r"_norm(?:$|\.)",
    ]
    return get_parameter_names(model, excluded_layer_types, excluded_name_patterns)
def _get_learning_rate(self) -> float:
    """
    Return the current learning rate.

    The value comes from the scheduler's `get_last_lr`, except for `ReduceLROnPlateau` / `GreedyLR` schedulers
    where it is read from the optimizer's first param group. Under DeepSpeed, a scheduler that has not stepped
    yet yields `0` instead of an error.
    """
    if not self.is_deepspeed_enabled:
        optimizer_driven = (torch.optim.lr_scheduler.ReduceLROnPlateau, GreedyLR)
        if isinstance(self.lr_scheduler, optimizer_driven):
            current_lr = self.optimizer.param_groups[0]["lr"]
        else:
            current_lr = self.lr_scheduler.get_last_lr()[0]
    else:
        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
        # not run for the first few dozen steps while loss scale is too large, and thus during
        # that time `get_last_lr` will fail if called during that warm up stage, so work around it:
        try:
            current_lr = self.lr_scheduler.get_last_lr()[0]
        except AssertionError as err:
            if "need to call step" not in str(err):
                raise
            logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0")
            current_lr = 0

    return current_lr.item() if torch.is_tensor(current_lr) else current_lr
# ---- Training ----
def train(
    self,
    resume_from_checkpoint: str | bool | None = None,
    trial: "optuna.Trial | dict[str, Any] | None" = None,
    ignore_keys_for_eval: list[str] | None = None,
) -> TrainOutput:
    """
    Main training entry point.

    Prepares the model (optional re-init, device placement, gradient checkpointing, special-token alignment,
    NEFTune hooks), resolves the checkpoint to resume from, then hands over to `_inner_training_loop`.

    Args:
        resume_from_checkpoint (`str` or `bool`, *optional*):
            If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
            `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
            of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
        trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
            The trial run or the hyperparameter dictionary for hyperparameter search.
        ignore_keys_for_eval (`list[str]`, *optional*)
            A list of keys in the output of your model (if it is a dictionary) that should be ignored when
            gathering predictions for evaluation during the training.

    Returns:
        [`~trainer_utils.TrainOutput`]: Object containing the global step count, training loss, and metrics.

    Raises:
        ValueError: If `resume_from_checkpoint=True` but no checkpoint is found in `args.output_dir`, or if the
            `underflow_overflow` debug option is used with more than one GPU.
    """
    # `False` means "do not resume"; normalize it so the rest of the method only deals with None/str/True.
    if resume_from_checkpoint is False:
        resume_from_checkpoint = None
    # memory metrics - must set up as early as possible
    self._memory_tracker.start()
    args = self.args
    self.is_in_train = True
    # Model re-init
    if self.model_init is not None:
        # Seed must be set before instantiating the model when using model_init.
        enable_full_determinism(args.seed) if args.full_determinism else set_seed(args.seed)
        self.model = self.call_model_init(trial)
        # Reinitializes optimizer and scheduler
        self.optimizer, self.lr_scheduler = None, None
        if self.place_model_on_device:
            self._move_model_to_device(self.model, args.device)
        # The freshly created model is not wrapped yet.
        self.model_wrapped = self.model
    if self.args.use_liger_kernel:
        apply_liger_kernel(self.model, self.args.liger_kernel_config)
    # When fp16/bf16 full eval is enabled, __init__ skips device placement so that
    # evaluation_loop can cast dtype and move in one step. Move the model now for training.
    if (args.fp16_full_eval or args.bf16_full_eval) and not self.is_model_parallel and self.model_init is None:
        self._move_model_to_device(self.model, args.device)
    # Activate gradient checkpointing if needed
    if args.gradient_checkpointing:
        self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs)
    # If the model uses a tokenizer, it may have a new tokens for fine-tuning purposes.
    if isinstance(self.processing_class, (PreTrainedTokenizerBase, ProcessorMixin)) and hasattr(
        self.model, "config"
    ):
        align_special_tokens(self.model, self.processing_class)
    # Attach NEFTune hooks if necessary
    if self.neftune_noise_alpha is not None:
        self.neftune_hook_handle = activate_neftune(self.model, self.neftune_noise_alpha, self.accelerator)
    # This might change the seed so needs to run first.
    self._hp_search_setup(trial)
    if DebugOption.UNDERFLOW_OVERFLOW in args.debug:
        if args.n_gpu > 1:
            # nn.DataParallel(model) replicates the model, creating new variables and module
            # references registered here no longer work on other gpus, breaking the module
            raise ValueError(
                "Currently --debug underflow_overflow is not supported under DP. Please use DDP with torchrun"
            )
        else:
            DebugUnderflowOverflow(self.model)
    # Load potential model checkpoint
    # `True` means "use the last checkpoint found in the output directory".
    if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
        resume_from_checkpoint = get_last_checkpoint(args.output_dir)
        if resume_from_checkpoint is None:
            raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")
    if resume_from_checkpoint is not None:
        # Load model checkpoint before accelerator.prepare() for regular models,
        # so that buffers and parameters are on the right device after prepare.
        # Deepspeed/FSDP models are loaded after prepare in _prepare_for_training.
        if not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled and not self.is_fsdp_enabled:
            self._load_from_checkpoint(resume_from_checkpoint)
        state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
        if state.train_batch_size is not None and args.auto_find_batch_size:
            # Only restore the checkpoint's train_batch_size when using auto_find_batch_size,
            self._train_batch_size = state.train_batch_size
    # Wrap the inner loop with `find_executable_batch_size`, which receives the starting batch size and the
    # `auto_find_batch_size` flag.
    inner_training_loop = find_executable_batch_size(
        self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
    )
    # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
    ctx = suppress_progress_bars() if args.push_to_hub else contextlib.nullcontext()
    with ctx:
        return inner_training_loop(
            args=args,
            resume_from_checkpoint=resume_from_checkpoint,
            trial=trial,
            ignore_keys_for_eval=ignore_keys_for_eval,
        )
def _inner_training_loop(
    self,
    batch_size: int | None = None,
    args: TrainingArguments | None = None,
    resume_from_checkpoint: str | None = None,
    trial: "optuna.Trial | dict[str, Any] | None" = None,
    ignore_keys_for_eval: list[str] | None = None,
) -> TrainOutput:
    """
    Run the actual training loop: forward, backward, optimizer step, logging, and checkpointing.

    Builds the train dataloader, derives the step/epoch counts, initializes (or restores) the trainer state,
    prepares model/optimizer/scheduler, then runs `_run_epoch` for every remaining epoch.

    Args:
        batch_size: Batch size to use; only applied when `args.auto_find_batch_size` is set.
        args: Training arguments. Although it defaults to `None`, it is dereferenced immediately, so callers
            must pass it.
        resume_from_checkpoint: Path of the checkpoint to resume from, if any.
        trial: Hyperparameter-search trial or hyperparameter dictionary, if any.
        ignore_keys_for_eval: Model output keys to ignore when gathering predictions during evaluation.

    Returns:
        The result of `_finalize_training`.
    """
    # reset everything
    self.accelerator.free_memory()
    if args.auto_find_batch_size:
        self._update_auto_batch_size(batch_size)
    # Data loader and number of training steps
    train_dataloader = self.get_train_dataloader()
    if self.is_fsdp_xla_v2_enabled:
        train_dataloader = tpu_spmd_dataloader(train_dataloader)
    # Setting up training control variables:
    (
        num_train_epochs,
        num_update_steps_per_epoch,
        num_examples,
        num_train_samples,
        total_train_batch_size,
        steps_in_epoch,
        max_steps,
    ) = self.set_initial_training_values(args, train_dataloader)
    # Fresh TrainerState, or the one stored in the checkpoint when resuming.
    epochs_trained, steps_trained_in_current_epoch = self._init_training_state(
        max_steps, num_update_steps_per_epoch, num_train_epochs, resume_from_checkpoint, trial
    )
    model, train_dataloader = self._prepare_for_training(max_steps, train_dataloader, resume_from_checkpoint)
    # Train!
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {num_examples:,}")
    logger.info(f" Num Epochs = {num_train_epochs:,}")
    logger.info(f" Num update steps per epoch = {num_update_steps_per_epoch:,}")
    logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
    if self.args.per_device_train_batch_size != self._train_batch_size:
        logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {max_steps:,}")
    logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}")
    if resume_from_checkpoint is not None:
        logger.info(
            f" Resuming training from checkpoint with epoch {epochs_trained} and global step {self.state.global_step}"
        )
        if not self.args.ignore_data_skip:
            logger.info(
                f" Fast-forwarding the dataloader past {epochs_trained} epochs and"
                f" {steps_trained_in_current_epoch} batches to resume from the exact training state."
            )
    start_time = time.time()
    # needed to calculate tokens/s
    self._initial_num_input_tokens_seen = self.state.num_input_tokens_seen
    # Logging state: _tr_loss accumulates on-device between logging steps (avoiding costly .item() syncs
    # on TPUs), then gets drained into _total_loss_scalar at each logging step.
    self._tr_loss = torch.tensor(0.0, device=args.device)
    self._total_loss_scalar = 0.0
    self._globalstep_last_logged = self.state.global_step
    model.zero_grad()
    self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
    # Optional evaluation before the first training step.
    if args.eval_on_start:
        self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True)
    # Start at `epochs_trained` so that already completed epochs are not repeated when resuming.
    for epoch in range(epochs_trained, num_train_epochs):
        self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
        self._run_epoch(
            model=model,
            epoch=epoch,
            train_dataloader=train_dataloader,
            steps_in_epoch=steps_in_epoch,
            num_update_steps_per_epoch=num_update_steps_per_epoch,
            trial=trial,
            ignore_keys_for_eval=ignore_keys_for_eval,
            start_time=start_time,
            resume_from_checkpoint=resume_from_checkpoint,
            epochs_trained=epochs_trained,
            steps_trained_in_current_epoch=steps_trained_in_current_epoch,
        )
        # Stop as soon as the control object requests it.
        if self.control.should_training_stop:
            break
    return self._finalize_training(trial, num_train_samples, start_time)
def _init_training_state(
    self, max_steps, num_update_steps_per_epoch, num_train_epochs, resume_from_checkpoint, trial
) -> tuple[int, int]:
    """
    Create `self.state`, restoring it from the checkpoint's trainer state file when resuming.

    Returns:
        `(epochs_trained, steps_trained_in_current_epoch)`. Both are `0` for a fresh run. When resuming, the second
        value counts batches (update steps times gradient accumulation steps) and stays `0` if
        `args.ignore_data_skip` is set.
    """
    exportable_callbacks = [
        cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
    ]
    self.state = TrainerState(stateful_callbacks=exportable_callbacks)
    self.state.is_hyper_param_search = trial is not None
    self.state.train_batch_size = self._train_batch_size
    self.state.compute_steps(self.args, max_steps)

    epochs_trained, steps_trained_in_current_epoch = 0, 0

    state_file = None
    if resume_from_checkpoint is not None:
        state_file = os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)

    if state_file is not None and os.path.isfile(state_file):
        # The checkpointed state replaces the fresh one created above.
        self.state = TrainerState.load_from_json(state_file)
        compare_trainer_and_checkpoint_args(self.args, self.state)
        self._load_callback_state()

        global_step = self.state.global_step
        epochs_trained = int(global_step // num_update_steps_per_epoch)
        if not self.args.ignore_data_skip:
            # Update steps already done in the current epoch, converted to a number of batches.
            update_steps_done = global_step % num_update_steps_per_epoch
            steps_trained_in_current_epoch = update_steps_done * self.args.gradient_accumulation_steps

    self.state.init_training_references(self, max_steps, num_train_epochs, trial)
    return epochs_trained, steps_trained_in_current_epoch
def _prepare_for_training(self, max_steps, train_dataloader, resume_from_checkpoint):
    """
    Wrap model, create optimizer and scheduler, and run accelerator.prepare.

    Also loads the DeepSpeed / SageMaker MP / FSDP model checkpoint plus the optimizer, scheduler and scaler
    states when resuming, and refreshes the references held by the callback handler.

    Args:
        max_steps: Total number of training steps, used to build the scheduler.
        train_dataloader: The training dataloader; may be replaced by a sequence-parallel adapter.
        resume_from_checkpoint: Path of the checkpoint to resume from, or `None`.

    Returns:
        `(model, train_dataloader)`: the (possibly wrapped) model to train and the dataloader to iterate.
    """
    # In these setups the optimizer is created after the model has been prepared.
    delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
    # Can't delay optimizer creation when using FSDP2: https://github.com/huggingface/accelerate/blob/3f636d626063ffcf9a337c7d3624d61b7d187d59/src/accelerate/accelerator.py#L1404
    is_fsdp2 = self.is_fsdp_enabled and (getattr(self.accelerator.state.fsdp_plugin, "fsdp_version", 1) == 2)
    if is_fsdp2:
        delay_optimizer_creation = False
    # We need to reset the scheduler, as its parameters may be different on subsequent calls
    if self._created_lr_scheduler:
        self.lr_scheduler = None
        self._created_lr_scheduler = False
    if self.is_deepspeed_enabled:
        self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)
    if not delay_optimizer_creation:
        self.create_optimizer()
    # Pass `self.model_wrapped` so that `_wrap_model` can detect if the model is already
    # wrapped (e.g. in DataParallel) on subsequent `train()` calls and avoid double wrapping.
    model = self._wrap_model(self.model_wrapped)
    # If the model is wrapped, don't use `accelerator.prepare`
    # this is for unhandled cases in accelerate such as FSDP-XLA, SageMaker MP/DP, DataParallel
    use_accelerator_prepare = model is self.model
    # prepare using `accelerator` prepare
    if use_accelerator_prepare:
        if delay_optimizer_creation:
            # TODO: check if we can move this somewhere else
            if self.is_fsdp_enabled and _is_peft_model(self.model):
                update_fsdp_plugin_peft(self.model, self.accelerator)
            # we only prepare the model as we don't have an optimizer
            model = self.accelerator.prepare(self.model)
            # using the model we prepared to create the optimizer
            self.create_optimizer(model)
            self.optimizer = self.accelerator.prepare(self.optimizer)
        elif self.is_deepspeed_enabled and type(self.lr_scheduler).__name__ == "DummyScheduler":
            # With a DeepSpeed `DummyScheduler`, the scheduler goes through `prepare` together with the
            # model and optimizer.
            model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
                self.model, self.optimizer, self.lr_scheduler
            )
        else:
            model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
    else:
        # The model was wrapped by `_wrap_model`; only the optimizer goes through `prepare`.
        self.optimizer = self.accelerator.prepare(self.optimizer)
    # Create scheduler now that the optimizer won't change anymore
    self.create_scheduler(num_training_steps=max_steps)
    # updating self.model_wrapped
    self.model_wrapped = model
    if self.is_fsdp_enabled or self.is_fsdp_xla_enabled:
        # breaking convention for FSDP model
        # TODO: check if this is really needed
        self.model = self.model_wrapped = model
    # backward compatibility
    # TODO: check if we really need this
    if self.is_deepspeed_enabled:
        self.deepspeed = self.model_wrapped
    # Important: at this point:
    # self.model is the Transformers Model except when we are using FSDP
    # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model),
    # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc.
    if self.is_fsdp_enabled:
        # Fix `got mixed torch.Tensor and DTensor` error in model.generate() for FSDP2 with LoRA
        if hasattr(self.model, "generate"):
            dist.fsdp.register_fsdp_forward_method(self.model, "generate")
    # since DataLoader was Accelerate prepared w/o a model arg in the same call, we now have to complete the DL wrapping for ALST/UlyssesSP, after model has been prepared
    pc = getattr(self.accelerator, "parallelism_config", None)
    if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_enabled:
        train_dataloader = self.accelerator.deepspeed_ulysses_dl_adapter(train_dataloader, model)
    # load checkpoint
    if resume_from_checkpoint is not None:
        # Only DeepSpeed, SageMaker MP and FSDP models are loaded here, after `prepare`.
        if self.is_deepspeed_enabled:
            deepspeed_load_checkpoint(
                self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model)
            )
        elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled:
            self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped)
        self._load_optimizer_and_scheduler(resume_from_checkpoint)
        self._load_scaler(resume_from_checkpoint)
    # Update the references for the callback_handler
    for attr in ("model", "optimizer", "lr_scheduler"):
        setattr(self.callback_handler, attr, getattr(self, attr))
    self.callback_handler.train_dataloader = train_dataloader
    return model, train_dataloader
def _run_epoch(
self,
model,
epoch,
train_dataloader,
steps_in_epoch,
num_update_steps_per_epoch,
trial,
ignore_keys_for_eval,
start_time,
resume_from_checkpoint,
epochs_trained,
steps_trained_in_current_epoch,
):
"""
Run one full pass over the dataloader.

Batches are fetched in groups of up to `args.gradient_accumulation_steps`. Every micro-batch goes through
`training_step`; on a sync step the gradients are optionally clipped, the optimizer and (for most schedulers)
the LR scheduler are stepped, callbacks are fired and `_maybe_log_save_evaluate` is invoked. Nothing is
returned: progress is recorded on `self.state`, `self.control`, `self._tr_loss` and `self.current_flos`.

Args:
model: The model handed to `training_step`, gradient clipping, `zero_grad` and `_maybe_log_save_evaluate`.
epoch: Index of the epoch being run; also used to compute the fractional `self.state.epoch`.
train_dataloader: Dataloader to iterate. Its `set_epoch` method is called when it has one.
steps_in_epoch: Number of micro-batches in the epoch; sizes the last accumulation group and marks the
final sync step.
num_update_steps_per_epoch: Upper bound of the optimizer-step loop for this epoch.
trial: Forwarded unchanged to `_maybe_log_save_evaluate`.
ignore_keys_for_eval: Forwarded unchanged to `_maybe_log_save_evaluate`.
start_time: Forwarded unchanged to `_maybe_log_save_evaluate`.
resume_from_checkpoint: Checkpoint being resumed from, or `None`.
epochs_trained: Epoch index at which training resumes; resumption handling only runs when `epoch`
equals it.
steps_trained_in_current_epoch: Number of micro-batches already trained in the resumed epoch.
"""
# `step` counts micro-batches inside this epoch; it stays at -1 until a batch has been processed.
step = -1
grad_norm = None
learning_rate = None
# Set when batches are skipped on resume: the RNG state is then restored after the first fetch (see below).
rng_to_sync = False
# Handle resumption from checkpoint: skip already-trained batches in the resumed epoch
num_update_steps_trained = 0
if epoch == epochs_trained and resume_from_checkpoint is not None:
if steps_trained_in_current_epoch > 0 and not self.args.ignore_data_skip:
train_dataloader = skip_first_batches(train_dataloader, steps_trained_in_current_epoch)
# Continue the micro-batch and optimizer-step counters from where the checkpoint stopped.
step = steps_trained_in_current_epoch - 1
num_update_steps_trained = steps_trained_in_current_epoch // self.args.gradient_accumulation_steps
rng_to_sync = True
elif steps_trained_in_current_epoch == 0:
self._load_rng_state(resume_from_checkpoint)
if hasattr(train_dataloader, "set_epoch"):
train_dataloader.set_epoch(epoch)
epoch_iterator = iter(train_dataloader)
# We chunkify the epoch iterator into gradient accumulation steps `n` batches
# `remainder` is the size of the last group; a zero modulo means the last group is a full one.
remainder = steps_in_epoch % self.args.gradient_accumulation_steps
if remainder == 0:
remainder = self.args.gradient_accumulation_steps
# Outer loop: one iteration per optimizer step. Each iteration prefetches
# `gradient_accumulation_steps` batches (fewer for the last step if the epoch
# doesn't divide evenly).
for update_step in range(num_update_steps_trained, num_update_steps_per_epoch):
num_batches = (
self.args.gradient_accumulation_steps if update_step != (num_update_steps_per_epoch - 1) else remainder
)
batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, self.args.device)
# This is used to correctly scale the loss when the last accumulation step has fewer batches.
# Not used if `num_items_in_batch` is not None.
self.current_gradient_accumulation_steps = len(batch_samples)
# need to sync after if we skipped the batches in `get_batch_samples` for shuffle order reason
if rng_to_sync:
self._load_rng_state(resume_from_checkpoint)
rng_to_sync = False
# Inner loop: forward + backward for each micro-batch. Gradients are
# accumulated without syncing until the last micro-batch, then we clip,
# step the optimizer, and log/save/evaluate.
for i, inputs in enumerate(batch_samples):
step += 1
# A sync step closes an accumulation group: either a multiple of the accumulation size or the epoch's last batch.
do_sync_step = (step + 1) % self.args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch
# Since we perform prefetching, we need to manually set sync_gradients
self.accelerator.gradient_state._set_sync_gradients(do_sync_step)
# `on_step_begin` fires once per accumulation group, on its first micro-batch.
if step % self.args.gradient_accumulation_steps == 0:
self.control = self.callback_handler.on_step_begin(self.args, self.state, self.control)
# We sync the gradients in the following cases: 1. sync_each_batch set to True 2. Using deepspeed 3. when we are at the last batch sample
if (
self.accelerator.gradient_state.plugin_kwargs.get("sync_each_batch", False)
or self.accelerator.distributed_type == DistributedType.DEEPSPEED
or i == len(batch_samples) - 1
):
sync_context = contextlib.nullcontext
else:
sync_context = functools.partial(self.accelerator.no_sync, model=model)
with sync_context():
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
if (
self.args.logging_nan_inf_filter
and not is_torch_xla_available()
and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
):
# if loss is nan or inf simply add the average of previous logged losses
self._tr_loss += self._tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
else:
if self._tr_loss.device != tr_loss_step.device:
raise ValueError(
f"Calculated loss must be on the original device: {self._tr_loss.device} but device in use is {tr_loss_step.device}"
)
self._tr_loss += tr_loss_step
self.current_flos += float(self.floating_point_ops(inputs))
self._track_num_input_tokens(inputs)
if do_sync_step:
grad_norm = None
# Clipping is only applied for a positive `max_grad_norm`.
if self.args.max_grad_norm > 0:
grad_norm = self._clip_grad_norm(model)
grad_norm = self._get_grad_norm(model, grad_norm=grad_norm)
self.control = self.callback_handler.on_pre_optimizer_step(self.args, self.state, self.control)
self.optimizer.step()
self.control = self.callback_handler.on_optimizer_step(self.args, self.state, self.control)
# get learning rate before update
learning_rate = self._get_learning_rate()
if not self.accelerator.optimizer_step_was_skipped:
# Delay optimizer scheduling until metrics are generated
if not isinstance(self.lr_scheduler, (torch.optim.lr_scheduler.ReduceLROnPlateau, GreedyLR)):
self.lr_scheduler.step()
model.zero_grad()
self.state.global_step += 1
# Fractional epoch progress: completed epochs plus the share of this epoch's micro-batches done.
self.state.epoch = epoch + (step + 1) / steps_in_epoch
self.control = self.callback_handler.on_step_end(self.args, self.state, self.control)
self._maybe_log_save_evaluate(
self._tr_loss,
grad_norm,
model,
trial,
epoch,
ignore_keys_for_eval,
start_time,
learning_rate=learning_rate,
)
else:
self.control = self.callback_handler.on_substep_end(self.args, self.state, self.control)
# NOTE(review): the same stop condition is tested twice in a row; presumably the first `break` leaves the
# micro-batch loop and the second leaves the optimizer-step loop — confirm against the file's indentation.
if self.control.should_epoch_stop or self.control.should_training_stop:
break
if self.control.should_epoch_stop or self.control.should_training_stop:
break
# PyTorch/XLA relies on the dataloader to insert mark_step each iteration.
# When we break out of the loop early, we flush the pending graph manually.
if is_torch_xla_available():
xm.mark_step()
# `step` never moved past its initial -1: the iterator yielded no batch at all.
if step < 0:
logger.warning(
"There seems not to be a single sample in your epoch_iterator, stopping training at step"
f" {self.state.global_step}! This is expected if you're using an IterableDataset and set"
f" num_steps ({self.state.max_steps}) higher than the number of available samples."
)
self.control.should_training_stop = True
self.control = self.callback_handler.on_epoch_end(self.args, self.state, self.control)
# End-of-epoch pass: gives the log/evaluate/save logic a chance to run after `on_epoch_end` updated the control flags.
self._maybe_log_save_evaluate(
self._tr_loss,
grad_norm,
model,
trial,
epoch,
ignore_keys_for_eval,
start_time,
learning_rate=learning_rate,
)
def _finalize_training(self, trial, num_train_samples, start_time):
    """
    Wrap up a training run and build its `TrainOutput`.

    Folds the not-yet-logged loss into the running total, logs the final speed / FLOs / loss metrics, optionally
    reloads the best checkpoint, prunes every non-best checkpoint when `save_total_limit == 1`, fires
    `on_train_end`, waits for the current push to finish and deactivates NEFTune when it was enabled.

    Args:
        trial: Forwarded to `_get_output_dir` to locate the checkpoint directory.
        num_train_samples: Sample count reported to `speed_metrics`.
        start_time: Start timestamp reported to `speed_metrics`.

    Returns:
        `TrainOutput` built from the global step, the average training loss and the final metrics dict.
    """
    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")

    # Fold in whatever loss has accumulated since the last logging call.
    self._total_loss_scalar += self._tr_loss.item()
    # Clamp the divisor so a run with zero optimizer steps does not raise ZeroDivisionError.
    steps_for_average = max(self.state.global_step, 0.001)
    train_loss = self._total_loss_scalar / steps_for_average

    metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps)
    self.store_flos()
    metrics["total_flos"] = self.state.total_flos
    metrics["train_loss"] = train_loss
    self._memory_tracker.stop_and_update_metrics(metrics)
    self.log(metrics)

    if self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
        self._load_best_model()

    best_checkpoint = self.state.best_model_checkpoint
    checkpoints_sorted = sort_checkpoints(
        output_dir=self._get_output_dir(trial), best_model_checkpoint=best_checkpoint
    )

    # With save_total_limit=1 only the best checkpoint is kept, and only the saving process may delete.
    keep_only_best = self.args.should_save and best_checkpoint is not None and self.args.save_total_limit == 1
    if keep_only_best:
        for checkpoint in checkpoints_sorted:
            if os.path.samefile(checkpoint, best_checkpoint):
                continue
            logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
            shutil.rmtree(checkpoint, ignore_errors=True)

    self.control = self.callback_handler.on_train_end(self.args, self.state, self.control)

    # Block until the checkpoint upload in progress (if any) is done.
    self._finish_current_push()

    # NEFTune installs a forward post hook on the embedding layer; remove it to restore the original forward.
    if self.neftune_noise_alpha is not None:
        deactivate_neftune(self.model, self.neftune_hook_handle, self.accelerator)

    self.is_in_train = False
    return TrainOutput(self.state.global_step, train_loss, metrics)
def training_step(
self,
model: nn.Module,
inputs: dict[str, torch.Tensor | Any],
num_items_in_batch: torch.Tensor | int | None = None,
) -> torch.Tensor:
"""
Perform a training step on a batch of inputs.

Puts the model (and the optimizer, when it exposes a callable `train`) in training mode, computes the loss
through `compute_loss`, rescales it for gradient accumulation when needed and calls
`self.accelerator.backward`.

Subclass and override to inject custom behavior.

Args:
model (`nn.Module`):
The model to train.
inputs (`dict[str, torch.Tensor | Any]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument `labels`. Check your model's documentation for all accepted arguments.
num_items_in_batch (`torch.Tensor | int`, *optional*):
Forwarded to `compute_loss`. When it is `None` (or the model does not accept loss kwargs) and no
`compute_loss_func` is set, the loss is divided by `self.current_gradient_accumulation_steps`.

Return:
`torch.Tensor`: The tensor with training loss on this batch.
"""
# Prepare buffers for context parallelism
cp_context, inputs = self._prepare_context_parallel_inputs(model, inputs)
# Context manager is no-op if CP isn't enabled
with cp_context():
model.train()
# Some optimizers expose their own `train()` switch; call it when present.
if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
self.optimizer.train()
inputs = self._prepare_inputs(inputs)
# SageMaker model parallel returns early: `self.accelerator.backward` below is never reached on this path.
if is_sagemaker_mp_enabled():
loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
return loss_mb.reduce_mean().detach().to(self.args.device)
with self.compute_loss_context_manager():
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
# Drop the local reference to the batch before the optional cache clearing and the backward pass.
del inputs
if (
self.args.torch_empty_cache_steps is not None
and self.state.global_step % self.args.torch_empty_cache_steps == 0
):
clear_device_cache()
# Extra keyword arguments for `self.accelerator.backward`.
kwargs = {}
# For LOMO optimizers you need to explicitly use the learning rate
if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
kwargs["learning_rate"] = self._get_learning_rate()
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
# Finally we need to normalize the loss for reporting if GA loss bug is not fixed during compute loss
if (not self.model_accepts_loss_kwargs or num_items_in_batch is None) and self.compute_loss_func is None:
# If the model does not accept loss kwargs, we need to normalize the loss by the number of gradient accumulation steps
loss = loss / self.current_gradient_accumulation_steps
# Turning off loss scaling w.r.t. gradient accumulation when DeepSpeed is enabled
# https://github.com/huggingface/transformers/pull/35808
if self.accelerator.distributed_type == DistributedType.DEEPSPEED:
kwargs["scale_wrt_gas"] = False
self.accelerator.backward(loss, **kwargs)
# Detached so the returned value can be accumulated for logging without keeping the graph alive.
return loss.detach()
def compute_loss(
self,
model: nn.Module,
inputs: dict[str, torch.Tensor | Any],
return_outputs: bool = False,
num_items_in_batch: torch.Tensor | int | None = None,
) -> torch.Tensor | tuple[torch.Tensor, Any]:
"""
How the loss is computed by Trainer. By default, all models return the loss in the first element.

Three sources of loss are supported, in this order of precedence: a user supplied `compute_loss_func`, the
label smoother (when labels were popped from the inputs), and finally the loss returned by the model itself.

Subclass and override for custom behavior. If you are not using `num_items_in_batch` when computing your loss,
make sure to overwrite `self.model_accepts_loss_kwargs` to `False`. Otherwise, the loss calculation might be
slightly inaccurate when performing gradient accumulation.

Args:
model (`nn.Module`):
The model to compute the loss for.
inputs (`dict[str, torch.Tensor | Any]`):
The input data for the model. `labels` is popped from it when a label smoother or a
`compute_loss_func` is configured.
return_outputs (`bool`, *optional*, defaults to `False`):
Whether to return the model outputs along with the loss.
num_items_in_batch (Optional[torch.Tensor], *optional*):
The number of items in the batch. If not passed, the loss is computed
using the default batch size reduction logic.

Returns:
The loss of the model along with its output if return_outputs was set to True

Raises:
ValueError: If the loss has to come from the model but the returned dict has no `loss` key.
"""
# DeepSpeed sequence parallelism has its own loss path, used only while the model is in training mode.
pc = getattr(self.accelerator, "parallelism_config", None)
if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_enabled and self.model.training:
return deepspeed_sp_compute_loss(self.accelerator, model, inputs, return_outputs, pc)
# Labels are withheld from the forward call only when the loss will be computed outside the model.
if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
labels = inputs.pop("labels")
else:
labels = None
if self.model_accepts_loss_kwargs:
kwargs = {}
if num_items_in_batch is not None:
kwargs["num_items_in_batch"] = num_items_in_batch
inputs = {**inputs, **kwargs}
outputs = model(**inputs)
# User-defined compute_loss function
if self.compute_loss_func is not None:
if labels is None:
logger.warning(
"Trainer: `compute_loss_func` is defined but `labels=None`. "
"Your custom loss function will still be called with labels=None. "
)
loss = self.compute_loss_func(
outputs,
labels,
num_items_in_batch=num_items_in_batch,
)
# Default HF loss handling (label smoothing) if no custom loss function
elif labels is not None:
unwrapped_model = self.accelerator.unwrap_model(model)
# For PEFT models the class name of the wrapped base model is the one looked up below.
model_name = (
unwrapped_model.base_model.model._get_name()
if _is_peft_model(unwrapped_model)
else unwrapped_model._get_name()
)
# Causal LM architectures get `shift_labels=True`; every other architecture uses the smoother's default.
if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
loss = self.label_smoother(outputs, labels, shift_labels=True)
else:
loss = self.label_smoother(outputs, labels)
else:
if isinstance(outputs, dict) and "loss" not in outputs:
raise ValueError(
"The model did not return a loss from the inputs, only the following keys: "
f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
)
# We don't use .loss here since the model may return tuples instead of ModelOutput.
loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
# Scale the loss by the process count (or by `n_gpu` when more than one GPU is driven by this process).
# NOTE(review): presumably compensates for gradient averaging across devices when token counts were
# gathered globally — confirm.
if (
self.args.average_tokens_across_devices
and (self.model_accepts_loss_kwargs or self.compute_loss_func)
and num_items_in_batch is not None
):
loss *= self.accelerator.num_processes if self.args.n_gpu <= 1 else self.args.n_gpu
return (loss, outputs) if return_outputs else loss
def compute_loss_context_manager(self) -> contextlib.ExitStack:
    """
    Bundle the context managers that surround the loss computation into a single `ExitStack`.

    The autocast manager is entered on the stack unless it is a plain `contextlib.nullcontext`, in which case the
    returned stack is empty.
    """
    stack = contextlib.ExitStack()
    autocast = self.autocast_smart_context_manager()
    if isinstance(autocast, contextlib.nullcontext):
        # Nothing worth registering: hand back the empty stack.
        return stack
    stack.enter_context(autocast)
    return stack
def autocast_smart_context_manager(self, cache_enabled: bool | None = True) -> contextlib.AbstractContextManager:
"""
A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
arguments, depending on the situation. We rely on accelerate for autocast, hence we do nothing here.
"""
return contextlib.nullcontext()
def _maybe_log_save_evaluate(
self,
tr_loss: torch.Tensor,
grad_norm: torch.Tensor | float | None,
model: nn.Module,
trial: "optuna.Trial | dict[str, Any] | None",
epoch: float,
ignore_keys_for_eval: list[str] | None,
start_time: float,
learning_rate: float | None = None,
) -> None:
"""
Log metrics, run evaluation, and save checkpoints if the current training state requires it.

Each action is gated by a flag on `self.control` (`should_log`, `should_evaluate`, `should_save`).

Args:
tr_loss: Loss accumulated since the last log. It is reset to zero **in place** once logged.
grad_norm: Latest gradient norm, logged as `grad_norm` when not `None`.
model: Model handed to `_save_checkpoint`.
trial: Forwarded to `_evaluate`, `_determine_best_metric` and `_save_checkpoint`.
epoch: Not read by this method.
ignore_keys_for_eval: Forwarded to `_evaluate`.
start_time: Forwarded to `self.log`.
learning_rate: Learning rate to log; when `None`, `_get_learning_rate()` is queried instead.
"""
# Only log when at least one optimizer step happened since the previous log (also avoids a zero divisor below).
if self.control.should_log and self.state.global_step > self._globalstep_last_logged:
if is_torch_xla_available():
xm.mark_step()
logs: dict[str, float] = {}
# all_gather + mean() to get average loss over all processes
tr_loss_scalar = nested_gather(tr_loss, self.args.parallel_mode).mean().item()
# reset tr_loss to zero
# (in-place subtraction, so the tensor object passed in by the caller is the one that gets zeroed)
tr_loss -= tr_loss
# Average loss per optimizer step over the logging window.
logs["loss"] = tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged)
if grad_norm is not None:
logs["grad_norm"] = grad_norm.item() if isinstance(grad_norm, torch.Tensor) else grad_norm
if learning_rate is not None:
logs["learning_rate"] = learning_rate
else:
logs["learning_rate"] = self._get_learning_rate()
self._total_loss_scalar += tr_loss_scalar
self._globalstep_last_logged = self.state.global_step
self.store_flos()
self.log(logs, start_time)
metrics = None
if self.control.should_evaluate:
metrics = self._evaluate(trial, ignore_keys_for_eval)
is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial)
# With the BEST save strategy, whether to save is decided by the evaluation result.
if self.args.save_strategy == SaveStrategy.BEST:
self.control.should_save = is_new_best_metric
if self.control.should_save:
self._save_checkpoint(model, trial)
self.control = self.callback_handler.on_save(self.args, self.state, self.control)
# ---- Training Utilities ----
def get_batch_samples(
self, epoch_iterator: Iterator, num_batches: int, device: torch.device
) -> tuple[list, torch.Tensor | int | None]:
"""
Collects a specified number of batches from the epoch iterator and optionally counts the number of items in the batches to properly scale the loss.
"""
batch_samples = []
for _ in range(num_batches):
try:
batch_samples.append(next(epoch_iterator))
except StopIteration:
break
num_items_in_batch = self._get_num_items_in_batch(batch_samples, device)
return batch_samples, num_items_in_batch
def _get_num_items_in_batch(self, batch_samples: list, device: torch.device) -> torch.Tensor | int | None:
"""
Counts the number of items in the batches to properly scale the loss.

An item is a label position whose value differs from -100. Counting only happens when the first batch has a
`labels` entry and the count will actually be consumed (the model accepts loss kwargs or a
`compute_loss_func` is set).

Args:
batch_samples (`list`): List of batches
device (`torch.device`): The device on which the number of items in the batch should be.
Returns:
None if the number of items in the batch doesn't need to be computed else the number of items in the batch
"""
num_items_in_batch = None
count_num_items_in_batch = (
len(batch_samples) > 0
and "labels" in batch_samples[0]
and (
# num_items_in_batch is passed to model forward
# https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3757
self.model_accepts_loss_kwargs
# num_items_in_batch is passed to compute_loss_func
# https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3773
or self.compute_loss_func is not None
# num_items_in_batch is also verified if (self.model_accepts_loss_kwargs or self.compute_loss_func)
# https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3790
)
)
if count_num_items_in_batch:
# For now we don't support object detection
try:
# -100 is the label value excluded from the count.
num_items_in_batch = sum((batch["labels"].ne(-100)).sum() for batch in batch_samples)
except (TypeError, AttributeError):
# Best effort: labels that do not support `.ne(...)`/`.sum()` leave the count at None.
pass
if num_items_in_batch is not None:
if self.args.average_tokens_across_devices:
# Sum the per-process counts so that every process normalizes by the global number of items.
if self.args.world_size > 1:
num_items_in_batch = self.accelerator.gather(num_items_in_batch.to(device)).sum()
elif self.args.n_gpu > 1:
# In DP case, if we don't average, we need to divide by the number of gpu. This is the simplest approximation.
# Otherwise, we would have to scatter labels and calculate num_items_in_batch for each gpu.
num_items_in_batch = num_items_in_batch // self.args.n_gpu
if torch.is_tensor(num_items_in_batch):
num_items_in_batch = num_items_in_batch.to(device)
if self.args.n_gpu > 1 and num_items_in_batch.dim() == 0:
# In the DataParallel case, convert the scalar tensor into a 2-dim tensor with the same value repeated
num_items_in_batch = num_items_in_batch.unsqueeze(0).expand(self.args.n_gpu, -1)
# Divide by number of devices with the same batch
if pc := getattr(self.accelerator, "parallelism_config", None):
num_items_in_batch = num_items_in_batch // pc.non_data_parallel_size
return num_items_in_batch
def _prepare_input(self, data: torch.Tensor | Any) -> torch.Tensor | Any:
"""
Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
"""
if isinstance(data, Mapping):
return type(data)({k: self._prepare_input(v) for k, v in data.items()})
elif isinstance(data, (tuple, list)):
return type(data)(self._prepare_input(v) for v in data)
elif isinstance(data, torch.Tensor):
kwargs = {"device": self.args.device}
if self.is_deepspeed_enabled and (torch.is_floating_point(data) or torch.is_complex(data)):
# NLP models inputs are int/uint and those get adjusted to the right dtype of the
# embedding. Other models such as wav2vec2's inputs are already float and thus
# may need special handling to match the dtypes of the model
kwargs.update({"dtype": self.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()})
return data.to(**kwargs)
return data
def _prepare_inputs(self, inputs: dict[str, torch.Tensor | Any]) -> dict[str, torch.Tensor | Any]:
"""
Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
handling potential state.
"""
inputs = self._prepare_input(inputs)
if len(inputs) == 0:
raise ValueError(
"The batch received was empty, your model won't be able to train on it. Double-check that your "
f"training dataset contains keys expected by the model: {','.join(self._signature_columns)}."
)
return inputs
def _prepare_context_parallel_inputs(
self, model: nn.Module, inputs: dict[str, torch.Tensor | Any]
) -> tuple[Callable, dict[str, torch.Tensor | Any]]:
"""
Prepare inputs for context parallelism by setting up buffers and validation.

When context parallelism is enabled on the accelerator's `parallelism_config`, `inputs` may be extended in
place with `shift_labels` and `position_ids`, and the tensors to be split along the sequence dimension are
collected as buffers for `accelerator.maybe_context_parallel`.

Args:
model: The model being trained
inputs: Input tensors to prepare

Returns:
tuple: (context_manager, prepared_inputs) where context_manager is either
the context parallelism wrapper or a no-op context. In both cases it is a callable that must be
called to obtain the actual context manager.

Raises:
ValueError: If the model's attention implementation is not SDPA, or if the attention mask is not causal.
"""
if (
getattr(self.accelerator, "parallelism_config", None) is not None
and self.accelerator.parallelism_config.cp_enabled
):
if self.accelerator.parallelism_config.cp_backend == "torch":
if hasattr(model, "config"):
if model.config._attn_implementation != "sdpa":
raise ValueError(
f"Context parallelism is supported only with SDPA attention, you are using {model.config._attn_implementation}."
)
if "shift_labels" not in inputs:
logger.warning_once("Shift labels not found in the inputs, shifting manually")
if "labels" in inputs:
_ignore_index = -100
# Pad one ignore-index slot on the right, then drop the first column: every label moves one position left.
labels = nn.functional.pad(inputs["labels"], (0, 1), value=_ignore_index)
inputs["shift_labels"] = labels[:, 1:].contiguous()
# note: we don't do anything for accelerator.parallelism_config.sp_backend == "deepspeed" since:
# - accelerator.parallelism_config performs the `model.config._attn_implementation` checks already and it supports more than `sdpa`
# - UlyssesSPDataLoaderAdapter called from Accelerate performs the `shift_label` creation - must not interfere
# - position_ids generation should be done by HF Trainer if it wasn't done by the user
# NOTE(review): `inputs["input_ids"]` is indexed unconditionally here although the buffer collection below
# guards it with `"input_ids" in inputs` — confirm that `input_ids` is always present on this path.
if "position_ids" not in inputs:
logger.warning_once("Position IDs not found in the inputs, generating manually")
inputs["position_ids"] = torch.arange(
inputs["input_ids"].size(1), device=inputs["input_ids"].device
).expand(inputs["input_ids"].size(0), -1)
# Parallel lists: each buffer and the dimension along which it is split.
buffers = []
buffer_seq_dims = []
if "input_ids" in inputs:
buffers.append(inputs["input_ids"])
buffer_seq_dims.append(1) # Sequence dimension
if "labels" in inputs:
buffers.append(inputs["labels"])
buffer_seq_dims.append(1)
if "shift_labels" in inputs:
buffers.append(inputs["shift_labels"])
buffer_seq_dims.append(1)
# Add attention_mask to buffers for context parallel splitting (only if causal)
if "attention_mask" in inputs:
# Only validate causal mask once for performance
if not getattr(self, "_attn_mask_causal_checked", False):
# Context parallel currently doesn't support other masks than causal
# Accelerate applies hooks to replace mask with is_causal arg in SDPA
# Check if the mask is really causal and if not throw an error
attention_mask = inputs["attention_mask"]
if not is_attention_mask_causal(attention_mask):
raise ValueError(
"Context parallelism only supports causal attention masks. "
"The provided attention_mask is not causal. "
"Please ensure your data uses causal masking (lower triangular) "
"or remove the attention_mask to use the model's default causal masking."
)
self._attn_mask_causal_checked = True
if self._attn_mask_causal_checked:
# Add to buffers only after validation (or if validation already passed)
attention_mask = inputs["attention_mask"]
if attention_mask.dim() == 2:
buffers.append(attention_mask)
buffer_seq_dims.append(1)
else:
# Other dimensionality; keep as-is without sharding to avoid incorrect splits
pass
# Include position_ids in context parallelism splitting
if "position_ids" in inputs and inputs["position_ids"] is not None:
buffers.append(inputs["position_ids"])
buffer_seq_dims.append(1)
# Return a callable (not an entered context) so the caller decides when to enter it.
return partial(
self.accelerator.maybe_context_parallel,
buffers=buffers,
buffer_seq_dims=buffer_seq_dims,
no_restore_buffers=set(buffers),
), inputs
# Context parallelism disabled: `contextlib.nullcontext` is returned uncalled, mirroring the partial above.
return contextlib.nullcontext, inputs
def set_initial_training_values(
    self, args: TrainingArguments, dataloader: DataLoader
) -> tuple[int, int, int, int, int, int | None, int]:
    """
    Derive the epoch / step / sample bookkeeping for a training run.

    Returns a tuple holding, in order:
        - `num_train_epochs`
        - `num_update_steps_per_epoch`
        - `num_examples`
        - `num_train_samples`
        - `total_train_batch_size`
        - `steps_in_epoch` (total batches per epoch)
        - `max_steps`

    Raises:
        ValueError: If the dataloader has no usable length and `args.max_steps` is not positive.
    """
    grad_accum_steps = args.gradient_accumulation_steps

    # A negative `max_steps` means the schedule is driven by the number of epochs instead.
    max_steps = args.max_steps
    epoch_based = max_steps < 0

    len_dataloader = len(dataloader) if has_length(dataloader) else None
    total_train_batch_size = self.get_total_train_batch_size(args)

    # The Sequence Parallelism (SP) dataloader adapter multiplies the number of batches.
    sp_size = self.get_sp_size()
    if sp_size > 1 and len_dataloader is not None:
        len_dataloader *= sp_size

    if len_dataloader is not None:
        # Ceiling division, with a floor of one optimizer step per epoch.
        full_groups, leftover = divmod(len_dataloader, grad_accum_steps)
        num_update_steps_per_epoch = max(full_groups + int(leftover > 0), 1)
        if epoch_based:
            # Length known and epoch-driven: extrapolate the total number of steps.
            max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)

    # Work out `num_examples`, `num_train_epochs` and `num_train_samples`.
    if len_dataloader:
        num_examples = self.num_examples(dataloader)
        if args.max_steps > 0:
            whole_epochs, extra_steps = divmod(max_steps, num_update_steps_per_epoch)
            num_train_epochs = whole_epochs + int(extra_steps > 0)
            # May be slightly off when the last batch of the dataloader is smaller, but it is the best estimate.
            num_train_samples = max_steps * total_train_batch_size
        else:
            num_train_epochs = math.ceil(args.num_train_epochs)
            num_train_samples = self.num_examples(dataloader) * args.num_train_epochs
    elif args.max_steps > 0:
        # No working length: rely on max_steps and loop over the iterator as many times as necessary.
        num_train_epochs = sys.maxsize
        num_update_steps_per_epoch = max_steps
        num_examples = total_train_batch_size * args.max_steps
        num_train_samples = args.max_steps * total_train_batch_size
    else:
        raise ValueError(
            "args.max_steps must be set to a positive value if dataloader does not have a length, was"
            f" {args.max_steps}"
        )

    if len_dataloader is not None:
        steps_in_epoch = len_dataloader
    else:
        steps_in_epoch = max_steps * grad_accum_steps

    return (
        num_train_epochs,
        num_update_steps_per_epoch,
        num_examples,
        num_train_samples,
        total_train_batch_size,
        steps_in_epoch,
        max_steps,
    )
def get_total_train_batch_size(self, args: TrainingArguments) -> int:
    """Return the total batch size per optimizer step: micro_batch * grad_accum * dp_world_size.

    Only the data-parallel share of the world size multiplies the batch, so the other parallelism degrees are
    divided out first:

        dp_world_size = world_size // tp_size // cp_size // sp_size

    - TP (Tensor Parallelism): Model layers split across GPUs
    - CP (Context Parallelism): Sequences split using Ring Attention (FSDP2)
    - SP (Sequence Parallelism): Sequences split using ALST/Ulysses (DeepSpeed)

    All dimensions are separate and multiplicative: world_size = dp_size * tp_size * cp_size * sp_size
    """
    dp_world_size = args.world_size
    dp_world_size //= self.get_tp_size()
    dp_world_size //= self.get_cp_size()
    dp_world_size //= self.get_sp_size()
    batch_per_replica = self._train_batch_size * args.gradient_accumulation_steps
    return batch_per_replica * dp_world_size
def get_sp_size(self) -> int:
    """Return the sequence parallel size, or 1 when the accelerator has no parallelism config."""
    parallelism = getattr(self.accelerator, "parallelism_config", None)
    return 1 if parallelism is None else parallelism.sp_size
def get_cp_size(self) -> int:
    """Return the context parallel size, or 1 when the accelerator has no parallelism config."""
    parallelism = getattr(self.accelerator, "parallelism_config", None)
    return 1 if parallelism is None else parallelism.cp_size
def get_tp_size(self) -> int:
    """
    Return the tensor parallel size.

    Lookup order: the model's `_tp_size` attribute, then `tensor_parallel.autotp_size` from the DeepSpeed config
    (only when DeepSpeed is enabled and a config is attached to the args), then 1.
    """
    model_tp = getattr(self.model, "_tp_size", None)
    if model_tp is not None:
        return model_tp

    if not self.is_deepspeed_enabled:
        return 1

    deepspeed_config = getattr(self.args, "hf_deepspeed_config", None)
    if not deepspeed_config:
        return 1

    tensor_parallel_section = deepspeed_config.config.get("tensor_parallel", {})
    return tensor_parallel_section.get("autotp_size", 1)
def _wrap_model(self, model: nn.Module, training: bool = True, dataloader: DataLoader | None = None) -> nn.Module:
"""Wrap `model` for distributed training if needed (DDP, FSDP, SageMaker, etc.)."""
# train/eval could be run multiple-times - if already wrapped, don't re-wrap it again
if self.accelerator.unwrap_model(model, keep_torch_compile=False) is not model:
return model
if is_sagemaker_mp_enabled():
# Wrapping the base model twice in a DistributedModel will raise an error.
if isinstance(model, smp.model.DistributedModel):
return model
return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps)
# Multi-gpu training, 8bit models does not support DP
if self.args.n_gpu > 1 and not getattr(model, "is_loaded_in_8bit", False):
model = nn.DataParallel(model)
# Note: in torch.distributed mode, there's no point in wrapping the model
# inside a DistributedDataParallel as we'll be under `no_grad` anyways.
if not training:
return model
# Distributed training using PyTorch FSDP
if self.is_fsdp_xla_enabled:
model = wrap_model_xla_fsdp(model, self.args, self.is_fsdp_xla_v2_enabled)
elif is_sagemaker_dp_enabled():
model = nn.parallel.DistributedDataParallel(
model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))]
)
return model
def _update_auto_batch_size(self, batch_size):
    """
    Free memory, reset model wrapping, and update DeepSpeed config for the new batch size when using
    `auto_find_batch_size`.

    Args:
        batch_size: The batch size to train with from now on; stored in `self._train_batch_size`.
    """
    # `_train_batch_size` value might have changed to `auto_find_batch_size`
    self._train_batch_size = batch_size

    # frees the wrapped model and resets it back to the unwrapped base model
    release_memory(self.model_wrapped)
    if self.is_fsdp_enabled:
        # Remove FSDP wrapping from sub-models because self.model points to the wrapped model in FSDP case
        self.model = unwrap_model(self.model, recursive=True)
    self.model_wrapped = self.model

    # Check for DeepSpeed *after* the initial pass and modify the config
    if self.is_deepspeed_enabled:
        # Temporarily override `per_device_train_batch_size` so the DeepSpeed config picks up the new value.
        original_bs = self.args.per_device_train_batch_size
        self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu)
        try:
            propagate_args_to_deepspeed(self.accelerator, self.args, auto_find_batch_size=True)
        finally:
            # Restore the user's value even if propagation fails, so a later retry does not
            # start from the overridden batch size.
            self.args.per_device_train_batch_size = original_bs
def _track_num_input_tokens(self, inputs):
    """
    Add the number of input tokens in `inputs` to `self.state.num_input_tokens_seen`.

    Depending on `args.include_num_input_tokens_seen`, nothing is tracked (`"no"`), only non-padding tokens are
    counted (`"non_padding"`, via the attention mask or the pad token id), or every element of the main input is
    counted. The count is gathered through the accelerator before being added.
    """
    mode = self.args.include_num_input_tokens_seen
    if mode == "no":
        return

    main_input_name = getattr(self.model, "main_input_name", "input_ids")
    if main_input_name not in inputs:
        logger.warning(
            "Tried to track the number of tokens seen, however the current model is "
            "not configured properly to know what item is the input. To fix this, add "
            "a `main_input_name` attribute to the model class you are using."
        )
        return

    if mode != "non_padding":
        input_tokens = inputs[main_input_name].numel()
    elif "attention_mask" in inputs:
        input_tokens = inputs["attention_mask"].sum()
    else:
        processor = self.processing_class
        knows_pad_token = (
            processor is not None and hasattr(processor, "pad_token_id") and processor.pad_token_id is not None
        )
        if knows_pad_token:
            input_tokens = (inputs[main_input_name] != processor.pad_token_id).sum()
        else:
            logger.warning(
                "Could not determine method to count non-padding tokens, falling back to counting all tokens."
            )
            input_tokens = inputs[main_input_name].numel()

    token_count = torch.as_tensor(input_tokens, device=self.args.device, dtype=torch.int64)
    self.state.num_input_tokens_seen += self.accelerator.gather(token_count).sum().item()
def _clip_grad_norm(self, model):
    """
    Clip gradients to `args.max_grad_norm` and return the value produced by the clipping call.

    With SageMaker model parallelism in fp16 the optimizer clips its master gradients; in every other case the
    clipping goes through the accelerator, which reports the gradient norm measured before clipping.
    """
    max_norm = self.args.max_grad_norm
    if is_sagemaker_mp_enabled() and self.args.fp16:
        return self.optimizer.clip_master_grads(max_norm)
    return self.accelerator.clip_grad_norm_(model.parameters(), max_norm)
def _get_grad_norm(self, model, grad_norm=None):
    """
    Return the gradient norm, measuring it first when the caller did not provide one.

    When `grad_norm` is `None`, the norm is obtained by "clipping" to an infinite threshold, which computes the
    norm without modifying any gradient. Under DeepSpeed, a value exposing `.item()` is converted to a Python
    scalar; in every other case the value is returned exactly as received or computed.
    """
    norm = grad_norm
    if norm is None:
        # An infinite max norm means no gradient is actually rescaled.
        norm = self.accelerator.clip_grad_norm_(model.parameters(), float("inf"))

    running_deepspeed = self.accelerator.distributed_type == DistributedType.DEEPSPEED
    if running_deepspeed and hasattr(norm, "item"):
        norm = norm.item()
    return norm
# ---- Evaluation & Prediction ----
def evaluate(
self,
eval_dataset: Dataset | dict[str, Dataset] | None = None,
ignore_keys: list[str] | None = None,
metric_key_prefix: str = "eval",
) -> dict[str, float]:
"""
Run evaluation and returns metrics.
The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
(pass it to the init `compute_metrics` argument).
You can also subclass and override this method to inject custom behavior.
Args:
eval_dataset (`Dataset` | dict[str, `Dataset`], *optional*):
Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will
evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the
`__len__` method.
If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run
separate evaluations on each dataset. This can be useful to monitor how training affects other
datasets or simply to get a more fine-grained evaluation.
When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one
of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets
`data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the
loss on `data1` and `metric_for_best_model="eval_data2_loss"` for the loss on `data2`.
ignore_keys (`list[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
"eval_bleu" if the prefix is "eval" (default)
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
dictionary also contains the epoch number which comes from the training state.
"""
# handle multiple eval datasets
override = eval_dataset is not None
eval_dataset = eval_dataset if override else self.eval_dataset
if isinstance(eval_dataset, dict):
metrics = {}
for eval_dataset_name, _eval_dataset in eval_dataset.items():
dataset_metrics = self.evaluate(
eval_dataset=_eval_dataset if override else eval_dataset_name,
ignore_keys=ignore_keys,
metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}",
)
metrics.update(dataset_metrics)
return metrics
# memory metrics - must set up as early as possible
self._memory_tracker.start()
eval_dataloader = self.get_eval_dataloader(eval_dataset)
if self.is_fsdp_xla_v2_enabled:
eval_dataloader = tpu_spmd_dataloader(eval_dataloader)
start_time = time.time()
output = self.evaluation_loop(
eval_dataloader,
description="Evaluation",
# No point gathering the predictions if there are no metrics, otherwise we defer to
# self.args.prediction_loss_only
prediction_loss_only=True if self.compute_metrics is None else None,
ignore_keys=ignore_keys,
metric_key_prefix=metric_key_prefix,
)
total_batch_size = self.args.eval_batch_size * self.args.world_size
if f"{metric_key_prefix}_model_preparation_time" in output.metrics:
start_time += output.metrics[f"{metric_key_prefix}_model_preparation_time"]
output.metrics.update(
speed_metrics(
metric_key_prefix,
start_time,
num_samples=output.num_samples,
num_steps=math.ceil(output.num_samples / total_batch_size),
)
)
self.log(output.metrics)
if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
xm.master_print(met.metrics_report())
self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)
self._memory_tracker.stop_and_update_metrics(output.metrics)
return output.metrics
def evaluation_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: bool | None = None,
    ignore_keys: list[str] | None = None,
    metric_key_prefix: str = "eval",
) -> EvalLoopOutput:
    """
    Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.

    Works both with or without labels.

    Args:
        dataloader (`DataLoader`):
            The batches to run the model on.
        description (`str`):
            Name of the run, used in the log header. The value `"Prediction"` additionally forces predictions,
            labels and inputs to be stored even when `args.batch_eval_metrics` is set.
        prediction_loss_only (`bool`, *optional*):
            Forwarded to `prediction_step`. Falls back to `args.prediction_loss_only` when `None`.
        ignore_keys (`list[str]`, *optional*):
            Forwarded to `prediction_step`.
        metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
            Prefix applied to every metric key that does not already start with it.

    Returns:
        `EvalLoopOutput`: the collected predictions and label ids, the metrics dictionary and the number of
        samples.
    """
    args = self.args

    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only

    # if eval is called w/o train, handle model prep here
    if self.is_deepspeed_enabled and self.deepspeed is None:
        _, _ = deepspeed_init(self, num_training_steps=0, inference=True)

    model = self._wrap_model(self.model, training=False)

    # The accelerator has not prepared any model yet and `_wrap_model` returned the bare model:
    # prepare it here and record how long that took.
    if len(self.accelerator._models) == 0 and model is self.model:
        start_time = time.time()
        model = (
            self.accelerator.prepare(model)
            if self.is_deepspeed_enabled or (self.is_fsdp_enabled and not self.args.torch_compile)
            else self.accelerator.prepare_model(model, evaluation_mode=True)
        )
        self.model_preparation_time = round(time.time() - start_time, 4)

        if self.is_fsdp_enabled:
            self.model = model

        # for the rest of this function `model` is the outside model, whether it was wrapped or not
        if model is not self.model:
            self.model_wrapped = model

        # backward compatibility
        if self.is_deepspeed_enabled:
            self.deepspeed = self.model_wrapped

    # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
    # while ``train`` is running, cast it to the right dtype first and then put on device
    if not self.is_in_train:
        if args.fp16_full_eval:
            model = model.to(dtype=torch.float16, device=args.device)
        elif args.bf16_full_eval:
            model = model.to(dtype=torch.bfloat16, device=args.device)

    batch_size = self.args.eval_batch_size

    logger.info(f"\n***** Running {description} *****")
    if has_length(dataloader):
        logger.info(f" Num examples = {self.num_examples(dataloader)}")
    else:
        logger.info(" Num examples: Unknown")
    logger.info(f" Batch size = {batch_size}")

    # Switch the model, and the optimizer when it exposes an `eval` method, to evaluation mode.
    if hasattr(model, "eval") and callable(model.eval):
        model.eval()
    if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval):
        self.optimizer.eval()

    self.callback_handler.eval_dataloader = dataloader
    # Do this before wrapping.
    eval_dataset = getattr(dataloader, "dataset", None)

    # Initialize containers
    all_losses = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
    all_preds = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
    all_labels = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
    all_inputs = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)

    metrics = None
    eval_set_kwargs = {}

    # Will be useful when we have an iterable dataset so don't know its length.
    observed_num_examples = 0

    # Main evaluation loop
    for step, inputs in enumerate(dataloader):
        # Update the observed num examples
        observed_batch_size = find_batch_size(inputs)
        if observed_batch_size is not None:
            observed_num_examples += observed_batch_size
            # For batch samplers, batch_size is not known by the dataloader in advance.
            if batch_size is None:
                batch_size = observed_batch_size

        # Prediction step
        losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
        main_input_name = getattr(self.model, "main_input_name", "input_ids")
        # The main input is only kept when the metrics function asked for inputs.
        inputs_decode = (
            self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None
        )

        if is_torch_xla_available():
            xm.mark_step()

        # Update containers
        if losses is not None:
            # The loss is repeated `batch_size` times before gathering
            # (presumably so that the final mean is weighted per sample - TODO confirm).
            losses = self.gather_function(losses.repeat(batch_size))
            all_losses.add(losses)
        if inputs_decode is not None:
            inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100)
            inputs_decode = self.gather_function(inputs_decode)
            # With batch-level metrics nothing is accumulated, unless this is a prediction run.
            if not self.args.batch_eval_metrics or description == "Prediction":
                all_inputs.add(inputs_decode)
        if labels is not None:
            # Pad labels here, preparing for preprocess_logits_for_metrics in next logits block.
            labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        if logits is not None:
            logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100)
            if self.preprocess_logits_for_metrics is not None:
                logits = self.preprocess_logits_for_metrics(logits, labels)
            logits = self.gather_function(logits)
            if not self.args.batch_eval_metrics or description == "Prediction":
                all_preds.add(logits)
        if labels is not None:
            # Labels are gathered only now, after `preprocess_logits_for_metrics` received the padded ones.
            labels = self.gather_function(labels)
            if not self.args.batch_eval_metrics or description == "Prediction":
                all_labels.add(labels)

        self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)

        if self.args.batch_eval_metrics:
            if self.compute_metrics is not None and logits is not None and labels is not None:
                # `compute_result` tells the metrics function when the last batch has been reached.
                is_last_step = self.accelerator.gradient_state.end_of_dataloader
                batch_kwargs = {}
                batch_kwargs["losses"] = losses if "loss" in args.include_for_metrics else None
                batch_kwargs["inputs"] = inputs if "inputs" in args.include_for_metrics else None
                metrics = self.compute_metrics(
                    EvalPrediction(predictions=logits, label_ids=labels, **batch_kwargs),
                    compute_result=is_last_step,
                )

            del losses, logits, labels, inputs
            torch.cuda.empty_cache()

        # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
        elif args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
            all_losses.to_cpu_and_numpy()
            all_preds.to_cpu_and_numpy()
            all_labels.to_cpu_and_numpy()
            all_inputs.to_cpu_and_numpy()

            del losses, logits, labels, inputs
            torch.cuda.empty_cache()

    # After all calls to `.gather_function`, reset to `gather_for_metrics`:
    self.gather_function = self.accelerator.gather_for_metrics

    # Gather all remaining tensors and put them back on the CPU
    all_losses = all_losses.get_arrays()
    all_preds = all_preds.get_arrays()
    all_labels = all_labels.get_arrays()
    all_inputs = all_inputs.get_arrays()

    # Number of samples
    if has_length(eval_dataset):
        num_samples = len(eval_dataset)
    # The instance check is weird and does not actually check for the type, but whether the dataset has the right
    # methods. Therefore we need to make sure it also has the attribute.
    elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0:
        num_samples = eval_dataset.num_examples
    else:
        if has_length(dataloader):
            num_samples = self.num_examples(dataloader)
        else:  # both len(dataloader.dataset) and len(dataloader) fail
            num_samples = observed_num_examples
    # A reported length of zero is replaced by what was actually seen while iterating.
    if num_samples == 0 and observed_num_examples > 0:
        num_samples = observed_num_examples

    # Metrics!
    if (
        self.compute_metrics is not None
        and all_preds is not None
        and all_labels is not None
        and not self.args.batch_eval_metrics
    ):
        eval_set_kwargs["losses"] = all_losses if "loss" in args.include_for_metrics else None
        eval_set_kwargs["inputs"] = all_inputs if "inputs" in args.include_for_metrics else None
        metrics = self.compute_metrics(
            EvalPrediction(predictions=all_preds, label_ids=all_labels, **eval_set_kwargs)
        )
    elif metrics is None:
        # Neither the batch-level nor the global computation produced anything.
        metrics = {}

    # To be JSON-serializable, we need to remove numpy types or zero-d tensors
    metrics = denumpify_detensorize(metrics)

    # `all_losses` is either a non-empty list of arrays or a single array, depending on the container output.
    if isinstance(all_losses, list) and all_losses:
        metrics[f"{metric_key_prefix}_loss"] = np.concatenate(all_losses).mean().item()
    elif isinstance(all_losses, np.ndarray):
        metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
    if hasattr(self, "model_preparation_time"):
        metrics[f"{metric_key_prefix}_model_preparation_time"] = self.model_preparation_time

    # Prefix all keys with metric_key_prefix + '_'
    for key in list(metrics.keys()):
        if not key.startswith(f"{metric_key_prefix}_"):
            metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

    return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
def predict(
    self, test_dataset: Dataset, ignore_keys: list[str] | None = None, metric_key_prefix: str = "test"
) -> PredictionOutput:
    """
    Run the model over `test_dataset` and return the predictions, with metrics when they can be computed.

    A test dataset may or may not contain labels, depending on the dataset and the use case. When it does, this
    method also returns metrics, like `evaluate()`.

    Args:
        test_dataset (`Dataset`):
            Dataset to predict on. For a `datasets.Dataset`, the columns that `model.forward()` does not accept
            are removed automatically. Has to implement the method `__len__`
        ignore_keys (`list[str]`, *optional*):
            Keys of the model output (when it is a dictionary) to leave out when gathering predictions.
        metric_key_prefix (`str`, *optional*, defaults to `"test"`):
            Prefix added to the metric names. With the default `"test"`, the metric "bleu" is reported as
            "test_bleu".

    When predictions or labels differ in sequence length (for instance with dynamic padding in a token
    classification task), the predictions are padded on the right so that they can be concatenated into a single
    array. The padding index is -100.

    Returns: *NamedTuple* A namedtuple with the following keys:

        - predictions (`np.ndarray`): The predictions on `test_dataset`.
        - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
        - metrics (`dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
          labels).
    """
    # memory metrics - must set up as early as possible
    self._memory_tracker.start()

    dataloader = self.get_test_dataloader(test_dataset)
    started_at = time.time()

    output = self.evaluation_loop(
        dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
    )
    metrics = output.metrics

    # Time spent preparing the model is not counted in the speed metrics.
    preparation_key = f"{metric_key_prefix}_model_preparation_time"
    if preparation_key in metrics:
        started_at += metrics[preparation_key]

    global_batch_size = self.args.eval_batch_size * self.args.world_size
    metrics.update(
        speed_metrics(
            metric_key_prefix,
            started_at,
            num_samples=output.num_samples,
            num_steps=math.ceil(output.num_samples / global_batch_size),
        )
    )

    self.control = self.callback_handler.on_predict(self.args, self.state, self.control, metrics)
    self._memory_tracker.stop_and_update_metrics(metrics)

    return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=metrics)
def prediction_step(
    self,
    model: nn.Module,
    inputs: dict[str, torch.Tensor | Any],
    prediction_loss_only: bool,
    ignore_keys: list[str] | None = None,
) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
    """
    Perform an evaluation step on `model` using `inputs`.

    Subclass and override to inject custom behavior.

    Args:
        model (`nn.Module`):
            The model to evaluate.
        inputs (`dict[str, torch.Tensor | Any]`):
            The inputs and targets of the model.

            The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
            argument `labels`. Check your model's documentation for all accepted arguments.
        prediction_loss_only (`bool`):
            Whether or not to return the loss only.
        ignore_keys (`list[str]`, *optional*):
            A list of keys in the output of your model (if it is a dictionary) that should be ignored when
            gathering predictions. When `None`, it defaults to the `keys_to_ignore_at_inference` attribute of the
            model config (or `["past_key_values"]` if the config lacks it), and to an empty list when the model
            has no config.

    Return:
        tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
        logits and labels (each being optional). Logits and labels are `None` when `prediction_loss_only` is set.
    """
    # Labels are considered present only if every name in `self.label_names` maps to a non-None input.
    has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names)
    # For CLIP-like models capable of returning loss values.
    # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss`
    # is `True` in `model.forward`.
    return_loss = inputs.get("return_loss")
    if return_loss is None:
        return_loss = self.can_return_loss
    loss_without_labels = len(self.label_names) == 0 and return_loss

    inputs = self._prepare_inputs(inputs)
    if ignore_keys is None:
        if hasattr(self.model, "config"):
            ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", ["past_key_values"])
        else:
            ignore_keys = []

    # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
    if has_labels or loss_without_labels:
        labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
        # A single label tensor is returned bare rather than as a 1-tuple.
        if len(labels) == 1:
            labels = labels[0]
    else:
        labels = None

    with torch.no_grad():
        if is_sagemaker_mp_enabled():
            raw_outputs = smp_forward_only(model, inputs)
            if has_labels or loss_without_labels:
                # Dict outputs carry the loss under "loss"; other outputs carry it in first position.
                if isinstance(raw_outputs, dict):
                    loss_mb = raw_outputs["loss"]
                    logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
                else:
                    loss_mb = raw_outputs[0]
                    logits_mb = raw_outputs[1:]

                loss = loss_mb.reduce_mean().detach().cpu()
                logits = smp_nested_concat(logits_mb)
            else:
                loss = None
                if isinstance(raw_outputs, dict):
                    logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
                else:
                    logits_mb = raw_outputs
                logits = smp_nested_concat(logits_mb)
        else:
            if has_labels or loss_without_labels:
                # The loss is computed through `compute_loss`, which also hands back the model outputs.
                with self.compute_loss_context_manager():
                    num_items_in_batch = self._get_num_items_in_batch([inputs], self.args.device)
                    loss, outputs = self.compute_loss(
                        model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
                    )
                loss = loss.detach().mean()

                if isinstance(outputs, dict):
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
                else:
                    # The first element is skipped, mirroring the `raw_outputs[0]` loss position used above.
                    logits = outputs[1:]
            else:
                # No loss can be computed: run a plain forward pass.
                loss = None
                with self.compute_loss_context_manager():
                    outputs = model(**inputs)
                if isinstance(outputs, dict):
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                else:
                    logits = outputs

    if prediction_loss_only:
        return (loss, None, None)

    logits = nested_detach(logits)
    # A single logits tensor is returned bare rather than as a 1-tuple.
    if len(logits) == 1:
        logits = logits[0]

    return (loss, logits, labels)
def _evaluate(
    self,
    trial: "optuna.Trial | dict[str, Any] | None",
    ignore_keys_for_eval: list[str] | None,
    skip_scheduler: bool = False,
) -> dict[str, float]:
    """
    Run evaluation, report to HP search, and step ReduceLROnPlateau/GreedyLR if needed.

    Args:
        trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
            The current hyperparameter search trial, forwarded to `_report_to_hp_search`.
        ignore_keys_for_eval (`list[str]`, *optional*):
            Forwarded to `evaluate` as `ignore_keys`.
        skip_scheduler (`bool`, *optional*, defaults to `False`):
            When `True`, the learning rate scheduler is left untouched even if it is metric-driven.

    Returns:
        `dict[str, float]`: the metrics returned by `evaluate`.

    Raises:
        KeyError: if a metric-driven scheduler has to be stepped and `args.metric_for_best_model` (prefixed with
            `eval_` when needed) is not among the evaluation metrics.
    """
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
    self._report_to_hp_search(trial, self.state.global_step, metrics)

    # Run delayed LR scheduler now that metrics are populated
    if (
        isinstance(self.lr_scheduler, (torch.optim.lr_scheduler.ReduceLROnPlateau, GreedyLR))
        and not skip_scheduler
    ):
        metric_to_check = self.args.metric_for_best_model
        if not metric_to_check.startswith("eval_"):
            metric_to_check = f"eval_{metric_to_check}"
        # Only the lookup is guarded: a KeyError raised from inside the scheduler itself must surface as is
        # instead of being reported as a missing metric.
        try:
            metric_value = metrics[metric_to_check]
        except KeyError as exc:
            raise KeyError(
                f"The `metric_for_best_model` training argument is set to '{metric_to_check}', "
                f"which is not found in the evaluation metrics. "
                f"The available evaluation metrics are: {list(metrics.keys())}. "
                f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or "
                f"consider changing the `metric_for_best_model` via the TrainingArguments."
            ) from exc
        self.lr_scheduler.step(metric_value)
    return metrics
# ---- Checkpoint Saving ----
def _get_output_dir(self, trial: "optuna.Trial | dict[str, Any] | None") -> str:
    """
    Return the directory the current run writes to.

    Outside of a hyperparameter search (no backend or no trial) this is `args.output_dir`. During a search it is
    a sub-directory of `args.output_dir` named by `self.hp_name(trial)` when `hp_name` is set, and `run-<id>`
    otherwise, where the id comes from the active search backend.
    """
    if self.hp_search_backend is None or trial is None:
        return self.args.output_dir

    backend = self.hp_search_backend
    if backend == HPSearchBackend.OPTUNA:
        run_id = trial.number
    elif backend == HPSearchBackend.RAY:
        import ray.tune

        run_id = ray.tune.get_context().get_trial_id()
    elif backend == HPSearchBackend.WANDB:
        import wandb

        run_id = wandb.run.id

    if self.hp_name is not None:
        run_name = self.hp_name(trial)
    else:
        run_name = f"run-{run_id}"
    return os.path.join(self.args.output_dir, run_name)
def _save_checkpoint(self, model: nn.Module, trial: "optuna.Trial | dict[str, Any] | None") -> None:
    """
    Save model checkpoint, optimizer, scheduler, scaler, RNG states, and trainer state.

    The checkpoint is written to `<run dir>/<PREFIX_CHECKPOINT_DIR>-<global_step>`, where the run directory comes
    from `_get_output_dir`. Depending on the arguments, the checkpoint is then pushed to the Hub and older
    checkpoints are rotated.

    Args:
        model (`nn.Module`):
            Not read by this method; the model is saved through `self.save_model`.
        trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
            The current hyperparameter search trial, used to resolve the run directory.
    """
    # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
    # want to save except FullyShardedDDP.
    # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"

    # Save model checkpoint
    checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

    # FLOs are only stored outside of a hyperparameter search.
    if self.hp_search_backend is None and trial is None:
        self.store_flos()

    run_dir = self._get_output_dir(trial=trial)
    output_dir = os.path.join(run_dir, checkpoint_folder)
    self.save_model(output_dir, _internal_call=True)

    if (
        self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH, SaveStrategy.BEST]
        and self.state.best_global_step
    ):
        # Wait for everyone to get here so we are sure the model has been saved by process 0
        # before we check if the best_checkpoint_dir exists
        if is_torch_xla_available():
            xm.rendezvous("load_best_model_at_end")
        elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
            dist.barrier()
        elif is_sagemaker_mp_enabled():
            smp.barrier()
        best_checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.best_global_step}"
        best_checkpoint_dir = os.path.join(run_dir, best_checkpoint_folder)

        # The best checkpoint is only recorded once its folder actually exists on disk.
        if os.path.exists(best_checkpoint_dir):
            self.state.best_model_checkpoint = best_checkpoint_dir

    if not self.args.save_only_model:
        # Save optimizer and scheduler
        self._save_optimizer_and_scheduler(output_dir)
        self._save_scaler(output_dir)
        # Save RNG state
        self._save_rng_state(output_dir)

    # Save the Trainer state
    if self.args.should_save:
        # Update `ExportableState` callbacks and `TrainerControl` state to where we are currently
        for cb in [
            cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
        ]:
            cb_name = cb.__class__.__name__
            cb_state = cb.state()
            # An entry that is already a list gets the new state appended; any other entry is overwritten.
            if isinstance(self.state.stateful_callbacks[cb_name], list):
                self.state.stateful_callbacks[cb_name].append(cb_state)
            else:
                self.state.stateful_callbacks[cb_name] = cb_state
        self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))

    if self.args.push_to_hub:
        self._push_from_checkpoint(output_dir)

    # Maybe delete some older checkpoints.
    if self.args.should_save:
        # we use mtime as default, filesystems without mtime support will be detected in `sort_checkpoints`
        rotate_checkpoints(
            output_dir=run_dir,
            save_total_limit=self.args.save_total_limit,
            best_model_checkpoint=self.state.best_model_checkpoint,
            use_mtime=True,
        )
def _determine_best_metric(self, metrics: dict[str, float], trial: "optuna.Trial | dict[str, Any] | None") -> bool:
    """
    Compare the tracked metric of the latest evaluation with the best value seen so far.

    When the new value is better, `self.state.best_metric` is updated and, for the step/epoch/best save
    strategies, `self.state.best_global_step` is set to the current global step.

    Returns:
        bool: True if a new best metric was found, else False

    Raises:
        KeyError: if `args.metric_for_best_model` (prefixed with `eval_` when needed) is not in `metrics`.
    """
    target = self.args.metric_for_best_model
    if target is None:
        return False

    metric_key = target if target.startswith("eval_") else f"eval_{target}"
    try:
        candidate = metrics[metric_key]
    except KeyError as exc:
        raise KeyError(
            f"The `metric_for_best_model` training argument is set to '{metric_key}', which is not found in the evaluation metrics. "
            f"The available evaluation metrics are: {list(metrics.keys())}. Consider changing the `metric_for_best_model` via the TrainingArguments."
        ) from exc

    is_better = np.greater if self.args.greater_is_better else np.less

    # Without a previous best, start from the worst possible value so that any real value wins.
    if self.state.best_metric is None:
        self.state.best_metric = float("-inf") if self.args.greater_is_better else float("inf")

    if not is_better(candidate, self.state.best_metric):
        return False

    self.state.best_metric = candidate
    if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH, SaveStrategy.BEST]:
        self.state.best_global_step = self.state.global_step
    return True
def _save_rng_state(self, output_dir: str) -> None:
    """
    Write the random number generator states to `output_dir` so that a resumed run can restore them.

    The Python, NumPy and torch CPU states are always stored. The state of each available accelerator backend
    (CUDA, XLA, NPU, HPU, MLU, MUSA) is added under its own key. The file is named `rng_state.pth` when
    `args.world_size <= 1` and `rng_state_<process_index>.pth` otherwise.
    """

    def device_rng_state(rng_module):
        # In distributed mode the states of all devices are stored, otherwise only the current device's.
        if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
            return rng_module.get_rng_state_all()
        return rng_module.get_rng_state()

    rng_states = {
        "python": random.getstate(),
        "numpy": np.random.get_state(),
        "cpu": torch.random.get_rng_state(),
    }

    if torch.cuda.is_available():
        rng_states["cuda"] = device_rng_state(torch.cuda.random)
    if is_torch_xla_available():
        rng_states["xla"] = xm.get_rng_state()
    if is_torch_npu_available():
        rng_states["npu"] = device_rng_state(torch.npu.random)
    if is_torch_hpu_available():
        rng_states["hpu"] = device_rng_state(torch.hpu.random)
    if is_torch_mlu_available():
        rng_states["mlu"] = device_rng_state(torch.mlu.random)
    if is_torch_musa_available():
        rng_states["musa"] = device_rng_state(torch.musa)

    # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may
    # not yet exist.
    os.makedirs(output_dir, exist_ok=True)

    if self.args.world_size <= 1:
        file_name = "rng_state.pth"
    else:
        file_name = f"rng_state_{self.args.process_index}.pth"
    torch.save(rng_states, os.path.join(output_dir, file_name))
def _save_optimizer_and_scheduler(self, output_dir: str) -> None:
    """
    Save optimizer and learning rate scheduler states to `output_dir`.

    The way the optimizer is saved depends on the active backend, checked in this order: XLA, SageMaker model
    parallelism, DeepSpeed, FSDP, and finally a plain `torch.save` on processes where `args.should_save` is set.
    The scheduler is saved by the XLA branch itself; otherwise it is saved with `torch.save` on processes where
    `args.should_save` is set, except under DeepSpeed when the scheduler is a `DeepSpeedSchedulerWrapper`.

    Args:
        output_dir (`str`):
            The checkpoint directory to write to.
    """
    if is_torch_xla_available():
        xm.rendezvous("saving_optimizer_states")
        if self.is_fsdp_xla_v1_enabled:
            # Each process writes its own file, together with the shard metadata of the model.
            optm = {
                "optimizer": self.optimizer.state_dict(),
                "shard_metadata": self.model.get_shard_metadata(),
            }
            xm.save(
                optm,
                os.path.join(
                    output_dir, f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
                ),
                master_only=False,
            )
        else:
            xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
        with warnings.catch_warnings(record=True) as caught_warnings:
            xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
        # Re-emit only once the recording context is closed: inside it, re-emitted warnings would be captured
        # by the recorder again instead of reaching the user.
        reissue_pt_warnings(caught_warnings)
    elif is_sagemaker_mp_enabled():
        opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False)
        smp.barrier()
        if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state:
            smp.save(
                opt_state_dict,
                os.path.join(output_dir, OPTIMIZER_NAME),
                partial=True,
                v3=smp.state.cfg.shard_optimizer_state,
            )
    elif self.is_deepspeed_enabled:
        # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
        # config `stage3_gather_16bit_weights_on_model_save` is True
        accept_exclude_frozen_parameters = (
            "exclude_frozen_parameters" in inspect.signature(self.model_wrapped.save_checkpoint).parameters
        )
        # Frozen parameters are left out for PEFT models when the installed `save_checkpoint` supports it.
        if accept_exclude_frozen_parameters and _is_peft_model(self.model):
            self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True)
        else:
            self.model_wrapped.save_checkpoint(output_dir)
    elif self.is_fsdp_enabled:
        # save fsdp specific ckpt for resuming from ckpt
        save_fsdp_model(
            self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir, **get_fsdp_ckpt_kwargs()
        )
        save_fsdp_optimizer(
            self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir
        )
    elif self.args.should_save:
        # deepspeed.save_checkpoint above saves model/optim/sched
        torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))

    # Save SCHEDULER & SCALER
    is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance(
        self.lr_scheduler, DeepSpeedSchedulerWrapper
    )
    if (
        self.args.should_save
        and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler)
        and not is_torch_xla_available()
    ):
        with warnings.catch_warnings(record=True) as caught_warnings:
            torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
        reissue_pt_warnings(caught_warnings)
def _save_scaler(self, output_dir: str) -> None:
    """
    Save the gradient scaler state if one exists.

    Nothing is written when the accelerator has no `scaler` attribute or when it is `None`. With XLA available
    the state is saved through `xm.save` after a rendezvous; otherwise it is saved with `torch.save` on processes
    where `args.should_save` is set.

    Args:
        output_dir (`str`):
            The checkpoint directory to write to.
    """
    # See if there is a scaler attribute
    try:
        scaler = self.accelerator.scaler
    except AttributeError:
        return
    if scaler is None:
        return

    if is_torch_xla_available():
        xm.rendezvous("saving_scaler_state")
        with warnings.catch_warnings(record=True) as caught_warnings:
            xm.save(scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
        # Re-emit only once the recording context is closed: inside it, re-emitted warnings would be captured
        # by the recorder again instead of reaching the user.
        reissue_pt_warnings(caught_warnings)
    elif self.args.should_save:
        # Save SCALER
        with warnings.catch_warnings(record=True) as caught_warnings:
            torch.save(scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
        reissue_pt_warnings(caught_warnings)
# ---- Checkpoint Resuming ----
def _load_from_checkpoint(self, resume_from_checkpoint: str, model: nn.Module | None = None) -> None:
    """
    Load model weights from a checkpoint directory.

    Supported layouts are: a single weights file (safetensors or torch), an FSDP checkpoint, PEFT adapter
    weights (directly in the folder or in one sub-folder per adapter) and a sharded checkpoint described by an
    index file.

    Args:
        resume_from_checkpoint (`str`):
            Path of the checkpoint directory.
        model (`nn.Module`, *optional*):
            The model to load the weights into. Defaults to `self.model`.

    Raises:
        ValueError: if the folder holds an FSDP checkpoint while FSDP is not enabled, or if no known weights
            file, FSDP checkpoint or adapter sub-folder is found in it.
    """
    if model is None:
        model = self.model

    # Every file name a checkpoint may contain.
    config_file = os.path.join(resume_from_checkpoint, CONFIG_NAME)
    adapter_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_WEIGHTS_NAME)
    adapter_safe_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)
    weights_file = os.path.join(resume_from_checkpoint, WEIGHTS_NAME)
    weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME)
    safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME)
    safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME)
    is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and (
        # this checks the FSDP state dict when `SHARDED_STATE_DICT` is used
        any(
            FSDP_MODEL_NAME in folder_name
            for folder_name in os.listdir(resume_from_checkpoint)
            if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name))
        )
        # this checks the FSDP state dict when `FULL_STATE_DICT` is used
        or os.path.isfile(os.path.join(resume_from_checkpoint, f"{FSDP_MODEL_NAME}.bin"))
    )
    # if multiple adapters exist, they get saved in sub directories
    adapter_subdirs = (
        [
            folder_name
            for folder_name in os.listdir(resume_from_checkpoint)
            if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name))
            and (
                os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_WEIGHTS_NAME))
                or os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_SAFE_WEIGHTS_NAME))
            )
        ]
        if os.path.isdir(resume_from_checkpoint)
        else []
    )

    if is_fsdp_ckpt and not self.is_fsdp_enabled:
        raise ValueError(f"Checkpoint found at {resume_from_checkpoint} is only supported when using PyTorch FSDP")

    # At least one recognised source of weights must be present.
    if not (
        any(
            os.path.isfile(f)
            for f in [
                weights_file,
                safe_weights_file,
                weights_index_file,
                safe_weights_index_file,
                adapter_weights_file,
                adapter_safe_weights_file,
            ]
        )
        or is_fsdp_ckpt
        or adapter_subdirs
    ):
        raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")

    logger.info(f"Loading model from {resume_from_checkpoint}.")

    # Warn when the checkpoint was written by a different version of the library.
    if os.path.isfile(config_file):
        config = PreTrainedConfig.from_json_file(config_file)
        checkpoint_version = config.transformers_version
        if checkpoint_version is not None and checkpoint_version != __version__:
            logger.warning(
                f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
                f"Transformers but your current version is {__version__}. This is not recommended and could "
                "yield to errors or unwanted behaviors."
            )

    if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or is_fsdp_ckpt:
        # If the model is on the GPU, it still works!
        if is_sagemaker_mp_enabled():
            smp.resume_from_checkpoint(
                path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False
            )
        elif self.is_fsdp_enabled:
            load_fsdp_model(
                self.accelerator.state.fsdp_plugin,
                self.accelerator,
                model,
                resume_from_checkpoint,
                **get_fsdp_ckpt_kwargs(),
            )
        else:
            # We load the model state dict on the CPU to avoid an OOM error.
            # The safetensors file takes precedence over the torch one when both exist.
            if os.path.isfile(safe_weights_file):
                state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu")
            else:
                check_torch_load_is_safe()
                state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)

            # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
            # which takes *args instead of **kwargs
            load_result = model.load_state_dict(state_dict, False)
            # release memory
            del state_dict
            self._issue_warnings_after_load(load_result)

    # Load adapters following PR # 24096
    elif _is_peft_model(model):
        # If training a model using PEFT, assume that adapter have been saved properly.
        if hasattr(model, "active_adapters") and hasattr(model, "load_adapter"):
            if os.path.exists(resume_from_checkpoint):
                active_adapters = model.active_adapters
                if len(active_adapters) > 1:
                    logger.warning("Multiple active adapters detected will only consider the first adapter")
                active_adapter = active_adapters[0]

                if adapter_subdirs:
                    # One sub-folder per adapter: load them all, but only the active one as trainable.
                    for subdir_name in adapter_subdirs:
                        peft_id = os.path.join(resume_from_checkpoint, subdir_name)
                        model.load_adapter(peft_id, subdir_name, is_trainable=(subdir_name == active_adapter))
                    model.set_adapter(active_adapter)
                else:
                    model.load_adapter(resume_from_checkpoint, active_adapter, is_trainable=True)
            else:
                logger.warning(
                    "The intermediate checkpoints of PEFT may not be saved correctly, "
                    f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. "
                    "Check some examples here: https://github.com/huggingface/peft/issues/96"
                )
        else:
            logger.warning(f"Could not load adapter model, make sure to have PEFT >= {MIN_PEFT_VERSION} installed")
    else:
        # We load the sharded checkpoint
        load_result = load_sharded_checkpoint(model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled())
        if not is_sagemaker_mp_enabled():
            self._issue_warnings_after_load(load_result)
def _load_best_model(self) -> None:
    """
    Load the best model found during training based on the tracked metric.

    Weights are read from `self.state.best_model_checkpoint`. The loading path is selected in this order:
    DeepSpeed, FSDP, a single weights/adapter file (safetensors or PyTorch), then a sharded checkpoint detected
    through its index file. When none of these applies, a warning is logged and nothing is loaded.
    """
    logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).")
    # Candidate files of a non-sharded checkpoint: full-model weights and PEFT adapter weights.
    best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
    best_safe_model_path = os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_NAME)
    best_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_WEIGHTS_NAME)
    best_safe_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)

    # With SageMaker model parallelism the weights are loaded into the wrapped model.
    model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
    if self.is_deepspeed_enabled:
        deepspeed_load_checkpoint(
            self.model_wrapped,
            self.state.best_model_checkpoint,
            load_module_strict=not _is_peft_model(self.model),
        )
    elif self.is_fsdp_enabled:
        load_result = load_fsdp_model(
            self.accelerator.state.fsdp_plugin,
            self.accelerator,
            model,
            self.state.best_model_checkpoint,
            **get_fsdp_ckpt_kwargs(),
        )
    elif (
        os.path.exists(best_model_path)
        or os.path.exists(best_safe_model_path)
        or os.path.exists(best_adapter_model_path)
        or os.path.exists(best_safe_adapter_model_path)
    ):
        # Set back to False by the PEFT branches below that only warn: `load_result` is not assigned there,
        # so it must not be inspected afterwards.
        has_been_loaded = True
        if is_sagemaker_mp_enabled():
            smp.resume_from_checkpoint(
                path=self.state.best_model_checkpoint,
                tag=WEIGHTS_NAME,
                partial=False,
                load_optimizer=False,
            )
        else:
            if _is_peft_model(model):
                # If training a model using PEFT, assume that adapter have been saved properly.
                if hasattr(model, "active_adapters") and hasattr(model, "load_adapter"):
                    active_adapter = model.active_adapters[0]
                    if len(model.active_adapters) > 1:
                        logger.warning("Detected multiple active adapters, will only consider the first one")
                    if os.path.exists(best_adapter_model_path) or os.path.exists(best_safe_adapter_model_path):
                        try:
                            model.load_adapter(self.state.best_model_checkpoint, active_adapter)
                        except RuntimeError as exc:
                            # Re-raise with a more actionable message for prompt learning methods; any other
                            # RuntimeError is propagated unchanged.
                            if model.peft_config[active_adapter].is_prompt_learning:
                                # for context: https://github.com/huggingface/peft/issues/2256
                                msg = (
                                    "When using prompt learning PEFT methods such as "
                                    f"{model.peft_config[active_adapter].peft_type.value}, setting "
                                    "load_best_model_at_end=True can lead to errors, it is recommended "
                                    "to set this to False and to load the model manually from the checkpoint "
                                    "directory using PeftModel.from_pretrained(base_model, ) after training "
                                    "has finished."
                                )
                                raise RuntimeError(msg) from exc
                            else:
                                raise
                        # Load_adapter has no return value present, modify it when appropriate.
                        from torch.nn.modules.module import _IncompatibleKeys

                        # Empty result (no missing / unexpected keys) so that the check below has nothing to report.
                        load_result = _IncompatibleKeys([], [])
                    else:
                        logger.warning(
                            "The intermediate checkpoints of PEFT may not be saved correctly, "
                            f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. "
                            "Check some examples here: https://github.com/huggingface/peft/issues/96"
                        )
                        has_been_loaded = False
                else:
                    logger.warning(
                        f"Could not load adapter model, make sure to have PEFT >= {MIN_PEFT_VERSION} installed"
                    )
                    has_been_loaded = False
            else:
                # We load the model state dict on the CPU to avoid an OOM error.
                if os.path.isfile(best_safe_model_path):
                    state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
                else:
                    check_torch_load_is_safe()
                    state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True)

                # If the model is on the GPU, it still works!
                # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
                # which takes *args instead of **kwargs
                load_result = model.load_state_dict(state_dict, False)
            if not is_sagemaker_mp_enabled() and has_been_loaded:
                self._issue_warnings_after_load(load_result)
    elif os.path.exists(os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_INDEX_NAME)) or os.path.exists(
        os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)
    ):
        # An index file is present: the checkpoint is sharded.
        load_result = load_sharded_checkpoint(
            model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled()
        )
        if not is_sagemaker_mp_enabled():
            self._issue_warnings_after_load(load_result)
    else:
        logger.warning(
            f"Could not locate the best model at {best_model_path}, if you are running a distributed training "
            "on multiple nodes, you should activate `--save_on_each_node`."
        )
def _load_rng_state(self, checkpoint: str | None) -> None:
    """
    Restore the Python, NumPy and torch random number generator states saved in `checkpoint`.

    Nothing happens when `checkpoint` is `None`. When the RNG file expected for the current setup (one file per
    process if `world_size > 1`, a single file otherwise) is missing, an informational message is logged and the
    current RNG states are left untouched.
    """
    if checkpoint is None:
        return

    if self.args.world_size > 1:
        process_index = self.args.process_index
        rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth")
        missing_file_message = (
            f"Didn't find an RNG file for process {process_index}, if you are resuming a training that "
            "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
        )
    else:
        rng_file = os.path.join(checkpoint, "rng_state.pth")
        missing_file_message = (
            "Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
            "fashion, reproducibility is not guaranteed."
        )
    if not os.path.isfile(rng_file):
        logger.info(missing_file_message)
        return

    with safe_globals():
        check_torch_load_is_safe()
        rng_states = torch.load(rng_file, weights_only=True)

    # Host-side generators first.
    random.setstate(rng_states["python"])
    np.random.set_state(rng_states["numpy"])
    torch.random.set_rng_state(rng_states["cpu"])
    if is_torch_xla_available():
        xm.set_rng_state(rng_states["xla"])

    is_distributed = self.args.parallel_mode == ParallelMode.DISTRIBUTED
    # (label, availability check) pairs; the matching `torch.<label.lower()>` module is only looked up once the
    # backend is reported as available.
    accelerator_backends = (
        ("CUDA", torch.cuda.is_available),
        ("NPU", is_torch_npu_available),
        ("HPU", is_torch_hpu_available),
        ("MLU", is_torch_mlu_available),
        ("MUSA", is_torch_musa_available),
    )
    for device_label, backend_is_available in accelerator_backends:
        if backend_is_available():
            device_module = getattr(torch, device_label.lower())
            set_rng_state_for_device(device_label, device_module, rng_states, is_distributed)
def _load_optimizer_and_scheduler(self, checkpoint: str | None) -> None:
    """
    If optimizer and scheduler states exist in `checkpoint`, load them.

    Nothing is loaded when `checkpoint` is `None`. With DeepSpeed only the learning rate scheduler is restored
    here, and only when it is not a `DeepSpeedSchedulerWrapper`. Otherwise both the optimizer and the scheduler
    are restored, provided that an optimizer state and the scheduler file are found in `checkpoint`.
    """
    if checkpoint is None:
        return

    if self.is_deepspeed_enabled:
        # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init
        if not isinstance(self.lr_scheduler, DeepSpeedSchedulerWrapper):
            with warnings.catch_warnings(record=True) as caught_warnings:
                check_torch_load_is_safe()
                self.lr_scheduler.load_state_dict(
                    torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True)
                )
            reissue_pt_warnings(caught_warnings)
        return

    # Detect a saved optimizer state. With SageMaker model parallelism, files named `OPTIMIZER_NAME` followed by
    # `_*` are searched; otherwise either optimizer file, or a sub-directory whose name contains the stem of
    # `OPTIMIZER_NAME_BIN`, counts as a match.
    checkpoint_file_exists = (
        glob.glob(os.path.join(checkpoint, OPTIMIZER_NAME) + "_*")
        if is_sagemaker_mp_enabled()
        else (
            os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME))
            or os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME_BIN))
            or (
                os.path.isdir(checkpoint)
                and any(
                    OPTIMIZER_NAME_BIN.split(".")[0] in folder_name
                    for folder_name in os.listdir(checkpoint)
                    if os.path.isdir(os.path.join(checkpoint, folder_name))
                )
            )
        )
    )
    # FSDP XLA v1 takes precedence over the detection above: per-rank optimizer files are searched instead.
    checkpoint_file_exists = (
        glob.glob(os.path.join(checkpoint, f"rank*-of-{self.args.world_size}-{OPTIMIZER_NAME}"))
        if self.is_fsdp_xla_v1_enabled
        else checkpoint_file_exists
    )
    if checkpoint_file_exists and os.path.isfile(os.path.join(checkpoint, SCHEDULER_NAME)):
        # Load in optimizer and scheduler states
        if is_torch_xla_available():
            # On TPU we have to take some extra precautions to properly load the states on the right device.
            if self.is_fsdp_xla_v1_enabled:
                check_torch_load_is_safe()
                optimizer_state = torch.load(
                    os.path.join(
                        checkpoint, f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
                    ),
                    map_location="cpu",
                    weights_only=True,
                )
                # We only need `optimizer` when resuming from checkpoint
                optimizer_state = optimizer_state["optimizer"]
            else:
                check_torch_load_is_safe()
                optimizer_state = torch.load(
                    os.path.join(checkpoint, OPTIMIZER_NAME), map_location="cpu", weights_only=True
                )
            with warnings.catch_warnings(record=True) as caught_warnings:
                check_torch_load_is_safe()
                lr_scheduler_state = torch.load(
                    os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu", weights_only=True
                )
            reissue_pt_warnings(caught_warnings)

            # Both states were loaded on CPU; send them to the training device before restoring them.
            xm.send_cpu_data_to_device(optimizer_state, self.args.device)
            xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device)

            self.optimizer.load_state_dict(optimizer_state)
            self.lr_scheduler.load_state_dict(lr_scheduler_state)
        else:
            if is_sagemaker_mp_enabled():

                # The optimizer state is not loaded right away: the load is registered as a post-step hook on
                # the wrapped model.
                def opt_load_hook(mod, opt):
                    opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True))

                self.model_wrapped.register_post_step_hook(opt_load_hook)
            else:
                # We use the CPU when training on one GPU to avoid OOM for GPU RAM when training big models.
                # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more
                # likely to get OOM on CPU (since we load num_gpu times the optimizer state).
                map_location = self.args.device if self.args.world_size > 1 else "cpu"
                if self.is_fsdp_enabled:
                    load_fsdp_optimizer(
                        self.accelerator.state.fsdp_plugin,
                        self.accelerator,
                        self.optimizer,
                        self.model,
                        checkpoint,
                        **get_fsdp_ckpt_kwargs(),
                    )
                else:
                    check_torch_load_is_safe()
                    self.optimizer.load_state_dict(
                        torch.load(
                            os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location, weights_only=True
                        )
                    )
            # The scheduler is restored the same way for all non-XLA setups; warnings raised while loading are
            # recorded and handed to `reissue_pt_warnings`.
            with warnings.catch_warnings(record=True) as caught_warnings:
                check_torch_load_is_safe()
                self.lr_scheduler.load_state_dict(
                    torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True)
                )
            reissue_pt_warnings(caught_warnings)
def _load_scaler(self, checkpoint: str | None) -> None:
    """
    Restore the state of `self.accelerator.scaler` from `checkpoint`.

    Nothing happens when `checkpoint` is `None` or when it contains no scaler file.
    """
    if checkpoint is None:
        return

    scaler_file = os.path.join(checkpoint, SCALER_NAME)
    if not os.path.isfile(scaler_file):
        return

    if not is_torch_xla_available():
        with warnings.catch_warnings(record=True) as caught_warnings:
            check_torch_load_is_safe()
            self.accelerator.scaler.load_state_dict(torch.load(scaler_file, weights_only=True))
        reissue_pt_warnings(caught_warnings)
        return

    # On TPU we have to take some extra precautions to properly load the states on the right device: the state
    # is first read on CPU, then sent to the training device before being handed to the scaler.
    with warnings.catch_warnings(record=True) as caught_warnings:
        check_torch_load_is_safe()
        scaler_state = torch.load(scaler_file, map_location="cpu", weights_only=True)
    reissue_pt_warnings(caught_warnings)
    xm.send_cpu_data_to_device(scaler_state, self.args.device)
    self.accelerator.scaler.load_state_dict(scaler_state)
def _load_callback_state(self) -> None:
    """
    Rebuild the callbacks whose state is stored in `self.state.stateful_callbacks`.

    Only active when `args.restore_callback_states_from_checkpoint` is set. Every stored entry is matched, by
    class name, against the callbacks currently registered (plus `self.control`). Matching callbacks are
    re-instantiated from the stored `args`, get the stored `attributes` assigned, and replace the registered
    instances. Stored entries without a matching callback are reported in a warning and ignored.
    """
    if not self.args.restore_callback_states_from_checkpoint:
        return

    # `self.control` is restored through the same mechanism as the regular callbacks.
    registered = self.callback_handler.callbacks + [self.control]
    rebuilt_callbacks = []
    unmatched_names = []
    for class_name, saved_states in self.state.stateful_callbacks.items():
        if not isinstance(saved_states, list):
            saved_states = [saved_states]

        # Several callbacks of the same type may be registered; they are paired with the stored states in order.
        same_type = [candidate for candidate in registered if candidate.__class__.__name__ == class_name]
        if not same_type:
            unmatched_names.append(class_name)
            continue

        for current, saved in zip(same_type, saved_states):
            init_kwargs = saved.get("args", {})
            saved_attributes = saved.get("attributes", {})
            restored = type(current)(**init_kwargs)
            for attr_name, attr_value in saved_attributes.items():
                setattr(restored, attr_name, attr_value)

            if isinstance(current, TrainerControl):
                # Specifically for restoring the `control` state
                self.control = restored
                continue

            rebuilt_callbacks.append(restored)
            # The existing callback is dropped now; its restored counterpart is registered at the very end.
            self.callback_handler.remove_callback(type(restored))
        logger.info("Continuing training from checkpoint, restoring any callbacks that were passed in")

    if len(unmatched_names) > 0:
        logger.warning(
            f"Checkpoint included callbacks not included in current configuration. Ignoring. ({', '.join(unmatched_names)})"
        )
    for restored in rebuilt_callbacks:
        self.callback_handler.add_callback(restored)
def _issue_warnings_after_load(self, load_result: Any) -> None:
    """
    Report the missing and unexpected keys of a checkpoint load.

    Args:
        load_result (`Any`):
            Object exposing `missing_keys` and `unexpected_keys`, as returned by the loading functions.

    When the missing keys are exactly the keys the model ignores on save, no warning is emitted and the model
    weights are re-tied instead.
    """
    missing_keys = load_result.missing_keys
    if len(missing_keys) != 0:
        ignored_on_save = self.model._keys_to_ignore_on_save
        only_ignored_keys_missing = ignored_on_save is not None and set(missing_keys) == set(ignored_on_save)
        if only_ignored_keys_missing:
            self.model.tie_weights()
        else:
            logger.warning(f"There were missing keys in the checkpoint model loaded: {missing_keys}.")

    unexpected_keys = load_result.unexpected_keys
    if len(unexpected_keys) != 0:
        logger.warning(f"There were unexpected keys in the checkpoint model loaded: {unexpected_keys}.")
# ---- Saving & Serialization ----
def save_model(self, output_dir: str | None = None, _internal_call: bool = False) -> None:
    """
    Save the model so that it can be reloaded with `from_pretrained()`.

    Depending on the active backend (XLA, SageMaker model parallelism, FSDP, DeepSpeed or none of them), the
    state dict is gathered with the matching API first; outside of XLA, files are then written through `_save`
    on the processes for which `args.should_save` is set.

    Args:
        output_dir (`str`, *optional*):
            Target directory. Defaults to `args.output_dir`.
        _internal_call (`bool`, *optional*, defaults to `False`):
            When `False` and `args.push_to_hub` is set, the saved model is also pushed to the Hub.
    """
    target_dir = self.args.output_dir if output_dir is None else output_dir

    if is_torch_xla_available():
        save_tpu_checkpoint(
            self.model, self.args, self.accelerator, self.processing_class, self.is_fsdp_xla_v1_enabled, target_dir
        )
    elif is_sagemaker_mp_enabled():
        # Calling the state_dict needs to be done on the wrapped model and on all processes.
        os.makedirs(target_dir, exist_ok=True)
        state_dict = self.model_wrapped.state_dict()
        if self.args.should_save:
            self._save(target_dir, state_dict=state_dict)
        Path(os.path.join(target_dir, "user_content.pt")).touch()
    elif self.is_fsdp_enabled:
        fsdp_state_dict_type = str(self.accelerator.state.fsdp_plugin.state_dict_type)
        if "FULL_STATE_DICT" in fsdp_state_dict_type:
            state_dict = self.accelerator.get_state_dict(self.model)
            if self.args.should_save:
                self._save(target_dir, state_dict=state_dict)
    elif self.is_deepspeed_enabled:
        try:
            save_checkpoint_params = inspect.signature(self.model_wrapped.save_checkpoint).parameters
            supports_frozen_exclusion = "exclude_frozen_parameters" in set(save_checkpoint_params.keys())
            uses_zero3 = self.deepspeed.config.get("zero_optimization", {}).get("stage", None) == 3
            if supports_frozen_exclusion and _is_peft_model(self.model) and uses_zero3:
                # When using PEFT with DeepSpeed ZeRO Stage 3,
                # we do not need to load the frozen parameters
                state_dict = self.deepspeed._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters=True)
            else:
                state_dict = self.accelerator.get_state_dict(self.deepspeed)
            if self.args.should_save:
                self._save(target_dir, state_dict=state_dict)
        except ValueError:
            logger.warning(
                " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use"
                " zero_to_fp32.py to recover weights"
            )
            # Save with an empty state dict, drop the resulting dummy weight files, then let DeepSpeed write
            # its own checkpoint.
            if self.args.should_save:
                self._save(target_dir, state_dict={})
            remove_dummy_checkpoint(self.args.should_save, target_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
            self.model_wrapped.save_checkpoint(target_dir)
    elif self.args.should_save:
        self._save(target_dir)

    # Push to the Hub when `save_model` is called by the user.
    if self.args.push_to_hub and not _internal_call:
        self.push_to_hub(commit_message="Model save", revision=self.args.hub_revision)
def _save(self, output_dir: str | None = None, state_dict: dict | None = None) -> None:
    """
    Write the model weights, the processing class and the training arguments to `output_dir`.

    Args:
        output_dir (`str`, *optional*):
            Target directory, created if needed. Defaults to `args.output_dir`.
        state_dict (`dict`, *optional*):
            State dict to save. When `None` and the model is not directly one of the supported classes, it is
            taken from `self.model.state_dict()`.

    No process check is done here: callers decide which process is allowed to write.
    """
    if output_dir is None:
        output_dir = self.args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Saving model checkpoint to {output_dir}")

    supported_classes = (PreTrainedModel, PeftModel) if is_peft_available() else (PreTrainedModel,)

    # Supported models are saved with `save_pretrained()` so that they can be reloaded with `from_pretrained()`.
    if isinstance(self.model, supported_classes):
        self.model.save_pretrained(output_dir, state_dict=state_dict)
    else:
        if state_dict is None:
            state_dict = self.model.state_dict()

        # The model may only be a wrapper around a supported class.
        unwrapped_model = self.accelerator.unwrap_model(self.model, keep_torch_compile=False)
        if isinstance(unwrapped_model, supported_classes):
            unwrapped_model.save_pretrained(output_dir, state_dict=state_dict)
        else:
            logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
            weights_path = os.path.join(output_dir, SAFE_WEIGHTS_NAME)
            safetensors.torch.save_file(state_dict, weights_path, metadata={"format": "pt"})

    if self.processing_class is not None:
        self.processing_class.save_pretrained(output_dir)
    else:
        # Fall back to the tokenizer carried by the data collator, if there is one.
        collator_tokenizer = getattr(self.data_collator, "tokenizer", None)
        if collator_tokenizer is not None:
            logger.info("Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`")
            collator_tokenizer.save_pretrained(output_dir)

    # Good practice: save your training arguments together with the trained model
    torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
# ---- Logging & Metrics ----
def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
"""
Log `logs` on the various objects watching training.
Subclass and override this method to inject custom behavior.
Args:
logs (`dict[str, float]`):
The values to log.
start_time (`Optional[float]`):
The start of training.
"""
if self.state.epoch is not None:
logs["epoch"] = self.state.epoch
if self.args.include_num_input_tokens_seen != "no":
logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen
if start_time is not None:
current_session_num_tokens = self.state.num_input_tokens_seen - self._initial_num_input_tokens_seen
logs.update(speed_metrics("train", start_time, num_tokens=current_session_num_tokens))
output = {**logs, "step": self.state.global_step}
self.state.log_history.append(output)
self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
def store_flos(self) -> None:
    """
    Add the floating-point operations counted in `self.current_flos` to `self.state.total_flos`.

    In distributed mode the counters of all processes are gathered and summed first. The local counter is reset
    to 0 afterwards.
    """
    if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
        gathered_flos = distributed_broadcast_scalars([self.current_flos], device=self.args.device)
        flos_to_add = gathered_flos.sum().item()
    else:
        flos_to_add = self.current_flos

    self.state.total_flos += flos_to_add
    self.current_flos = 0
def floating_point_ops(self, inputs: dict[str, torch.Tensor | Any]) -> int:
"""
For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point
operations for every backward + forward pass. If using another model, either implement such a method in the
model or subclass and override this method.
Args:
inputs (`dict[str, torch.Tensor | Any]`):
The inputs and targets of the model.
Returns:
`int`: The number of floating-point operations.
"""
if (main_input := getattr(self.model, "main_input_name", "input_ids")) in inputs and hasattr(
self.model, "num_parameters"
):
return 6 * inputs[main_input].numel() * self.model.num_parameters(exclude_embeddings=True)
return 0
# ---- Hub Integration ----
def init_hf_repo(self, token: str | None = None) -> None:
"""
Initializes a git repo in `self.args.hub_model_id`.
"""
# Only on process zero
if not self.is_world_process_zero():
return
if self.args.hub_model_id is None:
repo_name = Path(self.args.output_dir).absolute().name
else:
repo_name = self.args.hub_model_id
token = token if token is not None else self.args.hub_token
repo_url = create_repo(repo_name, token=token, private=self.args.hub_private_repo, exist_ok=True)
self.hub_model_id = repo_url.repo_id
self.push_in_progress = None
def create_model_card(
    self,
    language: str | None = None,
    license: str | None = None,
    tags: str | list[str] | None = None,
    model_name: str | None = None,
    finetuned_from: str | None = None,
    tasks: str | list[str] | None = None,
    dataset_tags: str | list[str] | None = None,
    dataset: str | list[str] | None = None,
    dataset_args: str | list[str] | None = None,
) -> None:
    """
    Creates a draft of a model card using the information available to the `Trainer`.

    The card is written to `README.md` in `args.output_dir`, on the world process zero only. When a card already
    exists there and `tags` is given, the tags of the existing card are appended to `tags`. The list passed by
    the caller is left unmodified.

    Args:
        language (`str`, *optional*):
            The language of the model (if applicable)
        license (`str`, *optional*):
            The license of the model. Will default to the license of the pretrained model used, if the original
            model given to the `Trainer` comes from a repo on the Hub.
        tags (`str` or `list[str]`, *optional*):
            Some tags to be included in the metadata of the model card.
        model_name (`str`, *optional*):
            The name of the model.
        finetuned_from (`str`, *optional*):
            The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
            of the original model given to the `Trainer` (if it comes from the Hub).
        tasks (`str` or `list[str]`, *optional*):
            One or several task identifiers, to be included in the metadata of the model card.
        dataset_tags (`str` or `list[str]`, *optional*):
            One or several dataset tags, to be included in the metadata of the model card.
        dataset (`str` or `list[str]`, *optional*):
            One or several dataset identifiers, to be included in the metadata of the model card.
        dataset_args (`str` or `list[str]`, *optional*):
            One or several dataset arguments, to be included in the metadata of the model card.
    """
    if not self.is_world_process_zero():
        return

    model_card_filepath = os.path.join(self.args.output_dir, "README.md")
    is_peft_library = False
    if os.path.exists(model_card_filepath):
        # Parse the existing card once; its metadata serves both checks below.
        existing_card_data = ModelCard.load(model_card_filepath).data
        is_peft_library = existing_card_data.get("library_name") == "peft"

        # Append existing tags in `tags`
        existing_tags = existing_card_data.tags
        if tags is not None and existing_tags is not None:
            # Work on a copy so that the list owned by the caller is never mutated.
            tags = [tags] if isinstance(tags, str) else list(tags)
            for tag in existing_tags:
                if tag not in tags:
                    tags.append(tag)

    training_summary = TrainingSummary.from_trainer(
        self,
        language=language,
        license=license,
        tags=tags,
        model_name=model_name,
        finetuned_from=finetuned_from,
        tasks=tasks,
        dataset_tags=dataset_tags,
        dataset=dataset,
        dataset_args=dataset_args,
    )
    model_card = training_summary.to_model_card()
    # Write UTF-8 explicitly rather than the locale default, so non-ASCII card content does not depend on the
    # platform encoding.
    with open(model_card_filepath, "w", encoding="utf-8") as f:
        f.write(model_card)

    if is_peft_library:
        self.accelerator.unwrap_model(self.model).create_or_update_model_card(self.args.output_dir)
def push_to_hub(
    self,
    commit_message: str | None = "End of training",
    blocking: bool = True,
    token: str | None = None,
    revision: str | None = None,
    **kwargs,
) -> CommitInfo:
    """
    Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`.

    The model is saved and a model card is created before the content of `args.output_dir` is uploaded. Any
    push still in progress is waited for first.

    Parameters:
        commit_message (`str`, *optional*, defaults to `"End of training"`):
            Message to commit while pushing.
        blocking (`bool`, *optional*, defaults to `True`):
            Whether the function should return only when the upload has finished.
        token (`str`, *optional*, defaults to `None`):
            Token with write permission to overwrite Trainer's original args.
        revision (`str`, *optional*):
            The git revision to commit from. When `None`, `self.args.hub_revision` is used.
        kwargs (`dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to [`~Trainer.create_model_card`]. A `tags` list passed
            here is not modified: the tags of the model are merged into a copy.

    Returns:
        The result of the upload if `blocking=True`, or a `Future` object tracking the progress of the commit if
        `blocking=False`. Processes other than the world process zero return `None`.
    """
    self.callback_handler.on_push_begin(self.args, self.state, self.control)

    model_name = kwargs.pop("model_name", None)
    if model_name is None and self.args.should_save:
        if self.args.hub_model_id is None:
            model_name = Path(self.args.output_dir).name
        else:
            model_name = self.args.hub_model_id.split("/")[-1]
    token = token if token is not None else self.args.hub_token

    # In case the user calls this method with args.push_to_hub = False
    if self.hub_model_id is None:
        self.init_hf_repo(token=token)

    # Needs to be executed on all processes for TPU training, but will only save on the processed determined by
    # self.args.should_save.
    self.save_model(_internal_call=True)

    # Only push from one node.
    if not self.is_world_process_zero():
        return

    # Add additional tags in the case the model has already some tags and users pass
    # "tags" argument to `push_to_hub` so that trainer automatically handles internal tags
    # from all models since Trainer does not call `model.push_to_hub`.
    if getattr(self.model, "model_tags", None) is not None:
        # Normalize the requested tags (missing, `None`, a single string or an iterable of strings) into a new
        # list, so that the object passed by the caller is never mutated.
        requested_tags = kwargs.get("tags")
        if requested_tags is None:
            merged_tags = []
        elif isinstance(requested_tags, str):
            merged_tags = [requested_tags]
        else:
            merged_tags = list(requested_tags)

        for model_tag in self.model.model_tags:
            if model_tag not in merged_tags:
                merged_tags.append(model_tag)
        kwargs["tags"] = merged_tags

    self.create_model_card(model_name=model_name, **kwargs)

    if revision is None:
        revision = self.args.hub_revision

    # Wait for the current upload to be finished.
    self._finish_current_push()
    return upload_folder(
        repo_id=self.hub_model_id,
        folder_path=self.args.output_dir,
        commit_message=commit_message,
        token=token,
        run_as_future=not blocking,
        ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
        revision=revision,
    )
def _push_from_checkpoint(self, checkpoint_folder: str) -> None:
    """
    Push the model files of `checkpoint_folder`, and optionally the checkpoint itself, to the Hub.

    Only runs on the world process zero and when the hub strategy is not `HubStrategy.END`. Unless
    `args.hub_always_push` is set, the push is skipped while a previous one is still running. Uploads are
    started as futures and tracked through `self.push_in_progress`.
    """
    if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END:
        return
    # If we haven't finished the last push, we don't do this one unless args.hub_always_push=True.
    if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
        return

    self.callback_handler.on_push_begin(self.args, self.state, self.control)

    output_dir = self.args.output_dir
    # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
    files_to_copy = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
    # Sharded checkpoints: add the index and every shard it references.
    for index_name in (WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME):
        index_path = os.path.join(checkpoint_folder, index_name)
        if not os.path.isfile(index_path):
            continue
        files_to_copy.append(index_name)
        with open(index_path) as index_handle:
            shard_index = json.load(index_handle)
        files_to_copy.extend(list(set(shard_index["weight_map"].values())))
    if is_peft_available():
        files_to_copy.extend([ADAPTER_CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME])

    for file_name in files_to_copy:
        source_path = os.path.join(checkpoint_folder, file_name)
        if os.path.isfile(source_path):
            shutil.copy(source_path, os.path.join(output_dir, file_name))

    # Saving the processing class is fast and we don't know how many files it may have spawned, so we resave it
    # to be sure.
    if self.processing_class is not None:
        self.processing_class.save_pretrained(output_dir)
    # Same for the training arguments
    torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

    commit_message = (
        f"Training in progress, step {self.state.global_step}"
        if self.args.save_strategy == SaveStrategy.STEPS
        else f"Training in progress, epoch {int(self.state.epoch)}"
    )

    push_jobs = [
        upload_folder(
            repo_id=self.hub_model_id,
            folder_path=output_dir,
            commit_message=commit_message,
            token=self.args.hub_token,
            run_as_future=True,
            ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
            revision=self.args.hub_revision,
        )
    ]

    if self.args.hub_strategy in (HubStrategy.CHECKPOINT, HubStrategy.ALL_CHECKPOINTS):
        # A single rolling folder for `CHECKPOINT`, one folder per checkpoint for `ALL_CHECKPOINTS`.
        if self.args.hub_strategy == HubStrategy.CHECKPOINT:
            path_in_repo = "last-checkpoint"
        else:
            path_in_repo = Path(checkpoint_folder).name
        push_jobs.append(
            upload_folder(
                repo_id=self.hub_model_id,
                folder_path=checkpoint_folder,
                path_in_repo=path_in_repo,
                commit_message=commit_message + ", checkpoint",
                token=self.args.hub_token,
                run_as_future=True,
                revision=self.args.hub_revision,
            )
        )

    # Start a new tracker, unless an unfinished one exists: in that case the new jobs are attached to it.
    if self.push_in_progress is not None and not self.push_in_progress.is_done():
        self.push_in_progress.jobs.extend(push_jobs)
    else:
        self.push_in_progress = PushInProgress(push_jobs)
def _finish_current_push(self) -> None:
    """Block until the push tracked by `self.push_in_progress`, if any, is done."""
    # The attribute may not exist at all; that case is handled like `None`.
    pending_push = getattr(self, "push_in_progress", None)
    if pending_push is None or pending_push.is_done():
        return

    logger.info("Waiting for the current checkpoint push to be finished, this might take a couple of minutes.")
    pending_push.wait_until_done()
# ---- Hyperparameter Search ----
def hyperparameter_search(
    self,
    hp_space: Callable[["optuna.Trial"], dict[str, float]] | None = None,
    compute_objective: Callable[[dict[str, float]], float] | None = None,
    n_trials: int = 20,
    direction: str | list[str] = "minimize",
    backend: str | HPSearchBackend | None = None,
    hp_name: Callable[["optuna.Trial"], str] | None = None,
    **kwargs,
) -> BestRun | list[BestRun]:
    """
    Launch a hyperparameter search using `optuna` or `Ray Tune`. The optimized quantity is determined
    by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided,
    the sum of all metrics otherwise.

    To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to
    reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to
    subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom
    optimizer/scheduler.

    Args:
        hp_space (`Callable[["optuna.Trial"], dict[str, float]]`, *optional*):
            A function that defines the hyperparameter search space. Will default to
            [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`]
            depending on your backend.
        compute_objective (`Callable[[dict[str, float]], float]`, *optional*):
            A function computing the objective to minimize or maximize from the metrics returned by the `evaluate`
            method. Will default to [`~trainer_utils.default_compute_objective`].
        n_trials (`int`, *optional*, defaults to 20):
            The number of trial runs to test.
        direction (`str` or `list[str]`, *optional*, defaults to `"minimize"`):
            If it's single objective optimization, direction is `str`, can be `"minimize"` or `"maximize"`, you
            should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or
            several metrics. If it's multi objectives optimization, direction is `list[str]`, can be List of
            `"minimize"` and `"maximize"`, you should pick `"minimize"` when optimizing the validation loss,
            `"maximize"` when optimizing one or several metrics.
        backend (`str` or [`~trainer_utils.HPSearchBackend`], *optional*):
            The backend to use for hyperparameter search. Will default to optuna or Ray Tune, depending
            on which one is installed. If all are installed, will default to optuna.
        hp_name (`Callable[["optuna.Trial"], str]`, *optional*):
            A function that defines the trial/run name. Will default to None.
        kwargs (`dict[str, Any]`, *optional*):
            Additional keyword arguments for each backend:

            - `optuna`: parameters from
              [optuna.study.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
              and also the parameters `timeout`, `n_jobs` and `gc_after_trial` from
              [optuna.study.Study.optimize](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize)
            - `ray`: parameters from [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run).
              If `resources_per_trial` is not set in the `kwargs`, it defaults to 1 CPU core and 1 GPU (if available).
              If `progress_reporter` is not set in the `kwargs`,
              [ray.tune.CLIReporter](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.CLIReporter.html) is used.

    Returns:
        [`trainer_utils.BestRun` or `list[trainer_utils.BestRun]`]: All the information about the best run or best
        runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray
        backend.

    Raises:
        RuntimeError: If the trainer was created without a `model_init` function.
    """
    if backend is None:
        backend = default_hp_search_backend()
    backend = HPSearchBackend(backend)
    backend_obj = ALL_HYPERPARAMETER_SEARCH_BACKENDS[backend]()
    backend_obj.ensure_available()

    if self.model_init is None:
        raise RuntimeError(
            "To use hyperparameter search, you need to pass your model through a model_init function."
        )

    # The backend is recorded on the trainer only once all preconditions hold, and it is always cleared again:
    # a failed search must not leave the trainer flagged as being in the middle of a hyperparameter search.
    self.hp_search_backend = backend
    try:
        self.hp_space = backend_obj.default_hp_space if hp_space is None else hp_space
        self.hp_name = hp_name
        self.compute_objective = default_compute_objective if compute_objective is None else compute_objective

        best_run = backend_obj.run(self, n_trials, direction, **kwargs)
    finally:
        self.hp_search_backend = None
    return best_run
def call_model_init(self, trial: "optuna.Trial | dict[str, Any] | None" = None) -> nn.Module:
    """
    Build a fresh model by calling the user-supplied `model_init`.

    Args:
        trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
            The current hyperparameter search trial. It is forwarded to `model_init` only when
            `number_of_arguments` reports that the callable takes exactly one argument.

    Returns:
        `nn.Module`: The object returned by `model_init`.

    Raises:
        RuntimeError: If `number_of_arguments` reports anything other than 0 or 1 for `model_init`, or if
            `model_init` returns `None`.
    """
    arity = number_of_arguments(self.model_init)
    if arity not in (0, 1):
        raise RuntimeError("model_init should have 0 or 1 argument.")

    # Only a one-argument `model_init` receives the trial.
    fresh_model = self.model_init(trial) if arity == 1 else self.model_init()
    if fresh_model is None:
        raise RuntimeError("model_init should not return None.")
    return fresh_model
def _hp_search_setup(self, trial: "optuna.Trial | dict[str, Any] | None") -> None:
    """
    Apply the hyperparameters of a search trial to `self.args` and refresh the accelerator state.

    The trial is always stored on `self._trial`. Nothing else happens when no search backend is active or when
    `trial` is `None`.

    Args:
        trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
            The trial to set up. With the Optuna backend it is passed to `self.hp_space` to obtain the
            parameters; with the Ray and W&B backends it is used directly as the parameter mapping.

    Raises:
        ValueError: If DeepSpeed is enabled but `args.deepspeed` is not set.
    """
    self._trial = trial
    backend = self.hp_search_backend
    if backend is None or trial is None:
        return

    if backend == HPSearchBackend.OPTUNA:
        params = self.hp_space(trial)
    elif backend == HPSearchBackend.RAY:
        params = trial
        # Remove any `wandb` entry before applying the parameters; this mutates the dict passed as `trial`.
        params.pop("wandb", None)
    elif backend == HPSearchBackend.WANDB:
        params = trial

    for key, raw_value in params.items():
        if not hasattr(self.args, key):
            logger.warning(
                f"Trying to set {key} in the hyperparameter search but there is no corresponding field in"
                " `TrainingArguments`."
            )
            continue
        current = getattr(self.args, key, None)
        # Cast to the type of the value currently stored, unless that value is `None`.
        new_value = raw_value if current is None else type(current)(raw_value)
        setattr(self.args, key, new_value)

    if backend == HPSearchBackend.OPTUNA:
        logger.info(f"Trial: {trial.params}")
    elif backend == HPSearchBackend.WANDB:
        logger.info(f"W&B Sweep parameters: {trial}")

    if self.is_deepspeed_enabled:
        if self.args.deepspeed is None:
            raise ValueError("For sweeps with deepspeed, `args.deepspeed` must be set")

        self.accelerator.free_memory()

        # Rebuild the deepspeed config so it reflects the training arguments updated above.
        from accelerate.utils import DeepSpeedPlugin

        from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

        ds_config = HfTrainerDeepSpeedConfig(self.args.deepspeed)
        self.args.hf_deepspeed_config = ds_config
        self.args.hf_deepspeed_config.trainer_config_process(self.args)
        self.args.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.args.hf_deepspeed_config)

        # From 1.0 on, the DS plugin has to be fully wiped when doing sweeps.
        # Calling `_reset_state` is enough and does not need a version pin.
        AcceleratorState()._reset_state()

    # `train_batch_size` might change when using HPO https://github.com/huggingface/transformers/pull/18918
    self._train_batch_size = self.args.train_batch_size
    self.create_accelerator_and_postprocess()
def _report_to_hp_search(
    self, trial: "optuna.Trial | dict[str, Any] | None", step: int, metrics: dict[str, float]
) -> None:
    """
    Forward intermediate metrics to the hyperparameter search backend currently in use.

    Does nothing when no search backend is active or when `trial` is `None`. Otherwise the objective is
    computed with `self.compute_objective` and stored on `self.objective` before being reported.

    Args:
        trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
            The trial being run.
        step (`int`):
            The step passed to `trial.report` with the Optuna backend.
        metrics (`dict[str, float]`):
            The metrics to report. The dict is copied, so the caller's object is left untouched.

    Raises:
        optuna.TrialPruned: With the Optuna backend, when the trial reports that it should be pruned.
    """
    backend = self.hp_search_backend
    if backend is None or trial is None:
        return

    # Work on a copy: the Ray branch below adds an "objective" entry.
    reported = metrics.copy()
    self.objective = self.compute_objective(reported)

    if backend == HPSearchBackend.OPTUNA:
        import optuna

        # Intermediate values are only reported when the trial exposes a single-objective study.
        single_objective = hasattr(trial, "study") and not trial.study._is_multi_objective()
        if single_objective:
            trial.report(self.objective, step)
            if trial.should_prune():
                # Let the callbacks finish cleanly before the pruning exception unwinds training.
                self.callback_handler.on_train_end(self.args, self.state, self.control)
                raise optuna.TrialPruned()
    elif backend == HPSearchBackend.RAY:
        import ray.tune

        with tempfile.TemporaryDirectory() as scratch_dir:
            ray_checkpoint = None
            if self.control.should_save:
                self._tune_save_checkpoint(checkpoint_dir=scratch_dir)
                ray_checkpoint = ray.tune.Checkpoint.from_directory(scratch_dir)
            reported["objective"] = self.objective
            # Report while the temporary directory still exists.
            ray.tune.report(reported, checkpoint=ray_checkpoint)
def _tune_save_checkpoint(self, checkpoint_dir: str) -> None:
    """
    Write a checkpoint for the current step of a Ray Tune trial.

    The model is saved through `save_model` into a `{PREFIX_CHECKPOINT_DIR}-{global_step}` sub-folder of
    `checkpoint_dir`. When `args.should_save` is set, the trainer state, optimizer state and scheduler state
    are written next to it.

    Args:
        checkpoint_dir (`str`):
            Directory in which the step-specific checkpoint folder is created.
    """
    step_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
    self.save_model(step_dir, _internal_call=True)
    if not self.args.should_save:
        return

    # Record the current `TrainerControl` state in the trainer state before serializing it.
    self.state.stateful_callbacks["TrainerControl"] = self.control.state()
    state_path = os.path.join(step_dir, TRAINER_STATE_NAME)
    self.state.save_to_json(state_path)

    optimizer_path = os.path.join(step_dir, OPTIMIZER_NAME)
    torch.save(self.optimizer.state_dict(), optimizer_path)

    scheduler_path = os.path.join(step_dir, SCHEDULER_NAME)
    torch.save(self.lr_scheduler.state_dict(), scheduler_path)
# ---- Callbacks ----
def add_callback(self, callback: type[TrainerCallback] | TrainerCallback) -> None:
    """
    Register an additional [`~transformers.TrainerCallback`] with this trainer.

    Args:
        callback (`type` or [`~transformers.TrainerCallback`]):
            Either a [`~transformers.TrainerCallback`] class or an instance of one. When a class is given, a
            member of that class is instantiated.
    """
    handler = self.callback_handler
    handler.add_callback(callback)
def pop_callback(self, callback: type[TrainerCallback] | TrainerCallback) -> TrainerCallback | None:
    """
    Detach a [`~transformers.TrainerCallback`] from this trainer and hand it back to the caller.

    No error is raised when the callback is not registered; `None` is returned instead.

    Args:
        callback (`type` or [`~transformers.TrainerCallback`]):
            Either a [`~transformers.TrainerCallback`] class or an instance of one. When a class is given, the
            first member of that class found in the list of callbacks is popped.

    Returns:
        [`~transformers.TrainerCallback`]: The callback that was removed, if one was found.
    """
    removed = self.callback_handler.pop_callback(callback)
    return removed
def remove_callback(self, callback: type[TrainerCallback] | TrainerCallback) -> None:
    """
    Detach a [`~transformers.TrainerCallback`] from this trainer without returning it.

    Args:
        callback (`type` or [`~transformers.TrainerCallback`]):
            Either a [`~transformers.TrainerCallback`] class or an instance of one. When a class is given, the
            first member of that class found in the list of callbacks is removed.
    """
    handler = self.callback_handler
    handler.remove_callback(callback)
# ---- Utilities ----
def is_local_process_zero(self) -> bool:
    """
    Tell whether this process is the main process of its own machine.

    When training is distributed over several machines, each machine has one such local main process. The
    check compares `args.local_process_index` with 0.
    """
    local_index = self.args.local_process_index
    return local_index == 0
def is_world_process_zero(self) -> bool:
    """
    Tell whether this process is the global main process.

    When training is distributed over several machines, this is `True` for exactly one process.
    """
    if not is_sagemaker_mp_enabled():
        return self.args.process_index == 0
    # With SageMaker ModelParallel, `process_index` is the dp_process_index rather than the global process
    # index, so the rank is asked from `smp` directly.
    return smp.rank() == 0
def _move_model_to_device(self, model: nn.Module, device: torch.device) -> None:
    """
    Place `model` on `device`, re-tying its weights afterwards when running on TPU.

    Args:
        model (`nn.Module`):
            The model to move. A model carrying a non-`None` `hf_device_map` attribute is left where it is
            and only a warning is logged.
        device (`torch.device`):
            The target device, passed to `model.to`.
    """
    already_dispatched = getattr(model, "hf_device_map", None) is not None
    if already_dispatched:
        logger.warning(
            "The model is already on multiple devices. Skipping the move to device specified in `args`."
        )
        return

    model = model.to(device)
    # Moving a model to an XLA device disconnects the tied weights, so they have to be tied again.
    needs_retie = self.args.parallel_mode == ParallelMode.TPU and hasattr(model, "tie_weights")
    if needs_retie:
        model.tie_weights()