| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
7037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117 |
- # Copyright 2023 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import copy
- import inspect
- import json
- import os
- import re
- from dataclasses import replace
- from typing import TYPE_CHECKING, Any, Literal, Optional
- from ..conversion_mapping import (
- _MODEL_TO_CONVERSION_PATTERN,
- get_checkpoint_conversion_mapping,
- get_model_conversion_mapping,
- )
- from ..core_model_loading import (
- Concatenate,
- ConversionOps,
- MergeModulelist,
- Transpose,
- WeightConverter,
- WeightRenaming,
- )
- from ..utils import (
- CONFIG_NAME,
- cached_file,
- check_peft_version,
- extract_commit_hash,
- find_adapter_config_file,
- is_accelerate_available,
- is_peft_available,
- is_torch_available,
- logging,
- )
- from ..utils.hub import DownloadKwargs
- from ..utils.loading_report import log_state_dict_report
- if is_torch_available():
- import torch
- if is_accelerate_available():
- from accelerate import dispatch_model
- from accelerate.utils import get_balanced_memory, infer_auto_device_map
- # Minimum PEFT version supported for the integration
- MIN_PEFT_VERSION = "0.18.0"
- logger = logging.get_logger(__name__)
- if TYPE_CHECKING:
- from ..modeling_utils import LoadStateDictConfig
- # TODO: remove once PEFT < 0.19 no longer supported
def _block_diag_3d(tensors: list[torch.Tensor]) -> torch.Tensor:
    """Build a block-diagonal tensor, expert by expert, from several 3d tensors.

    For every index ``i`` along dim 0 the ``i``-th slices of all tensors are combined with
    ``torch.block_diag``; the resulting 2d matrices are stacked again along dim 0.

    Args:
        tensors: at least two 3d tensors sharing the same size along dim 0.

    Returns:
        A 3d tensor whose dim 0 has the shared size and whose trailing two dims are the sums of the
        corresponding dims of the inputs.

    Raises:
        ValueError: if fewer than two tensors are passed, if a tensor is not 3d, or if the sizes along
            dim 0 differ.
    """
    count = len(tensors)
    if count < 2:
        raise ValueError(f"_block_diag_3d expects at least 2 tensors, got {count}")
    if not all(t.dim() == 3 for t in tensors):
        raise ValueError("_block_diag_3d expects all tensors to be 3d.")

    num_experts = tensors[0].shape[0]
    if not all(t.shape[0] == num_experts for t in tensors):
        raise ValueError("All tensors passed to _block_diag_3d must have the same number of experts.")

    # One block-diagonal matrix per expert, stacked back into a leading expert dimension.
    per_expert = [torch.block_diag(*(t[idx] for t in tensors)) for idx in range(num_experts)]
    return torch.stack(per_expert, dim=0)
- # TODO: remove once PEFT < 0.19 no longer supported
class PeftConcatenate(Concatenate):
    """Fuse per-weight LoRA B matrices block-diagonally.

    When base weights are fused, e.g. W01 = [W0, W1], the LoRA weights have to be fused as well. For LoRA A a
    plain concatenation along the rank dimension is enough, so no dedicated operation is needed. LoRA B, however,
    must be merged block-diagonally to give the correct result.

    Illustration:

    Before:
        W0' = W0 + A0 @ B0
        W1' = W1 + A1 @ B1

    After:
        W01' = W01 + A01 @ B01_bd
        where:
            A01 = [A0, A1]
            B01_bd = [[B0, 0], [0, B1]]

    This class performs the block-diagonal merge of LoRA B. With N fused weights:

    1. LoRA B is 2-dim
        Regular LoRA weight of shape (out_feat, rank); the output has shape (N * out_feat, N * rank).
    2. LoRA B is 3-dim
        MoE LoRA weight of shape (experts, out_feat, rank); the block-diagonal result has shape
        (experts, N * out_feat, N * rank). Afterwards the experts and rank dimensions are flattened into one,
        as PEFT expects 2d tensors for LoRA.
    """

    @torch.no_grad
    def convert(
        self,
        input_dict: dict[str, list[torch.Tensor]],
        source_patterns: list[str],
        target_patterns: list[str],
        full_layer_name: str,
        **kwargs,
    ) -> dict[str, list[torch.Tensor]]:
        """Merge the collected tensors block-diagonally and return them under ``full_layer_name``.

        Raises:
            ValueError: if the tensors are not all 2d or all 3d, or if ``input_dict`` holds keys that are not
                listed in ``source_patterns``.
        """
        ranks = {tensor.dim() for tensor in input_dict.values()}
        if ranks != {2} and ranks != {3}:
            raise ValueError(
                f"To convert this LoRA adapter, the LoRA weights all need to have either 2 or 3 dims, got {ranks}"
            )

        # Follow the order of source_patterns (e.g. w1 before w3 for Mixtral) so gate/up semantics are kept.
        ordered_tensors = [input_dict[pattern] for pattern in source_patterns if pattern in input_dict]
        if len(ordered_tensors) != len(input_dict):
            unexpected = set(input_dict) - set(source_patterns)
            raise ValueError(
                "Collected tensors contain keys not present in source_patterns. "
                f"Unexpected keys: {sorted(unexpected)}; source_patterns={source_patterns}"
            )

        if ranks == {2}:
            fused = torch.block_diag(*ordered_tensors)
        else:
            # with r being the LoRA rank and n being the number of fused weights:
            fused = _block_diag_3d(ordered_tensors)  # shape = experts, n*out_feat, n*r
            fused = fused.permute(2, 0, 1)  # shape = n*r, experts, n*out_feat
            fused = fused.flatten(0, 1)  # shape = n*r * experts, n*out_feat
            fused = fused.T  # shape = n*out_feat, n*r * experts
        return {full_layer_name: fused}

    @property
    def reverse_op(self) -> ConversionOps:
        raise NotImplementedError("Reversing PEFT LoRA MoE conversions is not supported yet.")
- # TODO: remove once PEFT < 0.19 no longer supported
class FlattenDims(ConversionOps):
    """
    Flatten the tensors along the given dimensions.

    ``dims`` is forwarded positionally to ``Tensor.flatten``: a single value is used as the start dimension, a
    pair as (start, end).
    """

    def __init__(self, dims: int | tuple[int, ...]):
        # Normalize a bare int to a 1-tuple so that it can be unpacked in `convert`.
        if isinstance(dims, int):
            dims = (dims,)
        self.dims = dims

    @torch.no_grad
    def convert(
        self,
        input_dict: dict[str, list[torch.Tensor]],
        source_patterns: list[str],
        target_patterns: list[str],
        config,
        **kwargs,
    ) -> dict[str, list[torch.Tensor]]:
        """Return a new dict with the same keys where each value has been flattened over ``self.dims``."""
        return {key: tensor.flatten(*self.dims) for key, tensor in input_dict.items()}

    @property
    def reverse_op(self) -> ConversionOps:
        raise NotImplementedError("Reversing flattening operation is not supported.")

    def __repr__(self):
        return f"{self.__class__.__name__}(dims={self.dims})"
- # TODO: remove once PEFT < 0.19 no longer supported
class PermuteDims(ConversionOps):
    """
    Permute the dimensions of the tensors into the given order.

    ``dims`` is forwarded positionally to ``Tensor.permute``.
    """

    def __init__(self, dims: tuple[int, ...]):
        self.dims = dims

    @torch.no_grad
    def convert(
        self,
        input_dict: dict[str, list[torch.Tensor]],
        source_patterns: list[str],
        target_patterns: list[str],
        config,
        **kwargs,
    ) -> dict[str, list[torch.Tensor]]:
        """Return a new dict with the same keys where each value has been permuted according to ``self.dims``."""
        return {key: tensor.permute(*self.dims) for key, tensor in input_dict.items()}

    @property
    def reverse_op(self) -> ConversionOps:
        # The previous message was copied from FlattenDims and wrongly mentioned flattening.
        raise NotImplementedError("Reversing permute operation is not supported yet.")

    def __repr__(self):
        return f"{self.__class__.__name__}(dims={self.dims})"
- # TODO: remove once PEFT < 0.19 no longer supported
def _build_lora_conversion(orig_conversion, lora: str, adapter_name: str, target_prefix: str, operations: list):
    """Clone ``orig_conversion`` so that it collects the ``lora`` weights and writes them to the PEFT parameter."""
    # TODO: this assumption may not hold for models != mixtral
    # For the source we keep the original pattern but swap its last component (e.g. "weight") for the LoRA
    # weights. Note: the source state_dict does *not* contain the adapter name.
    source_patterns = [f"{pattern.rsplit('.', 1)[0]}.{lora}.*" for pattern in orig_conversion.source_patterns]
    # '.weight' is appended because the parameter is targeted directly.
    target_patterns = [f"{target_prefix}.{lora}.{adapter_name}.weight"]
    # Instantiate through the original class so that patterns are post-processed the same way.
    return orig_conversion.__class__(
        source_patterns=source_patterns,
        target_patterns=target_patterns,
        distributed_operation=orig_conversion.distributed_operation,
        quantization_operation=orig_conversion.quantization_operation,
        operations=operations,
    )


def build_peft_weight_mapping(
    weight_conversions: list[WeightConverter | WeightRenaming] | None, adapter_name: str, peft_config=None
) -> list[WeightConverter | WeightRenaming]:
    """Derive the weight conversions for a PEFT adapter from the conversions of the base model.

    The operations of the original model are iterated and edited so that they apply to the PEFT adapter weights
    when appropriate.

    Note: This function is used in PEFT, changing it requires coordination.

    Args:
        weight_conversions: conversions of the base model; when empty or `None`, an empty list is returned.
        adapter_name: name inserted into the target keys.
        peft_config: optional config; its `peft_type` (if known to PEFT) selects the single parameter prefix to
            rename, otherwise all prefixes known to PEFT are used.

    Returns:
        The renamings (strip "base_model.model", add adapter name), all original `WeightRenaming` entries, and
        LoRA A/B converters for each converter with a single target ending in "gate_up_proj" or "down_proj".
        Other converters are not carried over.
    """
    if not weight_conversions:
        return []

    from peft.mapping import PEFT_TYPE_TO_PREFIX_MAPPING

    # strip "base_model.model" and add adapter name
    new_weight_conversions = [WeightRenaming("base_model.model.model.", "model.")]

    peft_type = getattr(peft_config, "peft_type", None)
    if peft_type in PEFT_TYPE_TO_PREFIX_MAPPING:
        prefixes = {PEFT_TYPE_TO_PREFIX_MAPPING[peft_type]}
    else:
        prefixes = set(PEFT_TYPE_TO_PREFIX_MAPPING.values())

    for prefix in sorted(prefixes):
        escaped_prefix = re.escape(prefix)
        new_weight_conversions.append(
            WeightRenaming(
                source_patterns=rf"({escaped_prefix}[^\.]*)",
                target_patterns=rf"\1.{adapter_name}",
            )
        )

    for orig_conversion in weight_conversions:
        if isinstance(orig_conversion, WeightRenaming):
            new_weight_conversions.append(orig_conversion)
            continue

        if len(orig_conversion.target_patterns) != 1:
            continue
        target = orig_conversion.target_patterns[0]

        if target.endswith("gate_up_proj"):
            # gate_up_proj requires both merging the experts and concatenating for the fusion of w1 and w3.
            # It is the inner PEFT ParamWrapper, so the target has to go through base_layer.
            target_prefix = target.replace("gate_up_proj", "base_layer")
            for lora in ("lora_A", "lora_B"):  # TODO: lora_embedding_A and lora_embedding_B
                peft_weight_operations = []
                for op in orig_conversion.operations:
                    if isinstance(op, Concatenate):
                        if lora == "lora_B":  # block diagonal concat
                            peft_weight_operations.append(PeftConcatenate(dim=op.dim))
                        else:  # normal concat + flatten
                            peft_weight_operations.append(op)
                            peft_weight_operations.append(FlattenDims(dims=(0, 1)))
                    elif isinstance(op, MergeModulelist):
                        peft_weight_operations.append(op)
                new_weight_conversions.append(
                    _build_lora_conversion(orig_conversion, lora, adapter_name, target_prefix, peft_weight_operations)
                )

        elif target.endswith("down_proj"):
            # down_proj only requires merging of experts.
            # It is the outer PEFT ParamWrapper, so the ".down_proj" suffix is dropped from the target.
            target_prefix = target.replace(".down_proj", "")
            for lora in ("lora_A", "lora_B"):  # TODO: lora_embedding_A and lora_embedding_B
                peft_weight_operations = []
                for op in orig_conversion.operations:
                    if isinstance(op, MergeModulelist):
                        # NOTE(review): the reshaping ops are attached right after the expert merge; confirm
                        # this nesting against the upstream file if a converter ever has several merge ops.
                        peft_weight_operations.append(op)
                        if lora == "lora_A":
                            peft_weight_operations.append(FlattenDims(dims=(0, 1)))
                        else:
                            peft_weight_operations.append(PermuteDims(dims=(2, 0, 1)))
                            peft_weight_operations.append(FlattenDims(dims=(0, 1)))
                            peft_weight_operations.append(Transpose(dim0=0, dim1=1))
                new_weight_conversions.append(
                    _build_lora_conversion(orig_conversion, lora, adapter_name, target_prefix, peft_weight_operations)
                )

    return new_weight_conversions
# The main reason we have to make this explicit is that the conversion mapping
# has the full layer name, while the config does not. We could regex match but
# this is more explicit and less error prone.
# Keys of the outer dict are model types; the inner dict maps a target module name
# as it appears in the adapter config to the corresponding name in the converted model.
# Note: this is used in PEFT, changing it requires coordination.
# TODO: remove once PEFT < 0.19 no longer supported
_MOE_TARGET_MODULE_MAPPING: dict[str, dict[str, str]] = {
    "mixtral": {
        "gate": "gate.weight",
        "w1": "gate_up_proj",
        "w3": "gate_up_proj",
        "w2": "down_proj",
    },
    "qwen2_moe": {
        "gate": "gate.weight",
        "gate_proj": "gate_up_proj",
        "up_proj": "gate_up_proj",
        "down_proj": "down_proj",
    },
}
# For each model type: fused target name -> the original module names that are fused into it.
# Note: this is used in PEFT, changing it requires coordination.
# TODO: remove once PEFT < 0.19 no longer supported
_MOE_FUSED_TARGETS: dict[str, dict[str, list[str]]] = {
    # use lists for dict values to ensure stable order
    "mixtral": {"gate_up_proj": ["w1", "w3"]},
    "qwen2_moe": {"gate_up_proj": ["gate_proj", "up_proj"]},
}
- # TODO: remove once PEFT < 0.19 no longer supported
def patch_moe_parameter_targeting(model, peft_config):
    """Patch PEFT's `ParamWrapper.update_layer` for models whose expert weights are not (expert, in, out).

    PEFT currently assumes that expert layers are of shape
        (expert, in, out)
    but with Mixtral in transformers v5 this is not true anymore.
    This will be addressed in PEFT >0.19; until then it is handled here.

    The patch is only installed when a checkpoint conversion mapping exists for the model type. It is installed
    at most once: the wrapper carries a marker attribute, so repeated calls (e.g. one per loaded adapter) do not
    stack wrappers on top of each other.

    Args:
        model: the model whose `config.model_type` decides whether patching is needed.
        peft_config: not used by this function.
    """
    from functools import wraps

    import peft

    model_type = getattr(model.config, "model_type", None)
    if get_checkpoint_conversion_mapping(model_type) is None:
        return

    param_wrapper_cls = peft.tuners.lora.layer.ParamWrapper
    update_layer = param_wrapper_cls.update_layer
    if getattr(update_layer, "_hf_swaps_moe_in_out_features", False):
        # Already patched by an earlier call; wrapping again would only add call overhead.
        return

    @wraps(update_layer)
    def new_update_layer(layer, *args, **kwargs):
        # Swap in/out features exactly once per layer before delegating to the original implementation.
        did_swap = getattr(layer, "_did_swap_in_out_features", False)
        if not did_swap and layer.parameter_name in ("down_proj", "gate_up_proj"):
            layer.in_features, layer.out_features = layer.out_features, layer.in_features
            layer._did_swap_in_out_features = True
        return update_layer(layer, *args, **kwargs)

    new_update_layer._hf_swaps_moe_in_out_features = True
    param_wrapper_cls.update_layer = new_update_layer
- class PeftAdapterMixin:
- """
- A class containing all functions for loading and using adapters weights that are supported in PEFT library. For
- more details about adapters and injecting them on a transformer-based model, check out the documentation of PEFT
- library: https://huggingface.co/docs/peft/index
- Currently supported PEFT methods are all non-prompt learning methods (LoRA, IA³, etc.). Other PEFT models such as
- prompt tuning, prompt learning are out of scope as these adapters are not "injectable" into a torch module. For
- using these methods, please refer to the usage guide of PEFT library.
- With this mixin, if the correct PEFT version is installed (>= 0.18.0), it is possible to:
- - Load an adapter stored on a local path or in a remote Hub repository, and inject it in the model
- - Attach new adapters in the model and train them with Trainer or by your own.
- - Attach multiple adapters and iteratively activate / deactivate them
- - Activate / deactivate all adapters from the model.
- - Get the `state_dict` of the active adapter.
- """
- _hf_peft_config_loaded = False
- _prepare_peft_hotswap_kwargs: dict | None = None
def load_adapter(
    self,
    peft_model_id: str | None = None,
    adapter_name: str | None = None,
    peft_config: dict[str, Any] | None = None,
    adapter_state_dict: dict[str, "torch.Tensor"] | None = None,
    low_cpu_mem_usage: bool = False,
    is_trainable: bool = False,
    hotswap: bool | Literal["auto"] = "auto",
    local_files_only: bool = False,
    adapter_kwargs: dict[str, Any] | None = None,
    load_config: Optional["LoadStateDictConfig"] = None,
    **kwargs,
) -> None:
    """
    Load adapter weights from file or remote Hub folder. If you are not familiar with adapters and PEFT methods, we
    invite you to read more about them on PEFT official documentation: https://huggingface.co/docs/peft

    Requires PEFT to be installed as a backend to load the adapter weights.

    Args:
        peft_model_id (`str`, *optional*):
            The identifier of the model to look for on the Hub, or a local path to the saved adapter config file
            and adapter weights. Falls back to `load_config.pretrained_model_name_or_path` when not given.
        adapter_name (`str`, *optional*):
            The adapter name to use. If not set, will use the name "default". The default is applied before the
            hotswap decision, so omitting the name behaves exactly like passing "default".
        peft_config (`dict[str, Any]`, *optional*):
            The configuration of the adapter to add, supported adapters are all non-prompt learning configs (LoRA,
            IA³, etc). This argument is used in case users directly pass PEFT state dicts.
        adapter_state_dict (`dict[str, torch.Tensor]`, *optional*):
            The state dict of the adapter to load. This argument is used in case users directly pass PEFT state
            dicts.
        low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
            Reduce memory usage while loading the PEFT adapter. This should also speed up the loading process.
            NOTE(review): this argument is not referenced in the body of this method.
        is_trainable (`bool`, *optional*, defaults to `False`):
            Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be
            used for inference.
        hotswap : (`"auto"` or `bool`, *optional*, defaults to `"auto"`)
            Whether to substitute an existing (LoRA) adapter with the newly loaded adapter in-place. This means
            that, instead of loading an additional adapter, this will take the existing adapter weights and replace
            them with the weights of the new adapter. This can be faster and more memory efficient. However, the
            main advantage of hotswapping is that when the model is compiled with torch.compile, loading the new
            adapter does not require recompilation of the model. When using hotswapping, the passed `adapter_name`
            should be the name of an already loaded adapter.

            If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need
            to call an additional method before loading the adapter:

            ```py
            model = AutoModel.from_pretrained(...)
            max_rank = ...  # the highest rank among all LoRAs that you want to load
            # call *before* compiling and loading the LoRA adapter
            model.enable_peft_hotswap(target_rank=max_rank)
            model.load_adapter(file_name_1, adapter_name="default")
            # optionally compile the model now
            model = torch.compile(model, ...)
            output_1 = model(...)
            # now you can hotswap the 2nd adapter, use the same name as for the 1st
            # hotswap is activated by default since enable_peft_hotswap was called
            model.load_adapter(file_name_2, adapter_name="default")
            output_2 = model(...)
            ```

            By default, hotswap is disabled and requires passing `hotswap=True`. If you called
            `enable_peft_hotswap` first, it is enabled. You can still manually disable it in that case by passing
            `hotswap=False`.

            Note that hotswapping comes with a couple of limitations documented here:
            https://huggingface.co/docs/peft/main/en/package_reference/hotswap
        local_files_only (`bool`, *optional*, defaults to `False`):
            When `True`, forwarded as `local_files_only=True` into the load configuration.
        adapter_kwargs (`dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the `from_pretrained` method of the adapter config and
            `find_adapter_config_file` method.
        load_config (`LoadStateDictConfig`, *optional*):
            A load configuration to reuse when pulling adapter weights, typically from `from_pretrained`. The
            passed object itself is left untouched; a new configuration is built from its fields.
        kwargs (`dict[str, Any]`, *optional*):
            Additional `LoadStateDictConfig` fields passed as keyword arguments.

    Raises:
        ValueError: if an adapter with this name already exists and hotswap is off, if hotswap is requested but no
            adapter with this name exists or a non-LoRA adapter is loaded, if neither a model id nor a
            config/state dict is given, or if no adapter config file can be found.
        OSError: if neither a safetensors nor a bin adapter file can be resolved.
    """
    from peft import PeftConfig, PeftType, inject_adapter_in_model

    from ..modeling_utils import LoadStateDictConfig, _get_resolved_checkpoint_files

    if local_files_only:
        kwargs["local_files_only"] = True
    # Work on a copy: updating `load_config.__dict__` directly would silently modify the caller's object.
    base_load_config = dict(load_config.__dict__) if load_config is not None else {}
    base_load_config.update(kwargs)
    base_load_config.setdefault("pretrained_model_name_or_path", None)
    load_config = LoadStateDictConfig(**base_load_config)
    peft_model_id = peft_model_id or load_config.pretrained_model_name_or_path

    # Resolve the default name first so that the hotswap checks below see the name that is actually used.
    adapter_name = adapter_name if adapter_name is not None else "default"
    adapter_kwargs = adapter_kwargs or {}
    adapter_already_loaded = bool(self._hf_peft_config_loaded and (adapter_name in self.peft_config))

    if hotswap == "auto":
        # if user called model.enable_peft_hotswap and this is not the first adapter, enable hotswap
        hotswap = getattr(self, "_hotswap_enabled", False) and adapter_already_loaded

    if hotswap:
        if not adapter_already_loaded:
            raise ValueError(
                "To hotswap an adapter, there must already be an existing adapter with the same adapter name."
            )
        if any(conf.peft_type != PeftType.LORA for conf in self.peft_config.values()):
            raise ValueError("Hotswapping is currently only supported for LoRA, please set `hotswap=False`.")
    elif adapter_already_loaded:
        raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.")

    if peft_model_id is None and (adapter_state_dict is None and peft_config is None):
        raise ValueError(
            "You should either pass a `peft_model_id` or a `peft_config` and `adapter_state_dict` to load an adapter."
        )

    if peft_config is None:
        load_config.download_kwargs.update(**adapter_kwargs)
        adapter_config_file = find_adapter_config_file(
            peft_model_id,
            **load_config.download_kwargs,
        )
        if adapter_config_file is None:
            raise ValueError(
                f"adapter model file not found in {peft_model_id}. Make sure you are passing the correct path to the "
                "adapter model."
            )
        peft_config = PeftConfig.from_pretrained(
            peft_model_id,
            **load_config.download_kwargs,
        )

    weight_conversions = get_model_conversion_mapping(self)
    # TODO: remove once PEFT < 0.19 is dropped, use peft.utils.transformers_weight_conversion
    peft_config = convert_peft_config_for_transformers(peft_config, model=self, conversions=weight_conversions)

    if hasattr(peft_config, "inference_mode"):
        peft_config.inference_mode = not is_trainable

    peft_weight_conversions = build_peft_weight_mapping(weight_conversions, adapter_name, peft_config=peft_config)
    patch_moe_parameter_targeting(model=self, peft_config=peft_config)

    if not hotswap:
        # Create and add fresh new adapters into the model, unless the weights are hotswapped
        inject_adapter_in_model(peft_config, self, adapter_name)

    if not self._hf_peft_config_loaded:
        self._hf_peft_config_loaded = True

    if adapter_state_dict is None:
        # Try the preferred file format first and fall back to the other one.
        adapter_filenames = ["adapter_model.safetensors", "adapter_model.bin"]
        if load_config.use_safetensors is False:
            adapter_filenames.reverse()

        checkpoint_files = sharded_metadata = None
        last_error = None
        for adapter_filename in adapter_filenames:
            try:
                checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
                    pretrained_model_name_or_path=peft_model_id,
                    variant=None,
                    gguf_file=None,
                    use_safetensors=(
                        load_config.use_safetensors if adapter_filename.endswith(".safetensors") else False
                    ),
                    user_agent=None,
                    is_remote_code=False,
                    transformers_explicit_filename=adapter_filename,
                    download_kwargs=load_config.download_kwargs,
                )
                break
            except OSError as error:
                last_error = error

        if checkpoint_files is None:
            raise last_error or OSError("Could not download either a .bin or a .safetensors adapter file.")
    else:
        # The weights are passed in directly, there is nothing to resolve.
        checkpoint_files, sharded_metadata = [], {}

    device_map = getattr(self, "hf_device_map", {"": self.device})
    load_config = replace(
        load_config,
        pretrained_model_name_or_path=peft_model_id,
        sharded_metadata=sharded_metadata,
        weight_mapping=peft_weight_conversions,
        device_map=device_map,
    )
    loading_info, _ = self._load_pretrained_model(
        model=self,
        state_dict=adapter_state_dict,
        checkpoint_files=checkpoint_files,
        load_config=load_config,
        # pass expected keys explicitly, otherwise they are determined from the state_dict, which can contain
        # unexpected entries, like "layer.SCB" from a bnb layer.
        expected_keys=[n for n, _ in self.named_parameters()],
    )

    # `inference_mode` is only set above when the config has it, so read it with the same tolerance.
    if getattr(peft_config, "inference_mode", False):
        from peft.tuners.tuners_utils import BaseTunerLayer

        self.eval()
        for module in self.modules():
            if isinstance(module, BaseTunerLayer):
                module.requires_grad_(False)

    # Only report missing keys that belong to the adapter: keys containing the adapter name or the PEFT type.
    adapter_key_markers = {adapter_name}
    if peft_config is not None and getattr(peft_config, "peft_type", None) is not None:
        adapter_key_markers.add(peft_config.peft_type.value.lower())

    def is_adapter_key(key: str) -> bool:
        return any(marker in key for marker in adapter_key_markers)

    loading_info.missing_keys = {k for k in loading_info.missing_keys if is_adapter_key(k)}

    log_state_dict_report(
        model=self,
        pretrained_model_name_or_path=load_config.pretrained_model_name_or_path,
        ignore_mismatched_sizes=load_config.ignore_mismatched_sizes,
        loading_info=loading_info,
        logger=logger,
    )
def enable_peft_hotswap(
    self, target_rank: int = 128, check_compiled: Literal["error", "warn", "ignore"] = "error"
) -> None:
    """Enables the possibility to hotswap PEFT adapters with different ranks, or, if the model is compiled, without
    triggering recompilation.

    Right now, hotswapping is only supported for LoRA.

    Calling this method is only required when hotswapping adapters and if the model is compiled or if the ranks of
    the loaded adapters differ. If the ranks are all identical and the model is not compiled, hotswapping works
    without calling this method first.

    Args:
        target_rank (`int`, *optional*, defaults to `128`):
            The highest rank among all the adapters that will be loaded.
        check_compiled (`str`, *optional*, defaults to `"error"`):
            How to handle the case when the model is already compiled, which should generally be avoided. In this
            method the value also decides what happens when an adapter has already been loaded (a non-empty
            `peft_config` attribute exists). The options are:
              - "error" (default): raise an error
              - "warn": issue a warning
              - "ignore": do nothing

    Raises:
        RuntimeError: if an adapter is already loaded and `check_compiled` is "error".
        ValueError: if an adapter is already loaded and `check_compiled` is not one of the supported options.
    """
    if getattr(self, "peft_config", {}):
        if check_compiled == "error":
            raise RuntimeError("Call `enable_peft_hotswap` before loading the first adapter.")
        elif check_compiled == "warn":
            logger.warning(
                "It is recommended to call `enable_peft_hotswap` before loading the first adapter to avoid recompilation."
            )
        elif check_compiled != "ignore":
            raise ValueError(
                f"check_compiled should be one of 'error', 'warn', or 'ignore', got '{check_compiled}' instead."
            )

    # Remember the request; the stored kwargs are kept for the later hotswap preparation.
    self._hotswap_enabled = True
    self._prepare_peft_hotswap_kwargs = {"target_rank": target_rank, "check_compiled": check_compiled}
def add_adapter(self, adapter_config, adapter_name: str | None = None) -> None:
    r"""
    If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
    official documentation: https://huggingface.co/docs/peft

    Adds a fresh new adapter to the current model for training purpose. If no adapter name is passed, a default
    name is assigned to the adapter to follow the convention of PEFT library (in PEFT we use "default" as the
    default adapter name).

    After the adapter has been injected, `self.set_adapter(adapter_name)` is called on it.

    Args:
        adapter_config (`~peft.PeftConfig`):
            The configuration of the adapter to add, supported adapters are non-prompt learning methods (LoRA,
            IA³, etc.).
        adapter_name (`str`, *optional*, defaults to `"default"`):
            The name of the adapter to add. If no name is passed, a default name is assigned to the adapter.

    Raises:
        ValueError: if an adapter with this name already exists.
        TypeError: if `adapter_config` is not a `PeftConfig`.
    """
    check_peft_version(min_version=MIN_PEFT_VERSION)

    from peft import PeftConfig, inject_adapter_in_model

    adapter_name = adapter_name or "default"

    if self._hf_peft_config_loaded and adapter_name in self.peft_config:
        raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.")

    if not isinstance(adapter_config, PeftConfig):
        raise TypeError(f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead.")

    # Retrieve the name or path of the model, one could also use self.config._name_or_path
    # but to be consistent with what we do in PEFT: https://github.com/huggingface/peft/blob/6e783780ca9df3a623992cc4d1d665001232eae0/src/peft/mapping.py#L100
    adapter_config.base_model_name_or_path = self.__dict__.get("name_or_path", None)
    # TODO: WE NEED TO APPLY OUR DYNAMIC WEIGHT CONVERSION AT SOME POINT HERE!
    inject_adapter_in_model(adapter_config, self, adapter_name)

    # Only flag the model as carrying adapters once injection has succeeded; a failed call must not leave the
    # model claiming that an adapter is loaded.
    self._hf_peft_config_loaded = True

    self.set_adapter(adapter_name)
def set_adapter(self, adapter_name: list[str] | str) -> None:
    """
    If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
    official documentation: https://huggingface.co/docs/peft

    Sets a specific adapter by forcing the model to use that adapter and disabling the other adapters.

    Args:
        adapter_name (`Union[list[str], str]`):
            The name of the adapter to set. Can be also a list of strings to set multiple adapters.

    Raises:
        ValueError: if no adapter is loaded, if a requested adapter name is unknown, or if the model contains no
            module on which the adapter could be set.
    """
    check_peft_version(min_version=MIN_PEFT_VERSION)
    if not self._hf_peft_config_loaded:
        raise ValueError("No adapter loaded. Please load an adapter first.")
    elif isinstance(adapter_name, list):
        missing = set(adapter_name) - set(self.peft_config)
        if len(missing) > 0:
            # Sort so that the message is deterministic regardless of set iteration order.
            raise ValueError(
                f"Following adapter(s) could not be found: {', '.join(sorted(missing))}. Make sure you are passing the correct adapter name(s)."
                f" current loaded adapters are: {list(self.peft_config.keys())}"
            )
    elif adapter_name not in self.peft_config:
        raise ValueError(
            f"Adapter with name {adapter_name} not found. Please pass the correct adapter name among {list(self.peft_config.keys())}"
        )

    from peft.tuners.tuners_utils import BaseTunerLayer
    from peft.utils import ModulesToSaveWrapper

    _adapters_has_been_set = False

    for _, module in self.named_modules():
        if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
            module.set_adapter(adapter_name)
            _adapters_has_been_set = True

    if not _adapters_has_been_set:
        raise ValueError(
            "Did not succeed in setting the adapter. Please make sure you are using a model that supports adapters."
        )
def disable_adapters(self) -> None:
    r"""
    If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
    official documentation: https://huggingface.co/docs/peft

    Switch off every adapter attached to the model, so that inference runs with the base model only.

    Raises:
        ValueError: If no adapter has been loaded on the model.
    """
    check_peft_version(min_version=MIN_PEFT_VERSION)

    if not self._hf_peft_config_loaded:
        raise ValueError("No adapter loaded. Please load an adapter first.")

    from peft.tuners.tuners_utils import BaseTunerLayer
    from peft.utils import ModulesToSaveWrapper

    # Both tuner layers and `modules_to_save` wrappers are switched off.
    adapter_layer_types = (BaseTunerLayer, ModulesToSaveWrapper)
    adapter_layers = (layer for _, layer in self.named_modules() if isinstance(layer, adapter_layer_types))
    for layer in adapter_layers:
        layer.enable_adapters(enabled=False)
def enable_adapters(self) -> None:
    """
    If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
    official documentation: https://huggingface.co/docs/peft

    Enable adapters that are attached to the model.

    Raises:
        ValueError: If no adapter has been loaded on the model.
    """
    check_peft_version(min_version=MIN_PEFT_VERSION)

    if not self._hf_peft_config_loaded:
        raise ValueError("No adapter loaded. Please load an adapter first.")

    from peft.tuners.tuners_utils import BaseTunerLayer
    from peft.utils import ModulesToSaveWrapper

    # Mirror `disable_adapters`: it switches off both tuner layers and `ModulesToSaveWrapper` modules, so both
    # kinds must be switched back on here, otherwise the wrappers stay disabled after a disable/enable round trip.
    for _, module in self.named_modules():
        if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
            module.enable_adapters(enabled=True)
def active_adapters(self) -> list[str]:
    """
    If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
    official documentation: https://huggingface.co/docs/peft

    Gets the current active adapters of the model. In case of multi-adapter inference (combining multiple adapters
    for inference) returns the list of all active adapters so that users can deal with them accordingly.

    For previous PEFT versions (that do not support multi-adapter inference), `module.active_adapter` will return
    a single string, which is wrapped into a one-element list here.

    Returns:
        `list[str]`: The active adapter name(s), as reported by the first tuner layer found in the model.

    Raises:
        ValueError: If no adapter has been loaded, or if the model does not contain any tuner layer from which
            the active adapters could be read.
    """
    check_peft_version(min_version=MIN_PEFT_VERSION)

    if not self._hf_peft_config_loaded:
        raise ValueError("No adapter loaded. Please load an adapter first.")

    from peft.tuners.tuners_utils import BaseTunerLayer

    # The first tuner layer encountered is taken as representative for the whole model.
    for _, module in self.named_modules():
        if isinstance(module, BaseTunerLayer):
            active_adapters = module.active_adapter
            break
    else:
        # Without this branch the name below would be unbound and fail with an opaque `UnboundLocalError`.
        raise ValueError(
            "Could not determine the active adapters: the model does not contain any PEFT tuner layer."
        )

    # For previous PEFT versions
    if isinstance(active_adapters, str):
        active_adapters = [active_adapters]

    return active_adapters
def get_adapter_state_dict(self, adapter_name: str | None = None, state_dict: dict | None = None) -> dict:
    """
    If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
    official documentation: https://huggingface.co/docs/peft

    Return the state dict of a single adapter, i.e. only the weight tensors that belong to `adapter_name`.
    When `adapter_name` is omitted, the first active adapter is used.

    Args:
        adapter_name (`str`, *optional*):
            The name of the adapter to get the state dict from. If no name is passed, the active adapter is used.
        state_dict (nested dictionary of `torch.Tensor`, *optional*)
            The state dictionary of the model. Will default to `self.state_dict()`, but can be used if special
            precautions need to be taken when recovering the state dictionary of a model (like when using model
            parallelism).

    Raises:
        ValueError: If no adapter has been loaded on the model.
    """
    check_peft_version(min_version=MIN_PEFT_VERSION)

    if not self._hf_peft_config_loaded:
        raise ValueError("No adapter loaded. Please load an adapter first.")

    from peft import get_peft_model_state_dict

    # Fall back to the first active adapter when the caller did not name one.
    selected_adapter = self.active_adapters()[0] if adapter_name is None else adapter_name
    return get_peft_model_state_dict(self, state_dict=state_dict, adapter_name=selected_adapter)
def _dispatch_accelerate_model(
    self,
    device_map: str,
    max_memory: int | None = None,
    offload_folder: str | None = None,
    offload_index: int | None = None,
) -> None:
    """
    Optionally re-dispatch the model and attach new hooks to it, for the case where the model has been loaded
    with accelerate (i.e. with `device_map=xxx`).

    Args:
        device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each
            parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
            same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
            like `1`) on which the model will be allocated, the device map will map the entire model to this
            device. Passing `device_map = 0` means put the whole model on GPU 0.

            To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
            more information about each option see [designing a device
            map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
        max_memory (`Dict`, *optional*):
            A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
            GPU and the available CPU RAM if unset.
        offload_folder (`str` or `os.PathLike`, *optional*):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        offload_index (`int`, *optional*):
            The offload_index argument to be passed to `accelerate.dispatch_model` method.
    """
    # NOTE(review): the annotations (`device_map: str`, `max_memory: int | None`) are narrower than the types
    # documented above; confirm which one reflects the intended contract.

    # Safety checker for previous `accelerate` versions
    # `offload_index` was introduced in https://github.com/huggingface/accelerate/pull/873/
    accepts_offload_index = "offload_index" in inspect.signature(dispatch_model).parameters
    extra_dispatch_kwargs = {"offload_index": offload_index} if accepts_offload_index else {}

    unsplittable_classes = self._no_split_modules

    # Every strategy except "sequential" gets a balanced memory budget first.
    if device_map != "sequential":
        max_memory = get_balanced_memory(
            self,
            max_memory=max_memory,
            no_split_module_classes=unsplittable_classes,
            low_zero=(device_map == "balanced_low_0"),
        )

    # A string names a placement strategy and is resolved into a concrete map; other values are passed through.
    if isinstance(device_map, str):
        device_map = infer_auto_device_map(
            self, max_memory=max_memory, no_split_module_classes=unsplittable_classes
        )

    dispatch_model(
        self,
        device_map=device_map,
        offload_dir=offload_folder,
        **extra_dispatch_kwargs,
    )
def delete_adapter(self, adapter_names: list[str] | str) -> None:
    """
    Remove one or several PEFT adapters from the underlying model.

    Args:
        adapter_names (`Union[list[str], str]`):
            The name(s) of the adapter(s) to delete.

    Raises:
        ValueError: If no adapter has been loaded, or if any of the given names is not in `self.peft_config`.
    """
    check_peft_version(min_version=MIN_PEFT_VERSION)

    if not self._hf_peft_config_loaded:
        raise ValueError("No adapter loaded. Please load an adapter first.")

    from peft.functional import delete_adapter

    names = [adapter_names] if isinstance(adapter_names, str) else adapter_names

    # Refuse the whole request if any of the names is unknown.
    unknown = [name for name in names if name not in self.peft_config]
    if unknown:
        raise ValueError(
            f"The following adapter(s) are not present and cannot be deleted: {', '.join(unknown)}"
        )

    # Resolve every parameter prefix up front, while all the configs are still registered.
    deletions = [(name, f"{self.peft_config[name].peft_type.value.lower()}_") for name in names]
    for name, prefix in deletions:
        delete_adapter(self, adapter_name=name, prefix=prefix)
        # For transformers integration - we need to pop the adapter from the config
        if getattr(self, "_hf_peft_config_loaded", False) and hasattr(self, "peft_config"):
            self.peft_config.pop(name, None)

    # Once the last adapter is gone, drop the config attribute and reset the "loaded" flag.
    if len(self.peft_config) == 0:
        del self.peft_config
        self._hf_peft_config_loaded = False
def maybe_load_adapters(
    pretrained_model_name_or_path,
    download_kwargs: DownloadKwargs,
    **adapter_kwargs,
):
    """
    Look for a PEFT adapter config belonging to `pretrained_model_name_or_path` and, if one is found, work out
    which path holds the adapter and which one holds the base model.

    Args:
        pretrained_model_name_or_path:
            The model name or path given by the caller. May be `None`.
        download_kwargs (`DownloadKwargs`):
            Download options. `commit_hash` is filled in here (in place) when it is not already set.
        **adapter_kwargs:
            Extra adapter options. `_adapter_model_path` and `token` are popped from it.

    Returns:
        A tuple `(adapter_model_path, pretrained_model_name_or_path, adapter_kwargs)`. The first item is `None`
        when `pretrained_model_name_or_path` is `None` or PEFT is not available.
    """
    if pretrained_model_name_or_path is None or not is_peft_available():
        return None, pretrained_model_name_or_path, adapter_kwargs

    token = download_kwargs.get("token")
    force_download = bool(download_kwargs.get("force_download", False))
    local_files_only = bool(download_kwargs.get("local_files_only", False))

    # Resolve the commit hash once from the model config when the caller did not provide it.
    if download_kwargs.get("commit_hash") is None:
        resolved_config_file = cached_file(
            pretrained_model_name_or_path,
            CONFIG_NAME,
            cache_dir=download_kwargs.get("cache_dir"),
            force_download=force_download,
            proxies=download_kwargs.get("proxies"),
            local_files_only=local_files_only,
            token=token,
            revision=download_kwargs.get("revision"),
            subfolder=download_kwargs.get("subfolder"),
            _raise_exceptions_for_gated_repo=False,
            _raise_exceptions_for_missing_entries=False,
            _raise_exceptions_for_connection_errors=False,
        )
        download_kwargs["commit_hash"] = extract_commit_hash(resolved_config_file, None)

    adapter_path = adapter_kwargs.pop("_adapter_model_path", None)
    adapter_token = adapter_kwargs.pop("token", None)

    if adapter_path is None:
        lookup_kwargs = dict(adapter_kwargs)
        # Inherit download options the adapter kwargs do not set themselves; don't override revision
        for option in ("cache_dir", "proxies", "subfolder"):
            if option in download_kwargs and option not in lookup_kwargs:
                lookup_kwargs[option] = download_kwargs[option]
        if "commit_hash" in download_kwargs:
            lookup_kwargs["_commit_hash"] = download_kwargs["commit_hash"]
        lookup_kwargs["force_download"] = force_download
        lookup_kwargs["local_files_only"] = local_files_only
        lookup_kwargs["token"] = token or adapter_token
        adapter_path = find_adapter_config_file(pretrained_model_name_or_path, **lookup_kwargs)

    if adapter_path is not None and os.path.isfile(adapter_path):
        with open(adapter_path, "r", encoding="utf-8") as config_handle:
            # The adapter lives at the location the caller passed in.
            adapter_path = pretrained_model_name_or_path
            # Only override the model name/path if the current value doesn't point to a
            # complete model with an embedded adapter so that local models with embedded
            # adapters will load from the local base model rather than pull the base
            # model named in the adapter's config from the hub.
            is_complete_local_model = os.path.exists(pretrained_model_name_or_path) and os.path.exists(
                os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
            )
            if not is_complete_local_model:
                pretrained_model_name_or_path = json.load(config_handle)["base_model_name_or_path"]

    return adapter_path, pretrained_model_name_or_path, adapter_kwargs
- #####################
- # weight conversion #
- #####################
- # With transformers v5, we need to convert some weights to reflect updated model architectures. If users have trained
- # PEFT adapters for these models, they also need to be updated. This may require updating the PEFT config too. The
- # logic for this is found below. Right now, only LoRA is supported.
- # TODO: remove once PEFT < 0.19 no longer supported
def _convert_peft_config_moe(peft_config, model_type: str):
    """
    Rewrite the targets of a PEFT config so that they match the converted (fused) MoE layout of `model_type`.

    Entries of `target_modules` that match an old module name are moved to `target_parameters` under the new
    name. When several old modules are fused into one new parameter and all of them are targeted, rank and
    alpha overrides are registered for the fused parameter, both multiplied by the number of fused modules.

    The config is modified in place and returned. It is returned untouched when `model_type` has no conversion
    pattern, or when `target_modules` is a single string (see the comment in the body).

    Args:
        peft_config: The PEFT config to convert; `r` and `lora_alpha` are read for the fused overrides.
        model_type (`str`): The model type used to look up the conversion pattern.

    Raises:
        ValueError: If only some of the old modules that are fused into one new parameter are targeted.
    """
    base_model_type = _MODEL_TO_CONVERSION_PATTERN.get(model_type, None)
    if base_model_type is None:
        return peft_config

    if isinstance(peft_config.target_modules, str):
        # A string `target_modules` is one pattern/keyword, not a collection of module names. Passing it to
        # `set()` would split it into single characters and corrupt the config, and a pattern cannot be mapped
        # name-by-name anyway, so the config is left as is.
        return peft_config

    target_module_mapping = _MOE_TARGET_MODULE_MAPPING[base_model_type]
    fused_targets = _MOE_FUSED_TARGETS.get(base_model_type, {})

    # Normalize the containers that are read and extended below.
    peft_config.target_parameters = set(peft_config.target_parameters or [])
    peft_config.target_modules = set(peft_config.target_modules or [])
    if not hasattr(peft_config, "rank_pattern") or peft_config.rank_pattern is None:
        peft_config.rank_pattern = {}
    if not hasattr(peft_config, "alpha_pattern") or peft_config.alpha_pattern is None:
        peft_config.alpha_pattern = {}

    new_target_parameters = peft_config.target_parameters.copy()
    remaining_target_modules = set()
    # For each fused parameter, the old module names that were actually targeted.
    matched_targets: dict[str, set[str]] = {new_name: set() for new_name in fused_targets}

    for target in peft_config.target_modules:
        mapped_new_name = None
        mapped_old_name = None
        # A target matches either as the bare old name or as a dotted path ending in it; first match wins.
        for old_name, new_name in target_module_mapping.items():
            if (target == old_name) or target.endswith(f".{old_name}"):
                mapped_new_name = new_name
                mapped_old_name = old_name
                break

        if mapped_new_name is None:
            # Not affected by the conversion: keep it as a regular target module.
            remaining_target_modules.add(target)
            continue

        new_target_parameters.add(mapped_new_name)
        if mapped_new_name in fused_targets and mapped_old_name is not None:
            matched_targets.setdefault(mapped_new_name, set()).add(mapped_old_name)

    for new_name, required_old_targets in fused_targets.items():
        present_targets = matched_targets.get(new_name, set())
        # Targeting only part of a fused group cannot be expressed on the fused parameter.
        if 0 < len(present_targets) < len(required_old_targets):
            missing = ", ".join(sorted(required_old_targets - present_targets))
            present = ", ".join(sorted(present_targets))
            raise ValueError(
                f"Cannot convert PEFT target(s) {present} without also targeting {missing} because they are fused into {new_name}."
            )

        if len(present_targets) == len(required_old_targets) and len(required_old_targets) > 1:
            peft_config.rank_pattern[rf".*\.{re.escape(new_name)}"] = peft_config.r * len(required_old_targets)

            # Preserve per-branch LoRA scaling after fusion.
            # Example: w1 + w3 => r doubles, so alpha must also double to keep alpha/r unchanged.
            peft_config.alpha_pattern[rf".*\.{re.escape(new_name)}"] = peft_config.lora_alpha * len(
                required_old_targets
            )

    peft_config.target_parameters = new_target_parameters
    peft_config.target_modules = remaining_target_modules

    return peft_config
- # TODO: remove once PEFT < 0.19 no longer supported
def convert_peft_config_for_transformers(peft_config, model: torch.nn.Module, conversions: list[Any] | None):
    """
    Adapt a PEFT config to models whose architecture changed between transformers v4 and v5.

    Most models need no change at all; this mainly concerns some MoE models like Mixtral. Whenever the
    conversion cannot be applied, the config is handed back as is. Otherwise a deep copy is converted and
    returned, so the caller's config is never mutated.

    Args:
        peft_config: The PEFT config to convert.
        model (`torch.nn.Module`): The model the adapter is meant for; its `config.model_type` is inspected.
        conversions (`list[Any]`, *optional*): Not used by this function's body.

    Returns:
        The original config when no conversion applies to it, otherwise a (possibly converted) deep copy.
    """
    from peft import PeftType  # avoid circular import

    # Weight conversion is currently only supported for LoRA, and only for transformers models, which are
    # recognized here by the presence of `config.model_type`.
    if (
        peft_config.peft_type != PeftType.LORA
        or not hasattr(model, "config")
        or not hasattr(model.config, "model_type")
    ):
        return peft_config

    converted_config = copy.deepcopy(peft_config)  # don't mutate the original config
    model_type = getattr(model.config, "model_type", None)
    if get_checkpoint_conversion_mapping(model_type) is None:
        return converted_config
    return _convert_peft_config_moe(converted_config, model_type)
|