# Copyright (C) 2025 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import annotations from copy import deepcopy from typing import TYPE_CHECKING from .core_model_loading import ( Chunk, Concatenate, ErnieFuseAndSplitTextVisionExperts, MergeModulelist, Transpose, WeightConverter, WeightRenaming, ) if TYPE_CHECKING: from .modeling_utils import PreTrainedModel from .quantizers import HfQuantizer _MODEL_TO_CONVERSION_PATTERN = { # Mixtral-style MoE "minimax": "mixtral", "minimax_m2": "mixtral", # Qwen2-style MoE "afmoe": "qwen2_moe", "deepseek_v2": "qwen2_moe", "deepseek_v3": "qwen2_moe", "dots1": "qwen2_moe", "ernie4_5_moe": "qwen2_moe", "glm4_moe": "qwen2_moe", "glm4_moe_lite": "qwen2_moe", "glm_moe_dsa": "qwen2_moe", "glm4v_moe": "qwen2_moe", "longcat_flash": "qwen2_moe", "solar_open": "qwen2_moe", "qwen3_moe": "qwen2_moe", "qwen3_omni_moe": "qwen2_moe", "qwen3_omni_moe_thinker": "qwen2_moe", "qwen3_next": "qwen2_moe", "hunyuan_v1_moe": "qwen2_moe", "flex_olmo": "qwen2_moe", "olmoe": "qwen2_moe", "exaone_moe": "qwen2_moe", "rt_detr_v2": "rt_detr", "pp_doclayout_v2": "rt_detr", "pp_doclayout_v3": "rt_detr", "paligemma": "llava", "aya_vision": "llava", "fuyu": "llava", "got_ocr2": "llava", "shieldgemma2": "llava", "gemma3": "llava", "internvl": "llava", "llava_next": "llava", "llava_next_video": "llava", "llava_onevision": "llava", "vipllava": "llava", "video_llava": "llava", "mistral3": "llava", "mllama": "llava", "qwen2_5_vl": "qwen2_vl", "sam3_tracker_video": "sam3_tracker", "pp_chart2table": "llava", "gemma3n_text": "qwen3_5_text", "qwen3_5_moe_text": "qwen3_5_text", } def _build_checkpoint_conversion_mapping(): mapping = { "llava": [ WeightRenaming(source_patterns=r"language_model.model", target_patterns="language_model"), WeightRenaming(source_patterns=r"language_model.lm_head", target_patterns="lm_head"), ], "emu3": [ WeightRenaming(source_patterns=r"text_model.model", target_patterns="text_model"), WeightRenaming(source_patterns=r"text_model.lm_head", target_patterns="lm_head"), ], "paddleocr_vl": [ WeightRenaming(source_patterns=r"mlp_AR", target_patterns="model.projector"), WeightRenaming( source_patterns=r"^model(?!(\.visual|\.projector|\.language_model))", target_patterns="model.language_model", ), ], "qwen2_vl": [ WeightRenaming( source_patterns=r"(? we end up with 2 tensors Concatenate(dim=1), # each process has 2 tensors, gate and up, we concat them into gate_up ], # we want the loading to add this shard operation here. Though we can't shard after concats and merge, needs to be first ), WeightConverter( source_patterns=[ ".experts.*.w2.weight", ], target_patterns=".experts.down_proj", # target key gets the list of two tensors operations=[ MergeModulelist( dim=0 ), # each process has two lists of tensors, we cat each list. -> we end up with 2 tensors ], # we want the loading to add this shard operation here. Though we can't shard after concats and merge, needs to be first ), ], "qwen2_moe": [ WeightConverter( source_patterns=[ "mlp.experts.*.gate_proj.weight", "mlp.experts.*.up_proj.weight", ], target_patterns="mlp.experts.gate_up_proj", operations=[MergeModulelist(dim=0), Concatenate(dim=1)], ), WeightConverter( source_patterns="mlp.experts.*.down_proj.weight", target_patterns="mlp.experts.down_proj", operations=[MergeModulelist(dim=0)], ), ], "qwen3_vl_moe": [ WeightConverter( source_patterns="mlp.experts.gate_up_proj", target_patterns="mlp.experts.gate_up_proj", operations=[Transpose(1, 2, check_dims=True)], ), WeightConverter( source_patterns="mlp.experts.down_proj", target_patterns="mlp.experts.down_proj", operations=[Transpose(1, 2, check_dims=True)], ), ], "phimoe": [ WeightRenaming(".block_sparse_moe.", ".mlp."), WeightRenaming(".gate.weight", ".router.weight"), WeightConverter( source_patterns=[ ".experts.*.w1.weight", ".experts.*.w3.weight", ], target_patterns=".experts.gate_up_proj", operations=[MergeModulelist(dim=0), Concatenate(dim=1)], ), WeightConverter( source_patterns=".experts.*.w2.weight", target_patterns=".experts.down_proj", operations=[MergeModulelist(dim=0)], ), ], "lfm2_moe": [ WeightConverter( source_patterns=[ "feed_forward.experts.*.w1.weight", "feed_forward.experts.*.w3.weight", ], target_patterns="feed_forward.experts.gate_up_proj", operations=[MergeModulelist(dim=0), Concatenate(dim=1)], ), WeightConverter( source_patterns="feed_forward.experts.*.w2.weight", target_patterns="feed_forward.experts.down_proj", operations=[MergeModulelist(dim=0)], ), ], "ernie4_5_vl_moe": [ # vision WeightRenaming("vision_model", "vision_tower"), # resampler WeightRenaming("spatial_linear.0", "spatial_linear.fc1"), WeightRenaming("spatial_linear.2", "spatial_linear.fc2"), WeightRenaming("spatial_linear.3", "spatial_linear.ln"), WeightRenaming("temporal_linear.0", "temporal_linear.fc1"), WeightRenaming("temporal_linear.2", "temporal_linear.fc2"), WeightRenaming("temporal_linear.3", "temporal_linear.ln"), # language model WeightRenaming(r"(? None: global _checkpoint_conversion_mapping_cache if _checkpoint_conversion_mapping_cache is None: _checkpoint_conversion_mapping_cache = _build_checkpoint_conversion_mapping() if model_type in _checkpoint_conversion_mapping_cache and not overwrite: raise ValueError(f"Model type {model_type} already exists in the checkpoint conversion mapping.") _checkpoint_conversion_mapping_cache[model_type] = mapping def extract_weight_conversions_for_model(model: PreTrainedModel) -> list[WeightConverter | WeightRenaming] | None: model_type = getattr(model.config, "model_type", None) if model_type is not None: model_specific_conversions = get_checkpoint_conversion_mapping(model_type) return model_specific_conversions return None def get_model_conversion_mapping( model: PreTrainedModel, key_mapping: dict[str, str] | None = None, hf_quantizer: HfQuantizer | None = None, add_legacy: bool = True, ) -> list[WeightConverter | WeightRenaming]: """ For a given `model`, obtain the weight conversion mapping if any are registered either as a simple renaming `_checkpoint_conversion_mapping` class argument, or in the general WeightConverter mapping. """ # Lazy import to avoid circular import issues from .modeling_utils import PreTrainedModel # note: this function is used in PEFT, so changing the API requires coordination weight_conversions = [] # Load models with explicit, user-provided key mapping if key_mapping is not None: weight_conversions = [WeightRenaming(source_patterns=k, target_patterns=v) for k, v in key_mapping.items()] # Model have several `PreTrainedModel` within with the same model type # For ex: XForConditionalGeneration -> XModel. We don't want to apply the same # conversion pattern twice because of that seen_model_types = set() if (conversions := extract_weight_conversions_for_model(model)) is not None: weight_conversions.extend(conversions) seen_model_types.add(model.config.model_type) # Recurse over submodules and collect all conversions for submodule in model.modules(): if ( submodule is not model and isinstance(submodule, PreTrainedModel) and submodule.config.model_type not in seen_model_types ): conversions = extract_weight_conversions_for_model(submodule) if conversions is not None: weight_conversions.extend(conversions) seen_model_types.add(submodule.config.model_type) if add_legacy: weight_conversions.extend(get_checkpoint_conversion_mapping("legacy")) # Add the ones from the quantizer as well if provided if hf_quantizer is not None: weight_conversions.extend(hf_quantizer.get_weight_conversions()) return weight_conversions