#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/chmv2/modular_chmv2.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_chmv2.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

from huggingface_hub.dataclasses import strict

from ...backbone_utils import consolidate_backbone_kwargs_to_config
from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import AutoConfig


@auto_docstring(checkpoint="facebook/dinov3-vitl16-chmv2-dpt-head")
@strict
class CHMv2Config(PreTrainedConfig):
    r"""
    backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*):
        The configuration of the backbone model. Only DINOv3ViTConfig is currently supported.
    patch_size (`int`, *optional*, defaults to 16):
        The patch size used by the backbone vision transformer.
    reassemble_factors (`list[float]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
        The up/downsampling factors of the reassemble layers.
    post_process_channels (`list[int]`, *optional*, defaults to `[128, 256, 512, 1024]`):
        The output channel sizes of the reassemble stage for each backbone feature level.
    fusion_hidden_size (`int`, *optional*, defaults to 256):
        The number of channels before fusion.
    head_hidden_size (`int`, *optional*, defaults to 128):
        The number of channels in the hidden layer of the depth estimation head.
    number_output_channels (`int`, *optional*, defaults to 256):
        Number of output channels for the CHMv2 head (number of depth bins).
    readout_type (`str`, *optional*, defaults to `"project"`):
        Type of readout operation for the CLS token. One of `["ignore", "add", "project"]`.
    min_depth (`float`, *optional*, defaults to 0.001):
        The minimum depth value for depth bin calculation.
    max_depth (`float`, *optional*, defaults to 96.0):
        The maximum depth value for depth bin calculation.
    bins_strategy (`str`, *optional*, defaults to `"chmv2_mixlog"`):
        The strategy for depth bins distribution. One of `["linear", "log", "chmv2_mixlog"]`.
    norm_strategy (`str`, *optional*, defaults to `"chmv2_mixlog"`):
        The normalization strategy for depth prediction. One of `["linear", "softmax", "sigmoid", "chmv2_mixlog"]`.

    ```python
    >>> from transformers import CHMv2Config, CHMv2ForDepthEstimation

    >>> configuration = CHMv2Config()
    >>> model = CHMv2ForDepthEstimation(configuration)
    >>> configuration = model.config
    ```
    """

    model_type = "chmv2"
    sub_configs = {"backbone_config": AutoConfig}

    # Backbone description; filled in by `__post_init__` via the backbone helper.
    backbone_config: dict | PreTrainedConfig | None = None
    patch_size: int = 16
    initializer_range: float = 0.02

    # `None` means "use the documented default list"; see `__post_init__`.
    reassemble_factors: list[float | int] | None = None
    post_process_channels: list[int] | None = None

    fusion_hidden_size: int = 256
    head_hidden_size: int = 128
    number_output_channels: int = 256
    readout_type: str = "project"

    min_depth: float = 0.001
    max_depth: float = 96.0
    bins_strategy: Literal["linear", "log", "chmv2_mixlog"] = "chmv2_mixlog"
    norm_strategy: Literal["linear", "softmax", "sigmoid", "chmv2_mixlog"] = "chmv2_mixlog"

    def __post_init__(self, **kwargs):
        """Fill in list defaults, consolidate the backbone config, then defer to the parent hook.

        Any keyword arguments are handed to `consolidate_backbone_kwargs_to_config`; whatever
        that helper returns as leftover kwargs is forwarded to `super().__post_init__`.
        """
        # Lists cannot be used as field defaults directly, so `None` stands in for them.
        if self.reassemble_factors is None:
            self.reassemble_factors = [4, 2, 1, 0.5]
        if self.post_process_channels is None:
            self.post_process_channels = [128, 256, 512, 1024]

        # Defaults passed to the helper alongside the "dinov3_vit" default config type.
        backbone_defaults = dict(
            image_size=416,
            hidden_size=1024,
            intermediate_size=4096,
            num_attention_heads=16,
            num_hidden_layers=24,
            num_register_tokens=4,
            key_bias=True,
            out_indices=[6, 12, 18, 24],
            reshape_hidden_states=True,
            apply_layernorm=True,
            layer_norm_eps=1e-6,
            return_class_token=True,
        )

        resolved_backbone, remaining_kwargs = consolidate_backbone_kwargs_to_config(
            backbone_config=self.backbone_config,
            default_config_type="dinov3_vit",
            default_config_kwargs=backbone_defaults,
            **kwargs,
        )
        self.backbone_config = resolved_backbone

        super().__post_init__(**remaining_kwargs)


__all__ = ["CHMv2Config"]