yichael
/
image-match


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
							#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/chmv2/modular_chmv2.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_chmv2.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

from huggingface_hub.dataclasses import strict

from ...backbone_utils import consolidate_backbone_kwargs_to_config
from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import AutoConfig


@auto_docstring(checkpoint="facebook/dinov3-vitl16-chmv2-dpt-head")
@strict
class CHMv2Config(PreTrainedConfig):
    r"""
    backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*):
        The configuration of the backbone model. Only DINOv3ViTConfig is currently supported.
    patch_size (`int`, *optional*, defaults to 16):
        The patch size used by the backbone vision transformer.
    reassemble_factors (`list[float]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
        The up/downsampling factors of the reassemble layers.
    post_process_channels (`list[int]`, *optional*, defaults to `[128, 256, 512, 1024]`):
        The output channel sizes of the reassemble stage for each backbone feature level.
    fusion_hidden_size (`int`, *optional*, defaults to 256):
        The number of channels before fusion.
    head_hidden_size (`int`, *optional*, defaults to 128):
        The number of channels in the hidden layer of the depth estimation head.
    number_output_channels (`int`, *optional*, defaults to 256):
        Number of output channels for the CHMv2 head (number of depth bins).
    readout_type (`str`, *optional*, defaults to `"project"`):
        Type of readout operation for the CLS token. One of `["ignore", "add", "project"]`.
    min_depth (`float`, *optional*, defaults to 0.001):
        The minimum depth value for depth bin calculation.
    max_depth (`float`, *optional*, defaults to 96.0):
        The maximum depth value for depth bin calculation.
    bins_strategy (`str`, *optional*, defaults to `"chmv2_mixlog"`):
        The strategy for depth bins distribution. One of `["linear", "log", "chmv2_mixlog"]`.
    norm_strategy (`str`, *optional*, defaults to `"chmv2_mixlog"`):
        The normalization strategy for depth prediction. One of `["linear", "softmax", "sigmoid", "chmv2_mixlog"]`.

    ```python
    >>> from transformers import CHMv2Config, CHMv2ForDepthEstimation

    >>> configuration = CHMv2Config()
    >>> model = CHMv2ForDepthEstimation(configuration)
    >>> configuration = model.config
    ```
    """

    model_type = "chmv2"
    # Declares `backbone_config` as a nested sub-configuration, typed through `AutoConfig`.
    sub_configs = {"backbone_config": AutoConfig}

    # Field declarations with their defaults. `backbone_config` is resolved in `__post_init__`.
    backbone_config: dict | PreTrainedConfig | None = None
    patch_size: int = 16
    initializer_range: float = 0.02
    # The two list-valued fields default to `None` here and receive their documented
    # list defaults in `__post_init__`, where a new list object is created per instance.
    reassemble_factors: list[float | int] | None = None
    post_process_channels: list[int] | None = None
    fusion_hidden_size: int = 256
    head_hidden_size: int = 128
    number_output_channels: int = 256
    # NOTE(review): documented above as one of ["ignore", "add", "project"] but annotated as a
    # plain `str`, unlike the `Literal` fields below -- confirm whether this is intentional.
    readout_type: str = "project"
    min_depth: float = 0.001
    max_depth: float = 96.0
    bins_strategy: Literal["linear", "log", "chmv2_mixlog"] = "chmv2_mixlog"
    norm_strategy: Literal["linear", "softmax", "sigmoid", "chmv2_mixlog"] = "chmv2_mixlog"

    def __post_init__(self, **kwargs):
        """
        Fill in the list defaults, resolve the backbone configuration, then defer to the parent class.

        `reassemble_factors` and `post_process_channels` are replaced by `[4, 2, 1, 0.5]` and
        `[128, 256, 512, 1024]` when they are `None`. `backbone_config` and `kwargs` are then handed to
        `consolidate_backbone_kwargs_to_config`, whose two return values become the new
        `self.backbone_config` and the keyword arguments forwarded to `super().__post_init__`.

        Args:
            **kwargs: Extra keyword arguments; passed to `consolidate_backbone_kwargs_to_config`, and
                whatever that helper returns as its second value is passed on to the parent class.
        """
        if self.reassemble_factors is None:
            self.reassemble_factors = [4, 2, 1, 0.5]
        if self.post_process_channels is None:
            self.post_process_channels = [128, 256, 512, 1024]

        # Keyword arguments offered to the helper as defaults for a "dinov3_vit" backbone config.
        # Presumably they only apply when no `backbone_config` is supplied -- confirm against
        # `consolidate_backbone_kwargs_to_config`.
        # `out_indices` has four entries, the same length as the default `reassemble_factors`
        # and `post_process_channels` lists.
        default_config_kwargs = {
            "image_size": 416,
            "hidden_size": 1024,
            "intermediate_size": 4096,
            "num_attention_heads": 16,
            "num_hidden_layers": 24,
            "num_register_tokens": 4,
            "key_bias": True,
            "out_indices": [6, 12, 18, 24],
            "reshape_hidden_states": True,
            "apply_layernorm": True,
            "layer_norm_eps": 1e-6,
            "return_class_token": True,
        }

        # The helper returns a (backbone_config, kwargs) pair; `kwargs` is rebound to the second value
        # so that only what the helper hands back reaches the parent class.
        self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
            backbone_config=self.backbone_config,
            default_config_type="dinov3_vit",
            default_config_kwargs=default_config_kwargs,
            **kwargs,
        )

        super().__post_init__(**kwargs)


# Public names exported by this module.
__all__ = ["CHMv2Config"]