yichael
/
image-match


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
							#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/pi0/modular_pi0.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_pi0.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import CONFIG_MAPPING, AutoConfig


@auto_docstring(checkpoint="lerobot/pi0_base")
@strict
class PI0Config(PreTrainedConfig):
    r"""
    vlm_config (`dict`, *optional*):
        Configuration for the vlm backbone (PaliGemmaModel).
    dit_config (`dict`, *optional*):
        Configuration for the DiT backbone. Defaults to a Gemma 300M variant.
    chunk_size (`int`, *optional*, defaults to 50):
        Number of action steps to predict per chunk.
    max_state_dim (`int`, *optional*, defaults to 32):
        Maximum state vector dimension (shorter vectors are zero-padded).
    max_action_dim (`int`, *optional*, defaults to 32):
        Maximum action vector dimension (shorter vectors are zero-padded).
    num_inference_steps (`int`, *optional*, defaults to 10):
        Number of denoising steps during inference.
    time_sampling_beta_alpha (`float`, *optional*, defaults to 1.5):
        Alpha parameter for Beta distribution used to sample diffusion time during training.
    time_sampling_beta_beta (`float`, *optional*, defaults to 1.0):
        Beta parameter for Beta distribution used to sample diffusion time during training.
    time_sampling_scale (`float`, *optional*, defaults to 0.999):
        Scale factor for sampled time values.
    time_sampling_offset (`float`, *optional*, defaults to 0.001):
        Offset added to sampled time values.
    min_period (`float`, *optional*, defaults to 0.004):
        Minimum period for sinusoidal time embedding.
    max_period (`float`, *optional*, defaults to 4.0):
        Maximum period for sinusoidal time embedding.
    loss_reduction (`str`, *optional*, defaults to `"mean"`):
        The reduction to use on MSE loss.

    Example:
    ```python
    >>> from transformers import PI0ForConditionalGeneration, PI0Config

    >>> config = PI0Config()
    >>> model = PI0ForConditionalGeneration(config)
    ```
    """

    model_type = "pi0"
    # Both nested configs are resolved through the auto-config machinery.
    sub_configs = {"vlm_config": AutoConfig, "dit_config": AutoConfig}

    vlm_config: dict | PreTrainedConfig | None = None
    dit_config: dict | PreTrainedConfig | None = None
    chunk_size: int = 50
    max_state_dim: int = 32
    max_action_dim: int = 32
    num_inference_steps: int = 10
    time_sampling_beta_alpha: float = 1.5
    time_sampling_beta_beta: float = 1.0
    time_sampling_scale: float = 0.999
    time_sampling_offset: float = 0.001
    min_period: float = 4e-3
    max_period: float = 4.0
    loss_reduction: str = "mean"

    def __post_init__(self, **kwargs):
        """Materialize the sub-configs and force bidirectional attention.

        For each of `vlm_config` and `dit_config`:

        - a `dict` is turned into a config object by looking up its
          `"model_type"` key in `CONFIG_MAPPING` (falling back to
          `"paligemma"` for the VLM and `"gemma"` for the DiT);
        - `None` is replaced by the default configuration built below;
        - any other value (e.g. an already-built config object) is kept as is.

        Afterwards the attention flags on the DiT config and on the VLM text
        config are overwritten, and the parent `__post_init__` is called with
        `kwargs`.
        """
        if isinstance(self.vlm_config, dict):
            vlm_model_type = self.vlm_config.get("model_type", "paligemma")
            self.vlm_config = CONFIG_MAPPING[vlm_model_type](**self.vlm_config)
        elif self.vlm_config is None:
            # Default VLM backbone: PaliGemma with a Gemma text tower and a
            # SigLIP vision tower.
            self.vlm_config = CONFIG_MAPPING["paligemma"](
                text_config={
                    "model_type": "gemma",
                    "hidden_size": 2048,
                    "num_hidden_layers": 18,
                    "intermediate_size": 16384,
                    "num_attention_heads": 8,
                    "num_key_value_heads": 1,
                    "vocab_size": 257152,
                },
                vision_config={
                    "model_type": "siglip_vision_model",
                    "intermediate_size": 4304,
                    "hidden_size": 1152,
                    "patch_size": 14,
                    "image_size": 224,
                    "num_hidden_layers": 27,
                    "num_attention_heads": 16,
                    "vocab_size": 257152,
                    "vision_use_head": False,
                },
                projection_dim=2048,
                image_token_id=257152,
            )

        if isinstance(self.dit_config, dict):
            dit_model_type = self.dit_config.get("model_type", "gemma")
            self.dit_config = CONFIG_MAPPING[dit_model_type](**self.dit_config)
        elif self.dit_config is None:
            # Default DiT backbone: a smaller Gemma that reuses the vocabulary
            # size of the (already resolved) VLM text config.
            self.dit_config = CONFIG_MAPPING["gemma"](
                hidden_size=1024,
                num_hidden_layers=18,
                intermediate_size=4096,
                num_attention_heads=8,
                num_key_value_heads=1,
                head_dim=256,
                vocab_size=self.vlm_config.text_config.vocab_size,
            )

        # Force bidirectional attention
        # (applied unconditionally, so it also overrides user-supplied sub-configs).
        self.dit_config.is_causal = False
        self.dit_config.use_bidirectional_attention = True
        self.vlm_config.text_config.use_bidirectional_attention = True
        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config.

        Raises:
            ValueError: If the DiT hidden size is odd.
        """
        # NOTE(review): the even-size requirement presumably comes from the
        # sinusoidal time embedding in the modeling code - confirm there.
        if self.dit_config.hidden_size % 2 != 0:
            # Read the value from `self.dit_config`: a config object has no
            # `config` attribute, so going through `self.config` would raise
            # AttributeError instead of the intended ValueError.
            raise ValueError(f"DiT hidden dim=({self.dit_config.hidden_size}) must be divisible by 2")


# Public API of this module: only the configuration class is exported.
__all__ = ["PI0Config"]