# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/pi0/modular_pi0.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_pi0.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import CONFIG_MAPPING, AutoConfig


@auto_docstring(checkpoint="lerobot/pi0_base")
@strict
class PI0Config(PreTrainedConfig):
    r"""
    vlm_config (`dict`, *optional*):
        Configuration for the vlm backbone (PaliGemmaModel).
    dit_config (`dict`, *optional*):
        Configuration for the DiT backbone. Defaults to a Gemma 300M variant.
    chunk_size (`int`, *optional*, defaults to 50):
        Number of action steps to predict per chunk.
    max_state_dim (`int`, *optional*, defaults to 32):
        Maximum state vector dimension (shorter vectors are zero-padded).
    max_action_dim (`int`, *optional*, defaults to 32):
        Maximum action vector dimension (shorter vectors are zero-padded).
    num_inference_steps (`int`, *optional*, defaults to 10):
        Number of denoising steps during inference.
    time_sampling_beta_alpha (`float`, *optional*, defaults to 1.5):
        Alpha parameter for Beta distribution used to sample diffusion time during training.
    time_sampling_beta_beta (`float`, *optional*, defaults to 1.0):
        Beta parameter for Beta distribution used to sample diffusion time during training.
    time_sampling_scale (`float`, *optional*, defaults to 0.999):
        Scale factor for sampled time values.
    time_sampling_offset (`float`, *optional*, defaults to 0.001):
        Offset added to sampled time values.
    min_period (`float`, *optional*, defaults to 0.004):
        Minimum period for sinusoidal time embedding.
    max_period (`float`, *optional*, defaults to 4.0):
        Maximum period for sinusoidal time embedding.
    loss_reduction (`str`, *optional*, defaults to `"mean"`):
        The reduction to use on MSE loss.

    Example:

    ```python
    >>> from transformers import PI0ForConditionalGeneration, PI0Config

    >>> config = PI0Config()
    >>> model = PI0ForConditionalGeneration(config)
    ```
    """

    model_type = "pi0"
    sub_configs = {"vlm_config": AutoConfig, "dit_config": AutoConfig}

    vlm_config: dict | PreTrainedConfig | None = None
    dit_config: dict | PreTrainedConfig | None = None
    chunk_size: int = 50
    max_state_dim: int = 32
    max_action_dim: int = 32
    num_inference_steps: int = 10
    time_sampling_beta_alpha: float = 1.5
    time_sampling_beta_beta: float = 1.0
    time_sampling_scale: float = 0.999
    time_sampling_offset: float = 0.001
    min_period: float = 4e-3
    max_period: float = 4.0
    loss_reduction: str = "mean"

    def __post_init__(self, **kwargs):
        """Materialize the two sub-configs and force bidirectional attention on them.

        A `dict` sub-config is turned into the config class registered in `CONFIG_MAPPING`
        under its `"model_type"` key (falling back to `"paligemma"` for the VLM and
        `"gemma"` for the DiT). A `None` sub-config is replaced by the built-in default
        below. An already-instantiated config is used as given.
        """
        if isinstance(self.vlm_config, dict):
            vlm_model_type = self.vlm_config.get("model_type", "paligemma")
            self.vlm_config = CONFIG_MAPPING[vlm_model_type](**self.vlm_config)
        elif self.vlm_config is None:
            self.vlm_config = CONFIG_MAPPING["paligemma"](
                text_config={
                    "model_type": "gemma",
                    "hidden_size": 2048,
                    "num_hidden_layers": 18,
                    "intermediate_size": 16384,
                    "num_attention_heads": 8,
                    "num_key_value_heads": 1,
                    "vocab_size": 257152,
                },
                vision_config={
                    "model_type": "siglip_vision_model",
                    "intermediate_size": 4304,
                    "hidden_size": 1152,
                    "patch_size": 14,
                    "image_size": 224,
                    "num_hidden_layers": 27,
                    "num_attention_heads": 16,
                    "vocab_size": 257152,
                    "vision_use_head": False,
                },
                projection_dim=2048,
                image_token_id=257152,
            )

        if isinstance(self.dit_config, dict):
            dit_model_type = self.dit_config.get("model_type", "gemma")
            self.dit_config = CONFIG_MAPPING[dit_model_type](**self.dit_config)
        elif self.dit_config is None:
            # The default DiT shares its vocabulary size with the VLM text backbone, which is
            # why the VLM config has to be resolved first.
            self.dit_config = CONFIG_MAPPING["gemma"](
                hidden_size=1024,
                num_hidden_layers=18,
                intermediate_size=4096,
                num_attention_heads=8,
                num_key_value_heads=1,
                head_dim=256,
                vocab_size=self.vlm_config.text_config.vocab_size,
            )

        # Force bidirectional attention
        # These flags are overwritten unconditionally, including on user-supplied sub-configs.
        self.dit_config.is_causal = False
        self.dit_config.use_bidirectional_attention = True
        self.vlm_config.text_config.use_bidirectional_attention = True

        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config.

        Raises:
            ValueError: If the DiT hidden size is odd.
        """
        # The message used to read `self.config.dit_config`, but a config object has no
        # `config` attribute, so an odd hidden size raised AttributeError instead of the
        # intended ValueError.
        # NOTE(review): this file is generated from modular_pi0.py; mirror the fix there so
        # regeneration does not revert it.
        if self.dit_config.hidden_size % 2 != 0:
            raise ValueError(f"DiT hidden dim=({self.dit_config.hidden_size}) must be divisible by 2")


__all__ = ["PI0Config"]