| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209 |
- # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Pix2Struct model configuration"""
- from huggingface_hub.dataclasses import strict
- from ...configuration_utils import PreTrainedConfig
- from ...utils import auto_docstring, logging
# Module-level logger, keyed on this module's import name.
logger = logging.get_logger(__name__)
@auto_docstring(checkpoint="google/pix2struct-base")
@strict
class Pix2StructTextConfig(PreTrainedConfig):
    r"""
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance of the longer sequences for the bucket separation.
    dense_act_fn (`Union[Callable, str]`, *optional*, defaults to `"gelu_new"`):
        The non-linear activation function (function or string).

    Example:

    ```python
    >>> from transformers import Pix2StructTextConfig, Pix2StructTextModel

    >>> # Initializing a Pix2StructTextConfig with google/pix2struct-base style configuration
    >>> configuration = Pix2StructTextConfig()

    >>> # Initializing a Pix2StructTextModel (with random weights) from the google/pix2struct-base style configuration
    >>> model = Pix2StructTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "pix2struct_text_model"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Alternative attribute names (keys) mapped onto the fields declared on
    # this class (values): every alias resolves to `hidden_size`, `num_heads`
    # or `num_layers`.
    attribute_map = {
        "hidden_size": "hidden_size",
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
        "decoder_attention_heads": "num_heads",
        "encoder_attention_heads": "num_heads",
        "encoder_layers": "num_layers",
        "decoder_layers": "num_layers",
    }

    # --- Vocabulary, widths and depth ---
    vocab_size: int = 50244
    hidden_size: int = 768
    d_kv: int = 64
    d_ff: int = 2048
    num_layers: int = 12
    num_heads: int = 12

    # --- Relative attention bucketing (see the docstring above) ---
    relative_attention_num_buckets: int = 32
    relative_attention_max_distance: int = 128

    # --- Dropout, layer norm, initialization and activation ---
    dropout_rate: float | int = 0.1
    layer_norm_epsilon: float = 1e-6
    initializer_factor: float = 1.0
    dense_act_fn: str = "gelu_new"

    # --- Special token ids and caching defaults ---
    decoder_start_token_id: int = 0
    use_cache: bool = False
    pad_token_id: int | None = 0
    eos_token_id: int | list[int] | None = 1
    bos_token_id: int | None = None

    # --- Weight tying and decoder flags ---
    tie_word_embeddings: bool = False
    is_decoder: bool = True
    add_cross_attention: bool = False
@auto_docstring(checkpoint="google/pix2struct-base")
@strict
class Pix2StructVisionConfig(PreTrainedConfig):
    r"""
    patch_embed_hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the input patch_embedding layer in the Transformer encoder.
    d_ff (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    d_kv (`int`, *optional*, defaults to 64):
        Dimensionality of the key, query, value projections per attention head.
    dense_act_fn (`Union[Callable, str]`, *optional*, defaults to `"gelu_new"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` are supported.
    seq_len (`int`, *optional*, defaults to 4096):
        Maximum sequence length (here number of patches) supported by the model.
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance (in tokens) to use for each attention layer.

    Example:

    ```python
    >>> from transformers import Pix2StructVisionConfig, Pix2StructVisionModel

    >>> # Initializing a Pix2StructVisionConfig with google/pix2struct-base style configuration
    >>> configuration = Pix2StructVisionConfig()

    >>> # Initializing a Pix2StructVisionModel (with random weights) from the google/pix2struct-base style configuration
    >>> model = Pix2StructVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "pix2struct_vision_model"

    # --- Widths and depth ---
    hidden_size: int = 768
    patch_embed_hidden_size: int = 768
    d_ff: int = 2048
    d_kv: int = 64
    num_hidden_layers: int = 12
    num_attention_heads: int = 12

    # --- Activation, layer norm and dropout ---
    dense_act_fn: str = "gelu_new"
    layer_norm_eps: float = 1e-6
    dropout_rate: float | int = 0.0
    attention_dropout: float | int = 0.0

    # --- Initialization ---
    # NOTE(review): `Pix2StructConfig.__post_init__` overwrites `initializer_range`
    # on its vision sub-config, so this default only applies to a config used
    # on its own.
    initializer_range: float = 1e-10
    initializer_factor: float = 1.0

    # --- Sequence length and relative attention bucketing ---
    seq_len: int = 4096
    relative_attention_num_buckets: int = 32
    relative_attention_max_distance: int = 128
@auto_docstring(checkpoint="google/pix2struct-base")
@strict
class Pix2StructConfig(PreTrainedConfig):
    r"""
    is_vqa (`bool`, *optional*, defaults to `False`):
        Whether the model has been fine-tuned for VQA or not.

    Example:

    ```python
    >>> from transformers import Pix2StructConfig, Pix2StructForConditionalGeneration

    >>> # Initializing a Pix2StructConfig with google/pix2struct-base style configuration
    >>> configuration = Pix2StructConfig()

    >>> # Initializing a Pix2StructForConditionalGeneration (with random weights) from the google/pix2struct-base style configuration
    >>> model = Pix2StructForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Pix2StructConfig from a Pix2StructTextConfig and a Pix2StructVisionConfig
    >>> from transformers import Pix2StructTextConfig, Pix2StructVisionConfig

    >>> # Initializing a Pix2Struct text and Pix2Struct vision configuration
    >>> config_text = Pix2StructTextConfig()
    >>> config_vision = Pix2StructVisionConfig()

    >>> config = Pix2StructConfig(text_config=config_text, vision_config=config_vision)
    ```"""

    model_type = "pix2struct"
    sub_configs = {"text_config": Pix2StructTextConfig, "vision_config": Pix2StructVisionConfig}

    # Each sub-config may be given as a ready-made config object, as a dict of
    # keyword arguments, or left as `None`; `__post_init__` turns the last two
    # into config objects.
    text_config: dict | PreTrainedConfig | None = None
    vision_config: dict | PreTrainedConfig | None = None

    initializer_factor: float = 1.0
    initializer_range: float = 0.02
    is_vqa: bool = False
    tie_word_embeddings: bool = False
    is_encoder_decoder: bool = True

    def __post_init__(self, **kwargs):
        """Materialize the sub-configs and propagate shared settings.

        - `text_config` / `vision_config` given as `None` are replaced by
          default-constructed configs; given as a `dict` they are expanded into
          the matching config class. Objects passed in are used as they are.
        - For a `None` or `dict` text config, this config's
          `is_encoder_decoder` and `tie_word_embeddings` are forwarded to the
          text config constructor.
        - `decoder_start_token_id`, `pad_token_id` and `eos_token_id` are copied
          from the text config onto this config.
        - `initializer_range` of this config is written onto both sub-configs.

        Args:
            **kwargs: Passed through unchanged to the parent `__post_init__`.
        """
        if self.text_config is None:
            self.text_config = Pix2StructTextConfig(
                is_encoder_decoder=self.is_encoder_decoder,
                tie_word_embeddings=self.tie_word_embeddings,
            )
            logger.info("`text_config` is `None`. initializing the `Pix2StructTextConfig` with default values.")
        elif isinstance(self.text_config, dict):
            # Merge into a fresh dict rather than writing into the dict the
            # caller handed us, so their object is not modified as a side effect.
            text_kwargs = {
                **self.text_config,
                "is_encoder_decoder": self.is_encoder_decoder,
                "tie_word_embeddings": self.tie_word_embeddings,
            }
            self.text_config = Pix2StructTextConfig(**text_kwargs)

        if self.vision_config is None:
            self.vision_config = Pix2StructVisionConfig()
            logger.info("`vision_config` is `None`. initializing the `Pix2StructVisionConfig` with default values.")
        elif isinstance(self.vision_config, dict):
            self.vision_config = Pix2StructVisionConfig(**self.vision_config)

        # Surface the text config's special token ids on the composite config.
        self.decoder_start_token_id = self.text_config.decoder_start_token_id
        self.pad_token_id = self.text_config.pad_token_id
        self.eos_token_id = self.text_config.eos_token_id

        # The top-level `initializer_range` wins over whatever the sub-configs carried.
        self.text_config.initializer_range = self.initializer_range
        self.vision_config.initializer_range = self.initializer_range

        super().__post_init__(**kwargs)
# Public names exported by this module.
__all__ = [
    "Pix2StructConfig",
    "Pix2StructTextConfig",
    "Pix2StructVisionConfig",
]
|