| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- # Copyright Deepmind and The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Perceiver model configuration"""
- from huggingface_hub.dataclasses import strict
- from ...configuration_utils import PreTrainedConfig
- from ...utils import auto_docstring
@auto_docstring(checkpoint="deepmind/language-perceiver")
@strict
class PerceiverConfig(PreTrainedConfig):
    r"""
    num_latents (`int`, *optional*, defaults to 256):
        The number of latents.
    d_latents (`int`, *optional*, defaults to 1280):
        Dimension of the latent embeddings.
    num_blocks (`int`, *optional*, defaults to 1):
        Number of blocks in the Transformer encoder.
    num_self_attends_per_block (`int`, *optional*, defaults to 26):
        The number of self-attention layers per block.
    num_self_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each self-attention layer in the Transformer encoder.
    num_cross_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each cross-attention layer in the Transformer encoder.
    qk_channels (`int`, *optional*):
        Dimension to project the queries + keys before applying attention in the cross-attention and self-attention
        layers of the encoder. Will default to preserving the dimension of the queries if not specified.
    v_channels (`int`, *optional*):
        Dimension to project the values before applying attention in the cross-attention and self-attention layers
        of the encoder. Will default to preserving the dimension of the queries if not specified.
    cross_attention_shape_for_attention (`str`, *optional*, defaults to `"kv"`):
        Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder.
    self_attention_widening_factor (`int`, *optional*, defaults to 1):
        Widening factor of the feed-forward layer in the self-attention layers of the Transformer encoder.
    cross_attention_widening_factor (`int`, *optional*, defaults to 1):
        Widening factor of the feed-forward layer in the cross-attention layer of the Transformer encoder.
    use_query_residual (`bool`, *optional*, defaults to `True`):
        Whether to add a query residual in the cross-attention layer of the encoder.
    image_size (`int` or `list[int]` or `tuple[int, int]`, *optional*, defaults to 56):
        Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`].
    train_size (`list[int]` or `tuple[int, ...]`, *optional*, defaults to `(368, 496)`):
        Training size of the images for the optical flow model.
    num_frames (`int`, *optional*, defaults to 16):
        Number of video frames used for the multimodal autoencoding model.
    audio_samples_per_frame (`int`, *optional*, defaults to 1920):
        Number of audio samples per frame for the multimodal autoencoding model.
    samples_per_patch (`int`, *optional*, defaults to 16):
        Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
    output_shape (`list[int]` or `tuple[int, ...]`, *optional*, defaults to `(1, 16, 224, 224)`):
        Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
        autoencoding model. This excludes the channel dimension.
    output_num_channels (`int`, *optional*, defaults to 512):
        Number of output channels for each modality decoder.

    Example:

    ```python
    >>> from transformers import PerceiverModel, PerceiverConfig

    >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
    >>> configuration = PerceiverConfig()

    >>> # Initializing a model from the deepmind/language-perceiver style configuration
    >>> model = PerceiverModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Unannotated on purpose: a plain class attribute identifying the model family.
    model_type = "perceiver"

    # NOTE(review): `d_model`, `hidden_act`, `attention_probs_dropout_prob`,
    # `initializer_range`, `layer_norm_eps`, `vocab_size` and
    # `max_position_embeddings` are not described in the docstring above;
    # presumably `auto_docstring` supplies their descriptions -- confirm.
    # Field order is kept exactly as declared, in case it is part of the
    # generated constructor signature.
    num_latents: int = 256
    d_latents: int = 1280
    d_model: int = 768
    num_blocks: int = 1
    num_self_attends_per_block: int = 26
    num_self_attention_heads: int = 8
    num_cross_attention_heads: int = 8
    qk_channels: int | None = None
    v_channels: int | None = None
    cross_attention_shape_for_attention: str = "kv"
    self_attention_widening_factor: int = 1
    cross_attention_widening_factor: int = 1
    hidden_act: str = "gelu"
    attention_probs_dropout_prob: float | int = 0.1
    initializer_range: float = 0.02
    layer_norm_eps: float = 1e-12
    use_query_residual: bool = True
    vocab_size: int = 262
    max_position_embeddings: int = 2048
    image_size: int | list[int] | tuple[int, int] = 56
    train_size: list[int] | tuple[int, ...] = (368, 496)
    num_frames: int = 16
    audio_samples_per_frame: int = 1920
    samples_per_patch: int = 16
    output_shape: list[int] | tuple[int, ...] = (1, 16, 224, 224)
    output_num_channels: int = 512
    # NOTE(review): private, undocumented field; its consumer is not visible
    # in this file -- verify against the modeling code before changing it.
    _label_trainable_num_channels: int = 1024


# Names exported by `from ... import *`.
__all__ = ["PerceiverConfig"]
|