yichael
/
image-match


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
							# Copyright Deepmind and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perceiver model configuration"""

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring


@auto_docstring(checkpoint="deepmind/language-perceiver")
@strict
class PerceiverConfig(PreTrainedConfig):
    r"""
    num_latents (`int`, *optional*, defaults to 256):
        The number of latents.
    d_latents (`int`, *optional*, defaults to 1280):
        Dimension of the latent embeddings.
    num_blocks (`int`, *optional*, defaults to 1):
        Number of blocks in the Transformer encoder.
    num_self_attends_per_block (`int`, *optional*, defaults to 26):
        The number of self-attention layers per block.
    num_self_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each self-attention layer in the Transformer encoder.
    num_cross_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each cross-attention layer in the Transformer encoder.
    qk_channels (`int`, *optional*):
        Dimension to project the queries + keys before applying attention in the cross-attention and self-attention
        layers of the encoder. Will default to preserving the dimension of the queries if not specified.
    v_channels (`int`, *optional*):
        Dimension to project the values before applying attention in the cross-attention and self-attention layers
        of the encoder. Will default to preserving the dimension of the queries if not specified.
    cross_attention_shape_for_attention (`str`, *optional*, defaults to `"kv"`):
        Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder.
    self_attention_widening_factor (`int`, *optional*, defaults to 1):
        Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder.
    cross_attention_widening_factor (`int`, *optional*, defaults to 1):
        Dimension of the feed-forward layer in the self-attention layers of the Transformer encoder.
    use_query_residual (`float`, *optional*, defaults to `True`):
        Whether to add a query residual in the cross-attention layer of the encoder.
    image_size (`int`, *optional*, defaults to 56):
        Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`].
    train_size (`list[int]`, *optional*, defaults to `[368, 496]`):
        Training size of the images for the optical flow model.
    num_frames (`int`, *optional*, defaults to 16):
        Number of video frames used for the multimodal autoencoding model.
    audio_samples_per_frame (`int`, *optional*, defaults to 1920):
        Number of audio samples per frame for the multimodal autoencoding model.
    samples_per_patch (`int`, *optional*, defaults to 16):
        Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
    output_shape (`list[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
        Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
        autoencoding model. This excludes the channel dimension.
    output_num_channels (`int`, *optional*, defaults to 512):
        Number of output channels for each modalitiy decoder.

    Example:

    ```python
    >>> from transformers import PerceiverModel, PerceiverConfig

    >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
    >>> configuration = PerceiverConfig()

    >>> # Initializing a model from the deepmind/language-perceiver style configuration
    >>> model = PerceiverModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "perceiver"

    num_latents: int = 256
    d_latents: int = 1280
    d_model: int = 768
    num_blocks: int = 1
    num_self_attends_per_block: int = 26
    num_self_attention_heads: int = 8
    num_cross_attention_heads: int = 8
    qk_channels: int | None = None
    v_channels: int | None = None
    cross_attention_shape_for_attention: str = "kv"
    self_attention_widening_factor: int = 1
    cross_attention_widening_factor: int = 1
    hidden_act: str = "gelu"
    attention_probs_dropout_prob: float | int = 0.1
    initializer_range: float = 0.02
    layer_norm_eps: float = 1e-12
    use_query_residual: bool = True
    vocab_size: int = 262
    max_position_embeddings: int = 2048
    image_size: int | list[int] | tuple[int, int] = 56
    train_size: list[int] | tuple[int, ...] = (368, 496)
    num_frames: int = 16
    audio_samples_per_frame: int = 1920
    samples_per_patch: int = 16
    output_shape: list[int] | tuple[int, ...] = (1, 16, 224, 224)
    output_num_channels: int = 512
    _label_trainable_num_channels: int = 1024


__all__ = ["PerceiverConfig"]