configuration_idefics2.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
  2. # Licensed under the Apache License, Version 2.0 (the "License");
  3. # you may not use this file except in compliance with the License.
  4. # You may obtain a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software
  9. # distributed under the License is distributed on an "AS IS" BASIS,
  10. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. # See the License for the specific language governing permissions and
  12. # limitations under the License.
  13. """Idefics2 model configuration"""
  14. from huggingface_hub.dataclasses import strict
  15. from ...configuration_utils import PreTrainedConfig
  16. from ...utils import auto_docstring, logging
  17. from ..auto import CONFIG_MAPPING, AutoConfig
# Module-level logger, named after this module, used for the info/warning messages emitted below.
logger = logging.get_logger(__name__)
@auto_docstring(checkpoint="HuggingFaceM4/idefics2-8b")
@strict
class Idefics2VisionConfig(PreTrainedConfig):
    r"""
    Example:
    ```python
    >>> from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
    >>> from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig
    >>> # Initializing an Idefics2VisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = Idefics2VisionConfig()
    >>> # Initializing an Idefics2VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = Idefics2VisionTransformer(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Identifier string for this configuration class.
    model_type = "idefics2_vision"
    # Same key as the `vision_config` entry of `Idefics2Config.sub_configs`.
    # NOTE(review): how the base class consumes `base_config_key` is not visible here.
    base_config_key = "vision_config"

    # Declared fields and their defaults. Declaration order is kept as-is on purpose:
    # NOTE(review): `@strict` presumably derives the constructor from these annotations,
    # so reordering them could change positional-argument order -- confirm before moving any.
    hidden_size: int = 768
    intermediate_size: int = 3072
    num_hidden_layers: int = 12
    num_attention_heads: int = 12
    num_channels: int = 3
    # Both sizes accept a single int or a list/tuple of ints.
    image_size: int | list[int] | tuple[int, int] = 224
    patch_size: int | list[int] | tuple[int, int] = 32
    hidden_act: str = "gelu_pytorch_tanh"
    layer_norm_eps: float = 1e-6
    # Typed as `float | int` so an integer value (e.g. `0`) is also accepted.
    attention_dropout: float | int = 0.0
    initializer_range: float = 0.02
  47. @auto_docstring(checkpoint="HuggingFaceM4/idefics2-8b")
  48. @strict
  49. class Idefics2PerceiverConfig(PreTrainedConfig):
  50. r"""
  51. resampler_n_latents (`int`, *optional*, defaults to 64):
  52. Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
  53. resampler_depth (`int`, *optional*, defaults to 3):
  54. Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
  55. resampler_n_heads (`int`, *optional*, defaults to 16):
  56. Number of heads in each Transformer block (for multi-headed self-attention).
  57. resampler_head_dim (`int`, *optional*, defaults to 96):
  58. Dimensionality of each head projection in the Transformer block.
  59. """
  60. model_type = "idefics2_perceiver"
  61. hidden_act: str = "silu"
  62. hidden_size: int = 4096
  63. rms_norm_eps: float = 1e-06
  64. resampler_n_latents: int = 64
  65. resampler_depth: int = 3
  66. resampler_n_heads: int = 16
  67. resampler_head_dim: int = 96
  68. num_key_value_heads: int = 4
  69. attention_dropout: float | int = 0.0
  70. initializer_range: float = 0.02
  71. def validate_architecture(self):
  72. """Part of `@strict`-powered validation. Validates the architecture of the config."""
  73. if self.num_key_value_heads > self.resampler_n_heads:
  74. raise ValueError(
  75. f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
  76. f" resampler_n_heads={self.resampler_n_heads}"
  77. )
  78. @auto_docstring(checkpoint="HuggingFaceM4/idefics2-8b")
  79. @strict
  80. class Idefics2Config(PreTrainedConfig):
  81. r"""
  82. perceiver_config (`IdeficsPerceiverConfig` or `dict`, *optional*):
  83. Custom perceiver config or dict
  84. Example:
  85. ```python
  86. >>> from transformers import Idefics2Model, Idefics2Config
  87. >>> # Initializing configuration
  88. >>> configuration = Idefics2Config()
  89. >>> # Initializing a model from the configuration
  90. >>> model = Idefics2Model(configuration)
  91. >>> # Accessing the model configuration
  92. >>> configuration = model.config
  93. ```"""
  94. model_type = "idefics2"
  95. sub_configs = {
  96. "text_config": AutoConfig,
  97. "perceiver_config": Idefics2PerceiverConfig,
  98. "vision_config": Idefics2VisionConfig,
  99. }
  100. use_cache: bool = True
  101. image_token_id: int = 32_001
  102. tie_word_embeddings: bool = False
  103. vision_config: dict | PreTrainedConfig | None = None
  104. perceiver_config: dict | PreTrainedConfig | None = None
  105. text_config: dict | PreTrainedConfig | None = None
  106. def __post_init__(self, **kwargs):
  107. if self.perceiver_config is None:
  108. self.perceiver_config = Idefics2PerceiverConfig()
  109. logger.info("perciver_config is None, using default perceiver config")
  110. elif isinstance(self.perceiver_config, dict):
  111. self.perceiver_config = Idefics2PerceiverConfig(**self.perceiver_config)
  112. if self.vision_config is None:
  113. self.vision_config = Idefics2VisionConfig()
  114. logger.info("vision_config is None, using default vision config")
  115. elif isinstance(self.vision_config, dict):
  116. self.vision_config = Idefics2VisionConfig(**self.vision_config)
  117. if isinstance(self.text_config, dict):
  118. self.text_config["model_type"] = self.text_config.get("model_type", "mistral")
  119. self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config)
  120. elif self.text_config is None:
  121. logger.info("text_config is None, using default text config")
  122. self.text_config = CONFIG_MAPPING["mistral"](
  123. max_position_embeddings=4096 * 8,
  124. rms_norm_eps=1e-5,
  125. # None in the original configuration_mistral, we set it to the unk_token_id
  126. pad_token_id=0,
  127. )
  128. if self.text_config.hidden_size != self.perceiver_config.hidden_size:
  129. self.perceiver_config.hidden_size = self.text_config.hidden_size
  130. self.perceiver_config.rms_norm_eps = self.text_config.rms_norm_eps
  131. logger.warning_once(
  132. "Perceiver config has a different `hidden_size` than text config, which means default values were used. "
  133. "In your model's config on the hub, add `hidden_size` and `rms_norm_eps` keys under the `perceiver_config` dict. "
  134. )
  135. super().__post_init__(**kwargs)
# Names exported by `from ... import *`; only the composite `Idefics2Config` is listed.
__all__ = ["Idefics2Config"]