configuration_align.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """ALIGN model configuration"""
  15. from huggingface_hub.dataclasses import strict
  16. from ...configuration_utils import PreTrainedConfig
  17. from ...utils import auto_docstring, logging
# Module-level logger, obtained from the project's `logging` utility and keyed by this module's name.
logger = logging.get_logger(__name__)
# Configuration for the ALIGN text model: a set of typed fields with defaults, no custom logic.
@auto_docstring(checkpoint="kakaobrain/align-base")
@strict
class AlignTextConfig(PreTrainedConfig):
    r"""
    Example:

    ```python
    >>> from transformers import AlignTextConfig, AlignTextModel

    >>> # Initializing a AlignTextConfig with kakaobrain/align-base style configuration
    >>> configuration = AlignTextConfig()

    >>> # Initializing a AlignTextModel (with random weights) from the kakaobrain/align-base style configuration
    >>> model = AlignTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Unannotated class-level identifiers. `base_config_key` uses the same "text_config" key
    # that `AlignConfig.sub_configs` maps to this class later in this module.
    model_type = "align_text_model"
    base_config_key = "text_config"

    # Typed fields and their defaults.
    # NOTE(review): this docstring carries no per-field descriptions; presumably `auto_docstring`
    # supplies them - confirm before documenting individual fields here.
    vocab_size: int = 30522
    hidden_size: int = 768
    num_hidden_layers: int = 12
    num_attention_heads: int = 12
    intermediate_size: int = 3072
    hidden_act: str = "gelu"
    # Both dropout probabilities are annotated to accept an int as well as a float.
    hidden_dropout_prob: float | int = 0.1
    attention_probs_dropout_prob: float | int = 0.1
    max_position_embeddings: int = 512
    type_vocab_size: int = 2
    initializer_range: float = 0.02
    layer_norm_eps: float = 1e-12
    # Special-token ids are optional: only the padding id has a non-None default, and
    # `eos_token_id` may also be given as a list of ids.
    pad_token_id: int | None = 0
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None
  50. @auto_docstring(checkpoint="kakaobrain/align-base")
  51. @strict
  52. class AlignVisionConfig(PreTrainedConfig):
  53. r"""
  54. width_coefficient (`float`, *optional*, defaults to 2.0):
  55. Scaling coefficient for network width at each stage.
  56. depth_coefficient (`float`, *optional*, defaults to 3.1):
  57. Scaling coefficient for network depth at each stage.
  58. depth_divisor (`int`, *optional*, defaults to 8):
  59. A unit of network width.
  60. kernel_sizes (`list[int]`, *optional*, defaults to `[3, 3, 5, 3, 5, 5, 3]`):
  61. List of kernel sizes to be used in each block.
  62. in_channels (`list[int]`, *optional*, defaults to `[32, 16, 24, 40, 80, 112, 192]`):
  63. List of input channel sizes to be used in each block for convolutional layers.
  64. out_channels (`list[int]`, *optional*, defaults to `[16, 24, 40, 80, 112, 192, 320]`):
  65. List of output channel sizes to be used in each block for convolutional layers.
  66. depthwise_padding (`list[int]`, *optional*, defaults to `[]`):
  67. List of block indices with square padding.
  68. strides (`list[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
  69. List of stride sizes to be used in each block for convolutional layers.
  70. num_block_repeats (`list[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
  71. List of the number of times each block is to repeated.
  72. expand_ratios (`list[int]`, *optional*, defaults to `[1, 6, 6, 6, 6, 6, 6]`):
  73. List of scaling coefficient of each block.
  74. squeeze_expansion_ratio (`float`, *optional*, defaults to 0.25):
  75. Squeeze expansion ratio.
  76. hidden_dim (`int`, *optional*, defaults to 1280):
  77. The hidden dimension of the layer before the classification head.
  78. pooling_type (`str` or `function`, *optional*, defaults to `"mean"`):
  79. Type of final pooling to be applied before the dense classification head. Available options are [`"mean"`,
  80. `"max"`]
  81. batch_norm_momentum (`float`, *optional*, defaults to 0.99):
  82. The momentum used by the batch normalization layers.
  83. drop_connect_rate (`float`, *optional*, defaults to 0.2):
  84. The drop rate for skip connections.
  85. Example:
  86. ```python
  87. >>> from transformers import AlignVisionConfig, AlignVisionModel
  88. >>> # Initializing a AlignVisionConfig with kakaobrain/align-base style configuration
  89. >>> configuration = AlignVisionConfig()
  90. >>> # Initializing a AlignVisionModel (with random weights) from the kakaobrain/align-base style configuration
  91. >>> model = AlignVisionModel(configuration)
  92. >>> # Accessing the model configuration
  93. >>> configuration = model.config
  94. ```"""
  95. model_type = "align_vision_model"
  96. base_config_key = "vision_config"
  97. num_channels: int = 3
  98. image_size: int | list[int] | tuple[int, int] = 600
  99. width_coefficient: float = 2.0
  100. depth_coefficient: float = 3.1
  101. depth_divisor: int = 8
  102. kernel_sizes: list[int] | tuple[int, ...] = (3, 3, 5, 3, 5, 5, 3)
  103. in_channels: list[int] | tuple[int, ...] = (32, 16, 24, 40, 80, 112, 192)
  104. out_channels: list[int] | tuple[int, ...] = (16, 24, 40, 80, 112, 192, 320)
  105. depthwise_padding: list | tuple[int, ...] = ()
  106. strides: list[int] | tuple[int, ...] = (1, 2, 2, 2, 1, 2, 1)
  107. num_block_repeats: list[int] | tuple[int, ...] = (1, 2, 2, 3, 3, 4, 1)
  108. expand_ratios: list[int] | tuple[int, ...] = (1, 6, 6, 6, 6, 6, 6)
  109. squeeze_expansion_ratio: float = 0.25
  110. hidden_act: str = "swish"
  111. hidden_dim: int = 2560
  112. pooling_type: str = "mean"
  113. initializer_range: float = 0.02
  114. batch_norm_eps: float = 0.001
  115. batch_norm_momentum: float = 0.99
  116. drop_connect_rate: float | int = 0.2
  117. def __post_init__(self, **kwargs):
  118. self.num_hidden_layers = sum(self.num_block_repeats) * 4
  119. for attr in [
  120. "kernel_sizes",
  121. "in_channels",
  122. "out_channels",
  123. "depthwise_padding",
  124. "strides",
  125. "num_block_repeats",
  126. "expand_ratios",
  127. ]:
  128. # cast tuple so it can be JSON-ized when saving
  129. setattr(self, attr, list(getattr(self, attr)))
  130. super().__post_init__(**kwargs)
  131. @auto_docstring(checkpoint="kakaobrain/align-base")
  132. @strict
  133. class AlignConfig(PreTrainedConfig):
  134. r"""
  135. temperature_init_value (`float`, *optional*, defaults to 1.0):
  136. The initial value of the *temperature* parameter. Default is used as per the original ALIGN implementation.
  137. Example:
  138. ```python
  139. >>> from transformers import AlignConfig, AlignModel
  140. >>> # Initializing a AlignConfig with kakaobrain/align-base style configuration
  141. >>> configuration = AlignConfig()
  142. >>> # Initializing a AlignModel (with random weights) from the kakaobrain/align-base style configuration
  143. >>> model = AlignModel(configuration)
  144. >>> # Accessing the model configuration
  145. >>> configuration = model.config
  146. >>> # We can also initialize a AlignConfig from a AlignTextConfig and a AlignVisionConfig
  147. >>> from transformers import AlignTextConfig, AlignVisionConfig
  148. >>> # Initializing ALIGN Text and Vision configurations
  149. >>> config_text = AlignTextConfig()
  150. >>> config_vision = AlignVisionConfig()
  151. >>> config = AlignConfig(text_config=config_text, vision_config=config_vision)
  152. ```"""
  153. model_type = "align"
  154. sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig}
  155. text_config: dict | PreTrainedConfig | None = None
  156. vision_config: dict | PreTrainedConfig | None = None
  157. projection_dim: int = 640
  158. temperature_init_value: float = 1.0
  159. initializer_range: float = 0.02
  160. def __post_init__(self, **kwargs):
  161. if self.text_config is None:
  162. self.text_config = AlignTextConfig()
  163. logger.info("`text_config` is `None`. Initializing the `AlignTextConfig` with default values.")
  164. elif isinstance(self.text_config, dict):
  165. self.text_config = AlignTextConfig(**self.text_config)
  166. if self.vision_config is None:
  167. self.vision_config = AlignVisionConfig()
  168. logger.info("`vision_config` is `None`. initializing the `AlignVisionConfig` with default values.")
  169. elif isinstance(self.vision_config, dict):
  170. self.vision_config = AlignVisionConfig(**self.vision_config)
  171. super().__post_init__(**kwargs)
# Names exported as this module's public API: the three configuration classes defined above.
__all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"]