configuration_siglip2.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/siglip2/modular_siglip2.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_siglip2.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # Copyright 2025 The HuggingFace Inc. team.
  8. #
  9. # Licensed under the Apache License, Version 2.0 (the "License");
  10. # you may not use this file except in compliance with the License.
  11. # You may obtain a copy of the License at
  12. #
  13. # http://www.apache.org/licenses/LICENSE-2.0
  14. #
  15. # Unless required by applicable law or agreed to in writing, software
  16. # distributed under the License is distributed on an "AS IS" BASIS,
  17. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  18. # See the License for the specific language governing permissions and
  19. # limitations under the License.
  20. from huggingface_hub.dataclasses import strict
  21. from ...configuration_utils import PreTrainedConfig
  22. from ...utils import auto_docstring, logging
# Module-level logger obtained from the package's `logging` helper and named
# after this module (`__name__`).
logger = logging.get_logger(__name__)
  24. @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex")
  25. @strict
  26. class Siglip2TextConfig(PreTrainedConfig):
  27. r"""
  28. Example:
  29. ```python
  30. >>> from transformers import Siglip2TextConfig, Siglip2TextModel
  31. >>> # Initializing a Siglip2TextConfig with google/siglip2-base-patch16-224 style configuration
  32. >>> configuration = Siglip2TextConfig()
  33. >>> # Initializing a Siglip2TextModel (with random weights) from the google/siglip2-base-patch16-224 style configuration
  34. >>> model = Siglip2TextModel(configuration)
  35. >>> # Accessing the model configuration
  36. >>> configuration = model.config
  37. ```"""
  38. model_type = "siglip2_text_model"
  39. base_config_key = "text_config"
  40. vocab_size: int = 32000
  41. hidden_size: int = 768
  42. intermediate_size: int = 3072
  43. num_hidden_layers: int = 12
  44. num_attention_heads: int = 12
  45. max_position_embeddings: int = 64
  46. hidden_act: str = "gelu_pytorch_tanh"
  47. layer_norm_eps: float = 1e-6
  48. attention_dropout: float | int = 0.0
  49. # This differs from `CLIPTokenizer`'s default and from openai/siglip2
  50. # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
  51. pad_token_id: int | None = 1
  52. bos_token_id: int | None = 49406
  53. eos_token_id: int | list[int] | None = 49407
  54. projection_size: int | None = None
  55. def __post_init__(self, **kwargs):
  56. self.projection_size = self.projection_size if self.projection_size is not None else self.hidden_size
  57. super().__post_init__(**kwargs)
  58. @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex")
  59. @strict
  60. class Siglip2VisionConfig(PreTrainedConfig):
  61. r"""
  62. num_patches (`int`, *optional*, defaults to 256):
  63. The number of patches in the image with the size of (`patch_size`, `patch_size`).
  64. The image is resized to fill maximum of this number of patches, and to preserve
  65. the aspect ratio. In case the resulted number of patches is lower, the image is
  66. padded in "patch" dimension.
  67. Example:
  68. ```python
  69. >>> from transformers import Siglip2VisionConfig, Siglip2VisionModel
  70. >>> # Initializing a Siglip2VisionConfig with google/siglip2-base-patch16-naflex style configuration
  71. >>> configuration = Siglip2VisionConfig()
  72. >>> # Initializing a Siglip2VisionModel (with random weights) from the google/siglip2-base-patch16-naflex style configuration
  73. >>> model = Siglip2VisionModel(configuration)
  74. >>> # Accessing the model configuration
  75. >>> configuration = model.config
  76. ```"""
  77. model_type = "siglip2_vision_model"
  78. base_config_key = "vision_config"
  79. hidden_size: int = 768
  80. intermediate_size: int = 3072
  81. num_hidden_layers: int = 12
  82. num_attention_heads: int = 12
  83. num_channels: int = 3
  84. patch_size: int | list[int] | tuple[int, int] = 16
  85. hidden_act: str = "gelu_pytorch_tanh"
  86. layer_norm_eps: float = 1e-6
  87. attention_dropout: float | int = 0.0
  88. num_patches: int = 256
  89. @auto_docstring(checkpoint="google/siglip2-base-patch16-naflex")
  90. @strict
  91. class Siglip2Config(PreTrainedConfig):
  92. r"""
  93. Example:
  94. ```python
  95. >>> from transformers import Siglip2Config, Siglip2Model
  96. >>> # Initializing a Siglip2Config with google/siglip2-base-patch16-224 style configuration
  97. >>> configuration = Siglip2Config()
  98. >>> # Initializing a Siglip2Model (with random weights) from the google/siglip2-base-patch16-224 style configuration
  99. >>> model = Siglip2Model(configuration)
  100. >>> # Accessing the model configuration
  101. >>> configuration = model.config
  102. >>> # We can also initialize a Siglip2Config from a Siglip2TextConfig and a Siglip2VisionConfig
  103. >>> from transformers import Siglip2TextConfig, Siglip2VisionConfig
  104. >>> # Initializing a Siglip2Text and Siglip2Vision configuration
  105. >>> config_text = Siglip2TextConfig()
  106. >>> config_vision = Siglip2VisionConfig()
  107. >>> config = Siglip2Config(text_config=config_text, vision_config=config_vision)
  108. ```"""
  109. model_type = "siglip2"
  110. sub_configs = {"text_config": Siglip2TextConfig, "vision_config": Siglip2VisionConfig}
  111. text_config: dict | PreTrainedConfig | None = None
  112. vision_config: dict | PreTrainedConfig | None = None
  113. initializer_factor: float = 1.0
  114. def __post_init__(self, **kwargs):
  115. if self.text_config is None:
  116. self.text_config = Siglip2TextConfig()
  117. logger.info("`text_config` is `None`. Initializing the `Siglip2TextConfig` with default values.")
  118. elif isinstance(self.text_config, dict):
  119. self.text_config = Siglip2TextConfig(**self.text_config)
  120. if self.vision_config is None:
  121. self.vision_config = Siglip2VisionConfig()
  122. logger.info("`vision_config` is `None`. initializing the `Siglip2VisionConfig` with default values.")
  123. elif isinstance(self.vision_config, dict):
  124. self.vision_config = Siglip2VisionConfig(**self.vision_config)
  125. super().__post_init__(**kwargs)
  126. __all__ = ["Siglip2Config", "Siglip2TextConfig", "Siglip2VisionConfig"]