configuration_instructblip.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """InstructBLIP model configuration"""
  15. from huggingface_hub.dataclasses import strict
  16. from ...configuration_utils import PreTrainedConfig
  17. from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
  18. from ...utils import auto_docstring, logging
  19. from ..auto import CONFIG_MAPPING, AutoConfig
  20. logger = logging.get_logger(__name__)
  21. @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl")
  22. @strict
  23. class InstructBlipVisionConfig(PreTrainedConfig):
  24. r"""
  25. Example:
  26. ```python
  27. >>> from transformers import InstructBlipVisionConfig, InstructBlipVisionModel
  28. >>> # Initializing a InstructBlipVisionConfig with Salesforce/instructblip-flan-t5-xl style configuration
  29. >>> configuration = InstructBlipVisionConfig()
  30. >>> # Initializing a InstructBlipVisionModel (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
  31. >>> model = InstructBlipVisionModel(configuration)
  32. >>> # Accessing the model configuration
  33. >>> configuration = model.config
  34. ```"""
  35. model_type = "instructblip_vision_model"
  36. base_config_key = "vision_config"
  37. hidden_size: int = 1408
  38. intermediate_size: int = 6144
  39. num_hidden_layers: int = 39
  40. num_attention_heads: int = 16
  41. image_size: int | list[int] | tuple[int, int] = 224
  42. patch_size: int | list[int] | tuple[int, int] = 14
  43. hidden_act: str = "gelu"
  44. layer_norm_eps: float = 1e-6
  45. attention_dropout: float | int = 0.0
  46. initializer_range: float = 1e-10
  47. qkv_bias: bool = True
  48. @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl")
  49. @strict
  50. class InstructBlipQFormerConfig(PreTrainedConfig):
  51. r"""
  52. cross_attention_frequency (`int`, *optional*, defaults to 2):
  53. The frequency of adding cross-attention to the Transformer layers.
  54. encoder_hidden_size (`int`, *optional*, defaults to 1408):
  55. The hidden size of the hidden states for cross-attention.
  56. Examples:
  57. ```python
  58. >>> from transformers import InstructBlipQFormerConfig, InstructBlipQFormerModel
  59. >>> # Initializing a InstructBLIP Salesforce/instructblip-flan-t5-xl style configuration
  60. >>> configuration = InstructBlipQFormerConfig()
  61. >>> # Initializing a model (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
  62. >>> model = InstructBlipQFormerModel(configuration)
  63. >>> # Accessing the model configuration
  64. >>> configuration = model.config
  65. ```"""
  66. model_type = "instructblip_qformer"
  67. base_config_key = "qformer_config"
  68. vocab_size: int = 30522
  69. hidden_size: int = 768
  70. num_hidden_layers: int = 12
  71. num_attention_heads: int = 12
  72. intermediate_size: int = 3072
  73. hidden_act: str = "gelu"
  74. hidden_dropout_prob: float | int = 0.1
  75. attention_probs_dropout_prob: float | int = 0.1
  76. max_position_embeddings: int = 512
  77. initializer_range: float = 0.02
  78. layer_norm_eps: float = 1e-12
  79. pad_token_id: int | None = 0
  80. cross_attention_frequency: int = 2
  81. encoder_hidden_size: int = 1408
  82. @auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl")
  83. @strict
  84. class InstructBlipConfig(PreTrainedConfig):
  85. r"""
  86. qformer_config (`dict`, *optional*):
  87. Dictionary of configuration options used to initialize [`InstructBlipQFormerConfig`].
  88. num_query_tokens (`int`, *optional*, defaults to 32):
  89. The number of query tokens passed through the Transformer.
  90. Example:
  91. ```python
  92. >>> from transformers import (
  93. ... InstructBlipVisionConfig,
  94. ... InstructBlipQFormerConfig,
  95. ... OPTConfig,
  96. ... InstructBlipConfig,
  97. ... InstructBlipForConditionalGeneration,
  98. ... )
  99. >>> # Initializing a InstructBlipConfig with Salesforce/instructblip-flan-t5-xl style configuration
  100. >>> configuration = InstructBlipConfig()
  101. >>> # Initializing a InstructBlipForConditionalGeneration (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
  102. >>> model = InstructBlipForConditionalGeneration(configuration)
  103. >>> # Accessing the model configuration
  104. >>> configuration = model.config
  105. >>> # We can also initialize a InstructBlipConfig from a InstructBlipVisionConfig, InstructBlipQFormerConfig and any PreTrainedConfig
  106. >>> # Initializing InstructBLIP vision, InstructBLIP Q-Former and language model configurations
  107. >>> vision_config = InstructBlipVisionConfig()
  108. >>> qformer_config = InstructBlipQFormerConfig()
  109. >>> text_config = OPTConfig()
  110. >>> config = InstructBlipConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config)
  111. ```"""
  112. model_type = "instructblip"
  113. attribute_map = {
  114. "image_token_id": "image_token_index",
  115. }
  116. sub_configs = {
  117. "text_config": AutoConfig,
  118. "qformer_config": InstructBlipQFormerConfig,
  119. "vision_config": InstructBlipVisionConfig,
  120. }
  121. vision_config: dict | PreTrainedConfig | None = None
  122. qformer_config: dict | PreTrainedConfig | None = None
  123. text_config: dict | PreTrainedConfig | None = None
  124. num_query_tokens: int = 32
  125. image_token_index: int | None = None
  126. initializer_factor: float = 1.0
  127. initializer_range: float = 0.02
  128. def __post_init__(self, **kwargs):
  129. if self.text_config is None:
  130. self.text_config = CONFIG_MAPPING["opt"]()
  131. logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
  132. elif isinstance(self.text_config, dict):
  133. text_model_type = self.text_config.get("model_type", "opt")
  134. self.text_config = CONFIG_MAPPING[text_model_type](**self.text_config)
  135. if self.qformer_config is None:
  136. self.qformer_config = InstructBlipQFormerConfig()
  137. logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.")
  138. elif isinstance(self.qformer_config, dict):
  139. self.qformer_config = InstructBlipQFormerConfig(**self.qformer_config)
  140. if self.vision_config is None:
  141. self.vision_config = InstructBlipVisionConfig()
  142. logger.info("`vision_config` is `None`. initializing the `InstructBlipVisionConfig` with default values.")
  143. elif isinstance(self.vision_config, dict):
  144. self.vision_config = InstructBlipVisionConfig(**self.vision_config)
  145. self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
  146. self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
  147. super().__post_init__(**kwargs)
  148. __all__ = ["InstructBlipConfig", "InstructBlipQFormerConfig", "InstructBlipVisionConfig"]