configuration_clvp.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """CLVP model configuration"""
  15. import os
  16. from huggingface_hub.dataclasses import strict
  17. from ...configuration_utils import PreTrainedConfig
  18. from ...utils import auto_docstring, logging
# Module-level logger, named after this module via `__name__`.
logger = logging.get_logger(__name__)
  20. @auto_docstring(checkpoint="susnato/clvp_dev")
  21. @strict
  22. class ClvpEncoderConfig(PreTrainedConfig):
  23. r"""
  24. use_rotary_embedding (`bool`, *optional*, defaults to `True`):
  25. Whether to use rotary_embedding or not.
  26. use_attention_bias (`bool`, *optional*, defaults to `False`):
  27. Whether to use bias in Query, Key and Value layers during self attention.
  28. summary_type (`str`, *optional*, defaults to `"mean"`):
  29. What strategy to use to get pooler_output from the last_hidden_state. `"last"`, `"first"`, `"mean"` and
  30. `"cls_index"` are supported.
  31. Example:
  32. ```python
  33. >>> from transformers import ClvpEncoderConfig, ClvpEncoder
  34. >>> # Initializing a ClvpEncoderConfig with susnato/clvp_dev style configuration
  35. >>> encoder_configuration = ClvpEncoderConfig()
  36. >>> # Initializing a ClvpEncoder (with random weights) from the susnato/clvp_dev style configuration
  37. >>> model = ClvpEncoder(encoder_configuration)
  38. >>> # Accessing the model configuration
  39. >>> configuration = model.config
  40. ```"""
  41. model_type = "clvp_encoder"
  42. base_config_key = ["text_config", "speech_config"]
  43. vocab_size: int = 256
  44. hidden_size: int = 768
  45. intermediate_size: int = 1536
  46. projection_dim: int = 768
  47. num_hidden_layers: int = 20
  48. num_attention_heads: int = 12
  49. hidden_act: str = "gelu"
  50. layer_norm_eps: float = 1e-5
  51. attention_dropout: float | int = 0.1
  52. dropout: float | int = 0.1
  53. use_rotary_embedding: bool = True
  54. use_attention_bias: bool = False
  55. summary_type: str = "mean"
  56. initializer_factor: float = 1.0
  57. bos_token_id: int | None = 255
  58. eos_token_id: int | list[int] | None = 0
  59. pad_token_id: int | None = None
  60. @classmethod
  61. def from_pretrained(
  62. cls, pretrained_model_name_or_path: str | os.PathLike, config_type: str = "text_config", **kwargs
  63. ):
  64. config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
  65. # make sure to have the config_type be either "text_config" or "speech_config"
  66. # this is to make sure that we can load only text or speech configs from the nested ClvpConfig.
  67. if config_type not in cls.base_config_key:
  68. raise ValueError(
  69. f"We can only load either 'text_config' or 'speech_config' but you are trying to load{config_type}"
  70. )
  71. # get the text config dict if we are loading from ClvpConfig
  72. if config_dict.get("model_type") == "clvp":
  73. config_dict = config_dict[config_type]
  74. if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
  75. logger.warning(
  76. f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
  77. f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
  78. )
  79. return cls.from_dict(config_dict, **kwargs)
  80. @auto_docstring(checkpoint="susnato/clvp_dev")
  81. @strict
  82. class ClvpDecoderConfig(PreTrainedConfig):
  83. r"""
  84. max_text_tokens (`int`, *optional*, defaults to 404):
  85. The maximum sequence length of text tokens that this model might ever be used with. Similar to
  86. `n_positions` in `GPT2Config`.
  87. n_inner (`int`, *optional*):
  88. Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `hidden_size`.
  89. num_mel_attn_blocks (`int`, *optional*, defaults to 6):
  90. Denotes the number of self attention layers in [`ClvpConditioningEncoder`].
  91. summary_type (`string`, *optional*, defaults to `"cls_index"`):
  92. Argument used when doing sequence summary.
  93. Has to be one of the following options:
  94. - `"last"`: Take the last token hidden state (like XLNet).
  95. - `"first"`: Take the first token hidden state (like BERT).
  96. - `"mean"`: Take the mean of all tokens hidden states.
  97. - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
  98. - `"attn"`: Not implemented now, use multi-head attention.
  99. summary_use_proj (`bool`, *optional*, defaults to `True`):
  100. Whether or not to add a projection after the vector extraction.
  101. summary_activation (`str`, *optional*):
  102. Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
  103. summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
  104. Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
  105. summary_first_dropout (`float`, *optional*, defaults to 0.1):
  106. The dropout ratio to be used after the projection and activation.
  107. feature_size (`int`, *optional*, defaults to 80):
  108. The feature dimension of the extracted mel features. This value is used in [`ClvpConditioningEncoder`].
  109. use_attention_bias (`bool`, *optional*, defaults to `True`):
  110. Whether to use bias in Query, Key and Value layers during self attention.
  111. decoder_fixing_codes (`list`, *optional*, defaults to `[83, 45, 45, 248]`):
  112. These values are used in the method `fix_speech_decoder_output` to fix decoder generated outputs.
  113. Example:
  114. ```python
  115. >>> from transformers import ClvpDecoderConfig, ClvpDecoder
  116. >>> # Initializing a ClvpDecoderConfig with susnato/clvp_dev style configuration
  117. >>> decoder_configuration = ClvpDecoderConfig()
  118. >>> # Initializing a ClvpDecoder (with random weights) from the susnato/clvp_dev style configuration
  119. >>> model = ClvpDecoder(decoder_configuration)
  120. >>> # Accessing the model configuration
  121. >>> configuration = model.config
  122. ```"""
  123. model_type = "clvp_decoder"
  124. base_config_key = "decoder_config"
  125. vocab_size: int = 8194
  126. max_position_embeddings: int = 608
  127. max_text_tokens: int = 404
  128. hidden_size: int = 1024
  129. num_hidden_layers: int = 30
  130. num_attention_heads: int = 16
  131. n_inner: int | None = None
  132. num_mel_attn_blocks: int = 6
  133. activation_function: str = "gelu_new"
  134. resid_pdrop: float | int = 0.1
  135. embd_pdrop: float | int = 0.1
  136. attention_dropout: float | int = 0.1
  137. layer_norm_epsilon: float = 1e-5
  138. initializer_range: float = 0.02
  139. summary_type: str = "cls_index"
  140. summary_use_proj: bool = True
  141. summary_activation: str | None = None
  142. summary_proj_to_labels: bool = True
  143. summary_first_dropout: float | int = 0.1
  144. use_cache: bool = True
  145. bos_token_id: int | None = 8192
  146. eos_token_id: int | list[int] | None = 8193
  147. pad_token_id: int | None = None
  148. feature_size: int = 80
  149. use_attention_bias: bool = True
  150. initializer_factor: float = 1.0
  151. decoder_fixing_codes: list[int] | tuple[int, ...] = (83, 45, 45, 248)
  152. add_cross_attention: bool = False
  153. @auto_docstring(checkpoint="susnato/clvp_dev")
  154. @strict
  155. class ClvpConfig(PreTrainedConfig):
  156. r"""
  157. speech_config (`dict`, *optional*):
  158. Dictionary of configuration options used to initialize CLVP speech encoder.
  159. decoder_config (`dict`, *optional*):
  160. Dictionary of configuration options used to initialize [`ClvpDecoderConfig`].
  161. Example:
  162. ```python
  163. >>> from transformers import ClvpConfig, ClvpModelForConditionalGeneration
  164. >>> # Initializing a ClvpConfig with susnato/clvp_dev style configuration
  165. >>> configuration = ClvpConfig()
  166. >>> # Initializing a ClvpModelForConditionalGeneration (with random weights) from the susnato/clvp_dev style configuration
  167. >>> model = ClvpModelForConditionalGeneration(configuration)
  168. >>> # Accessing the model configuration
  169. >>> configuration = model.config
  170. >>> # We can also initialize a CLVPConfig from a CLVPTextConfig, CLVPSpeechConfig and a CLVPAutoRegressiveConfig
  171. >>> from transformers import ClvpEncoderConfig, ClvpDecoderConfig
  172. >>> # Initializing a CLVP text, CLVP speech and CLVP decoder configuration
  173. >>> config_text = ClvpEncoderConfig()
  174. >>> config_speech = ClvpEncoderConfig()
  175. >>> decoder_config = ClvpDecoderConfig()
  176. >>> config = ClvpConfig(config_text, config_speech, decoder_config)
  177. ```"""
  178. model_type = "clvp"
  179. sub_configs = {
  180. "text_config": ClvpEncoderConfig,
  181. "speech_config": ClvpEncoderConfig,
  182. "decoder_config": ClvpDecoderConfig,
  183. }
  184. text_config: dict | PreTrainedConfig | None = None
  185. speech_config: dict | PreTrainedConfig | None = None
  186. decoder_config: dict | PreTrainedConfig | None = None
  187. projection_dim: int = 768
  188. logit_scale_init_value: float = 2.6592
  189. initializer_factor: float = 1.0
  190. def __post_init__(self, **kwargs):
  191. if self.text_config is None:
  192. self.text_config = ClvpEncoderConfig()
  193. logger.info("`text_config` is `None`. initializing the `ClvpEncoderConfig` with default values.")
  194. elif isinstance(self.text_config, dict):
  195. self.text_config = ClvpEncoderConfig(**self.text_config)
  196. if self.speech_config is None:
  197. self.speech_config = ClvpEncoderConfig()
  198. logger.info("`speech_config` is `None`. initializing the `ClvpEncoderConfig` with default values.")
  199. elif isinstance(self.speech_config, dict):
  200. self.speech_config = ClvpEncoderConfig(**self.speech_config)
  201. if self.decoder_config is None:
  202. self.decoder_config = ClvpDecoderConfig()
  203. logger.info("`image_config` is `None`. initializing the `ClvpDecoderConfig` with default values.")
  204. elif isinstance(self.decoder_config, dict):
  205. self.decoder_config = ClvpDecoderConfig(**self.decoder_config)
  206. super().__post_init__(**kwargs)
# Public names of this module: the three configuration classes defined above.
__all__ = ["ClvpConfig", "ClvpDecoderConfig", "ClvpEncoderConfig"]