# File: configuration_bark.py (11 KB)
# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BARK model configuration"""
  15. from huggingface_hub.dataclasses import strict
  16. from ...configuration_utils import PreTrainedConfig
  17. from ...utils import auto_docstring, logging
  18. from ..auto import CONFIG_MAPPING, AutoConfig
  19. logger = logging.get_logger(__name__)
  20. @auto_docstring(checkpoint="suno/bark")
  21. @strict
  22. class BarkSubModelConfig(PreTrainedConfig):
  23. r"""
  24. block_size (`int`, *optional*, defaults to 1024):
  25. The maximum sequence length that this model might ever be used with. Typically set this to something large
  26. just in case (e.g., 512 or 1024 or 2048).
  27. input_vocab_size (`int`, *optional*, defaults to 10_048):
  28. Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the
  29. `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with
  30. regards to the chosen sub-model.
  31. output_vocab_size (`int`, *optional*, defaults to 10_048):
  32. Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented
  33. by the: `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought
  34. with regards to the chosen sub-model.
  35. bias (`bool`, *optional*, defaults to `True`):
  36. Whether or not to use bias in the linear layers and layer norm layers.
  37. """
  38. keys_to_ignore_at_inference = ["past_key_values"]
  39. attribute_map = {
  40. "num_attention_heads": "num_heads",
  41. "num_hidden_layers": "num_layers",
  42. "vocab_size": "input_vocab_size",
  43. "window_size": "block_size",
  44. }
  45. block_size: int = 1024
  46. input_vocab_size: int = 10_048
  47. output_vocab_size: int = 10_048
  48. num_layers: int = 12
  49. num_heads: int = 12
  50. hidden_size: int = 768
  51. dropout: float | int = 0.0
  52. bias: bool = True
  53. initializer_range: float = 0.02
  54. use_cache: bool = True
  55. @auto_docstring(checkpoint="suno/bark")
  56. @strict
  57. class BarkSemanticConfig(BarkSubModelConfig):
  58. r"""
  59. block_size (`int`, *optional*, defaults to 1024):
  60. The maximum sequence length that this model might ever be used with. Typically set this to something large
  61. just in case (e.g., 512 or 1024 or 2048).
  62. input_vocab_size (`int`, *optional*, defaults to 10_048):
  63. Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the
  64. `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with
  65. regards to the chosen sub-model.
  66. output_vocab_size (`int`, *optional*, defaults to 10_048):
  67. Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented
  68. by the: `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought
  69. with regards to the chosen sub-model.
  70. bias (`bool`, *optional*, defaults to `True`):
  71. Whether or not to use bias in the linear layers and layer norm layers
  72. Example:
  73. ```python
  74. >>> from transformers import BarkSemanticConfig, BarkSemanticModel
  75. >>> # Initializing a Bark sub-module style configuration
  76. >>> configuration = BarkSemanticConfig()
  77. >>> # Initializing a model (with random weights) from the suno/bark style configuration
  78. >>> model = BarkSemanticModel(configuration)
  79. >>> # Accessing the model configuration
  80. >>> configuration = model.config
  81. ```"""
  82. model_type = "semantic"
  83. base_config_key = "semantic_config"
  84. @auto_docstring(checkpoint="suno/bark")
  85. @strict
  86. class BarkCoarseConfig(BarkSubModelConfig):
  87. r"""
  88. block_size (`int`, *optional*, defaults to 1024):
  89. The maximum sequence length that this model might ever be used with. Typically set this to something large
  90. just in case (e.g., 512 or 1024 or 2048).
  91. input_vocab_size (`int`, *optional*, defaults to 10_048):
  92. Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the
  93. `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with
  94. regards to the chosen sub-model.
  95. output_vocab_size (`int`, *optional*, defaults to 10_048):
  96. Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented
  97. by the: `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought
  98. with regards to the chosen sub-model.
  99. bias (`bool`, *optional*, defaults to `True`):
  100. Whether or not to use bias in the linear layers and layer norm layers
  101. Example:
  102. ```python
  103. >>> from transformers import BarkCoarseConfig, BarkCoarseModel
  104. >>> # Initializing a Bark sub-module style configuration
  105. >>> configuration = BarkCoarseConfig()
  106. >>> # Initializing a model (with random weights) from the suno/bark style configuration
  107. >>> model = BarkCoarseModel(configuration)
  108. >>> # Accessing the model configuration
  109. >>> configuration = model.config
  110. ```"""
  111. model_type = "coarse_acoustics"
  112. base_config_key = "coarse_acoustics_config"
  113. @auto_docstring(checkpoint="suno/bark")
  114. @strict
  115. class BarkFineConfig(BarkSubModelConfig):
  116. r"""
  117. block_size (`int`, *optional*, defaults to 1024):
  118. The maximum sequence length that this model might ever be used with. Typically set this to something large
  119. just in case (e.g., 512 or 1024 or 2048).
  120. input_vocab_size (`int`, *optional*, defaults to 10_048):
  121. Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the
  122. `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with
  123. regards to the chosen sub-model.
  124. output_vocab_size (`int`, *optional*, defaults to 10_048):
  125. Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented
  126. by the: `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought
  127. with regards to the chosen sub-model.
  128. bias (`bool`, *optional*, defaults to `True`):
  129. Whether or not to use bias in the linear layers and layer norm layers
  130. n_codes_total (`int`, *optional*, defaults to 8):
  131. The total number of audio codebooks predicted. Used in the fine acoustics sub-model.
  132. n_codes_given (`int`, *optional*, defaults to 1):
  133. The number of audio codebooks predicted in the coarse acoustics sub-model. Used in the acoustics
  134. sub-models.
  135. Example:
  136. ```python
  137. >>> from transformers import BarkFineConfig, BarkFineModel
  138. >>> # Initializing a Bark sub-module style configuration
  139. >>> configuration = BarkFineConfig()
  140. >>> # Initializing a model (with random weights) from the suno/bark style configuration
  141. >>> model = BarkFineModel(configuration)
  142. >>> # Accessing the model configuration
  143. >>> configuration = model.config
  144. ```"""
  145. model_type = "fine_acoustics"
  146. base_config_key = "fine_acoustics_config"
  147. tie_word_embeddings: bool = True
  148. n_codes_total: int = 8
  149. n_codes_given: int = 1
  150. @auto_docstring(checkpoint="suno/bark")
  151. @strict
  152. class BarkConfig(PreTrainedConfig):
  153. r"""
  154. semantic_config ([`BarkSemanticConfig`], *optional*):
  155. Configuration of the underlying semantic sub-model.
  156. coarse_acoustics_config ([`BarkCoarseConfig`], *optional*):
  157. Configuration of the underlying coarse acoustics sub-model.
  158. fine_acoustics_config ([`BarkFineConfig`], *optional*):
  159. Configuration of the underlying fine acoustics sub-model.
  160. codec_config ([`AutoConfig`], *optional*):
  161. Configuration of the underlying codec sub-model.
  162. Example:
  163. ```python
  164. >>> from transformers import (
  165. ... BarkSemanticConfig,
  166. ... BarkCoarseConfig,
  167. ... BarkFineConfig,
  168. ... BarkModel,
  169. ... BarkConfig,
  170. ... AutoConfig,
  171. ... )
  172. >>> # Initializing Bark sub-modules configurations.
  173. >>> semantic_config = BarkSemanticConfig()
  174. >>> coarse_acoustics_config = BarkCoarseConfig()
  175. >>> fine_acoustics_config = BarkFineConfig()
  176. >>> codec_config = AutoConfig.from_pretrained("facebook/encodec_24khz")
  177. >>> # Initializing a Bark module style configuration
  178. >>> configuration = BarkConfig(
  179. ... semantic_config, coarse_acoustics_config, fine_acoustics_config, codec_config
  180. ... )
  181. >>> # Initializing a model (with random weights)
  182. >>> model = BarkModel(configuration)
  183. >>> # Accessing the model configuration
  184. >>> configuration = model.config
  185. ```
  186. """
  187. model_type = "bark"
  188. sub_configs = {
  189. "semantic_config": BarkSemanticConfig,
  190. "coarse_acoustics_config": BarkCoarseConfig,
  191. "fine_acoustics_config": BarkFineConfig,
  192. "codec_config": AutoConfig,
  193. }
  194. semantic_config: dict | PreTrainedConfig | None = None
  195. coarse_acoustics_config: dict | PreTrainedConfig | None = None
  196. fine_acoustics_config: dict | PreTrainedConfig | None = None
  197. codec_config: dict | PreTrainedConfig | None = None
  198. initializer_range: float = 0.02
  199. def __post_init__(self, **kwargs):
  200. if self.semantic_config is None:
  201. self.semantic_config = BarkSemanticConfig()
  202. logger.info("`semantic_config` is `None`. Initializing the `BarkSemanticConfig` with default values.")
  203. elif isinstance(self.semantic_config, dict):
  204. self.semantic_config = BarkSemanticConfig(**self.semantic_config)
  205. if self.coarse_acoustics_config is None:
  206. self.coarse_acoustics_config = BarkCoarseConfig()
  207. logger.info(
  208. "`coarse_acoustics_config` is `None`. Initializing the `BarkCoarseConfig` with default values."
  209. )
  210. elif isinstance(self.coarse_acoustics_config, dict):
  211. self.coarse_acoustics_config = BarkCoarseConfig(**self.coarse_acoustics_config)
  212. if self.fine_acoustics_config is None:
  213. self.fine_acoustics_config = BarkFineConfig()
  214. logger.info("`fine_acoustics_config` is `None`. Initializing the `BarkFineConfig` with default values.")
  215. elif isinstance(self.fine_acoustics_config, dict):
  216. self.fine_acoustics_config = BarkFineConfig(**self.fine_acoustics_config)
  217. if self.codec_config is None:
  218. self.codec_config = CONFIG_MAPPING["encodec"]()
  219. logger.info("`codec_config` is `None`. Initializing the `codec_config` with default values.")
  220. elif isinstance(self.codec_config, dict):
  221. codec_model_type = self.codec_config.get("model_type", "encodec")
  222. self.codec_config = CONFIG_MAPPING[codec_model_type](**self.codec_config)
  223. super().__post_init__(**kwargs)
  224. __all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"]