configuration_xcodec.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Xcodec model configuration"""
  15. import math
  16. import numpy as np
  17. from huggingface_hub.dataclasses import strict
  18. from ...configuration_utils import PreTrainedConfig
  19. from ...utils import auto_docstring
  20. from ..auto import CONFIG_MAPPING, AutoConfig
  21. @auto_docstring(checkpoint="Manel/X-Codec")
  22. @strict
  23. class XcodecConfig(PreTrainedConfig):
  24. r"""
  25. target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2, 4]`):
  26. The range of different bandwidths (in kbps) the model can encode audio with.
  27. channel_ratios (`List[float]`, *optional*, defaults to `[1, 1]`):
  28. Expansion factors for the number of output channels in each semantic block.
  29. strides (`List[int]`, *optional*, defaults to `[1, 1]`):
  30. Strides for each semantic encoder block.
  31. block_dilations (`List[int]`, *optional*, defaults to `[1, 1]`):
  32. Dilation factors for the residual units in semantic blocks.
  33. unit_kernel_size (`int`, *optional*, defaults to 3):
  34. Kernel size inside each ResidualUnit in semantic blocks.
  35. acoustic_model_config (`Union[Dict, DacConfig]`, *optional*):
  36. An instance of the configuration for the acoustic (DAC) model.
  37. semantic_model_config (`Union[Dict, HubertConfig, WavLMConfig]`, *optional*):
  38. An instance of the configuration object for the semantic (HuBERT) model.
  39. Example:
  40. ```python
  41. >>> from transformers import XcodecModel, XcodecConfig
  42. >>> # Initializing configuration
  43. >>> configuration = XcodecConfig()
  44. >>> # Initializing a model (with random weights) from the configuration
  45. >>> model = XcodecModel(configuration)
  46. >>> # Accessing the model configuration
  47. >>> configuration = model.config
  48. ```"""
  49. model_type = "xcodec"
  50. sub_configs = {
  51. "acoustic_model_config": AutoConfig,
  52. "semantic_model_config": AutoConfig,
  53. }
  54. _default_acoustic_model_config_kwargs = {
  55. "encoder_hidden_size": 64,
  56. # NOTE: original DAC uses [2, 4, 8, 8] `downsampling ratios`, namely reverse of `upsampling_ratios`
  57. # (not sure if intentional by Xcodec but we keep it)
  58. "downsampling_ratios": [8, 5, 4, 2],
  59. "decoder_hidden_size": 1024,
  60. "upsampling_ratios": [8, 5, 4, 2],
  61. "hidden_size": 256,
  62. }
  63. _default_semantic_model_config_kwargs = {}
  64. target_bandwidths: list[int | float] | tuple[int | float, ...] = (0.5, 1, 1.5, 2, 4)
  65. sample_rate: int = 16000
  66. kernel_size: int = 3
  67. channel_ratios: list[int] | tuple[int, ...] = (1, 1)
  68. strides: list[int] | tuple[int, ...] = (1, 1)
  69. block_dilations: list[int] | tuple[int, ...] = (1, 1)
  70. unit_kernel_size: int = 3
  71. codebook_size: int = 1024
  72. codebook_dim: int | None = None
  73. initializer_range: float = 0.02
  74. acoustic_model_config: dict | PreTrainedConfig | None = None
  75. semantic_model_config: dict | PreTrainedConfig | None = None
  76. def __post_init__(self, **kwargs):
  77. if self.acoustic_model_config is None:
  78. self.acoustic_model_config = CONFIG_MAPPING["dac"](
  79. encoder_hidden_size=64,
  80. # NOTE: original DAC uses [2, 4, 8, 8] `downsampling ratios`, namely reverse of `upsampling_ratios`
  81. # (not sure if intentional by Xcodec but we keep it)
  82. downsampling_ratios=[8, 5, 4, 2],
  83. decoder_hidden_size=1024,
  84. upsampling_ratios=[8, 5, 4, 2],
  85. hidden_size=256,
  86. )
  87. elif isinstance(self.acoustic_model_config, dict):
  88. self.acoustic_model_config["model_type"] = self.acoustic_model_config.get("model_type", "dac")
  89. self.acoustic_model_config = CONFIG_MAPPING[self.acoustic_model_config["model_type"]](
  90. **{**self._default_acoustic_model_config_kwargs, **self.acoustic_model_config}
  91. )
  92. if self.semantic_model_config is None:
  93. self.semantic_model_config = CONFIG_MAPPING["hubert"]()
  94. elif isinstance(self.semantic_model_config, dict):
  95. self.semantic_model_config["model_type"] = self.semantic_model_config.get("model_type", "hubert")
  96. self.semantic_model_config = CONFIG_MAPPING[self.semantic_model_config["model_type"]](
  97. **{**self._default_semantic_model_config_kwargs, **self.semantic_model_config}
  98. )
  99. if self.codebook_dim is None:
  100. self.codebook_dim = self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size
  101. super().__post_init__(**kwargs)
  102. @property
  103. def frame_rate(self) -> int:
  104. return math.ceil(self.sample_rate / self.hop_length)
  105. @property
  106. def semantic_hidden_size(self) -> int:
  107. return self.semantic_model_config.hidden_size
  108. @property
  109. def hop_length(self) -> int:
  110. return int(np.prod(self.acoustic_model_config.downsampling_ratios))
  111. @property
  112. def codebook_nbits(self) -> int:
  113. return math.ceil(math.log2(self.codebook_size))
  114. @property
  115. def hidden_size(self) -> int:
  116. return self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size
  117. @property
  118. def num_quantizers(self) -> int:
  119. return int(1000 * self.target_bandwidths[-1] // (self.frame_rate * self.codebook_nbits))
  120. __all__ = ["XcodecConfig"]