configuration_csm.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
# Copyright 2025 Sesame and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. from huggingface_hub.dataclasses import strict
  15. from ...configuration_utils import PreTrainedConfig
  16. from ...modeling_rope_utils import RopeParameters
  17. from ...utils import auto_docstring, logging
  18. from ..auto.configuration_auto import AutoConfig
  19. logger = logging.get_logger(__name__)
  20. @auto_docstring(checkpoint="sesame/csm-1b")
  21. @strict
  22. class CsmDepthDecoderConfig(PreTrainedConfig):
  23. r"""
  24. backbone_hidden_size (`int`, *optional*, defaults to 2048):
  25. Dimension of the hidden representations of the backbone model used with this depth decoder.
  26. Example:
  27. ```python
  28. >>> from transformers import CsmDepthDecoder, CsmDepthDecoderConfig
  29. >>> # Initializing a CsmDepthDecoder
  30. >>> configuration = CsmDepthDecoderConfig()
  31. >>> model = CsmDepthDecoderModel(configuration)
  32. >>> # Accessing the model configuration
  33. >>> configuration = model.config
  34. ```"""
  35. model_type = "csm_depth_decoder_model"
  36. base_config_key = "depth_decoder_config"
  37. keys_to_ignore_at_inference = ["past_key_values"]
  38. attribute_map = {
  39. "codebook_size": "vocab_size",
  40. }
  41. default_theta = 500000.0
  42. num_codebooks: int | None = 32
  43. backbone_hidden_size: int = 2048
  44. vocab_size: int = 2051
  45. hidden_size: int = 1024
  46. intermediate_size: int = 8192
  47. num_hidden_layers: int = 4
  48. num_attention_heads: int = 8
  49. num_key_value_heads: int | None = 2
  50. hidden_act: str = "silu"
  51. max_position_embeddings: int = 33
  52. initializer_range: float = 0.02
  53. rms_norm_eps: float = 1e-5
  54. use_cache: bool = True
  55. pad_token_id: int | None = None
  56. bos_token_id: int | None = None
  57. eos_token_id: int | list[int] | None = None
  58. rope_parameters: RopeParameters | dict | None = None
  59. attention_bias: bool = False
  60. attention_dropout: float | int | None = 0.0
  61. mlp_bias: bool = False
  62. head_dim: int | None = None
  63. def __post_init__(self, **kwargs):
  64. if kwargs.pop("tie_word_embeddings", False):
  65. raise ValueError("`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig")
  66. # for backward compatibility
  67. if self.num_key_value_heads is None:
  68. self.num_key_value_heads = self.num_attention_heads
  69. self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads
  70. super().__post_init__(**kwargs)
  71. @auto_docstring(checkpoint="sesame/csm-1b")
  72. @strict
  73. class CsmConfig(PreTrainedConfig):
  74. r"""
  75. codebook_pad_token_id (`int`, *optional*, defaults to 2050):
  76. Padding token id for codebook tokens.
  77. codebook_eos_token_id (`int`, *optional*, defaults to 0):
  78. End of stream token id for codebook tokens.
  79. audio_token_id (`int`, *optional*, defaults to 128002):
  80. Audio token id in the text input.
  81. audio_eos_token_id (`int`, *optional*, defaults to 128003):
  82. End of stream token id for audio in the text input.
  83. tie_codebooks_embeddings (`bool`, *optional*, defaults to `True`):
  84. Whether to tie the codebook tokens embeddings of the backbone model to the codebook tokens embeddings of the depth decoder.
  85. depth_decoder_config (`CsmDepthDecoderConfig`, *optional*):
  86. Configuration for the depth decoder.
  87. codec_config (`PreTrainedConfig`, *optional*):
  88. Configuration for the codec.
  89. ```python
  90. >>> from transformers import CsmForConditionalGeneration, CsmConfig
  91. >>> # Initializing a CsmConfig
  92. >>> configuration = CsmConfig()
  93. >>> # Initializing a model
  94. >>> model = CsmForConditionalGeneration(configuration)
  95. >>> # Accessing the model configuration
  96. >>> configuration = model.config
  97. ```
  98. """
  99. model_type = "csm"
  100. base_config_key = "csm_config"
  101. keys_to_ignore_at_inference = ["past_key_values"]
  102. default_theta = 500000.0
  103. sub_configs = {
  104. "codec_config": AutoConfig,
  105. "depth_decoder_config": CsmDepthDecoderConfig,
  106. }
  107. attribute_map = {
  108. "codebook_size": "vocab_size",
  109. }
  110. num_codebooks: int | None = 32
  111. vocab_size: int = 2051
  112. text_vocab_size: int = 128256
  113. hidden_size: int = 2048
  114. intermediate_size: int = 8192
  115. num_hidden_layers: int = 16
  116. num_attention_heads: int = 32
  117. num_key_value_heads: int | None = 8
  118. hidden_act: str = "silu"
  119. max_position_embeddings: int = 2048
  120. initializer_range: float = 0.02
  121. rms_norm_eps: float = 1e-5
  122. use_cache: bool = True
  123. pad_token_id: int | None = 128002
  124. codebook_pad_token_id: int | None = 2050
  125. codebook_eos_token_id: int | list[int] | None = 0
  126. bos_token_id: int | None = 128000
  127. eos_token_id: int | list[int] | None = None
  128. audio_token_id: int | None = 128002
  129. audio_eos_token_id: int | list[int] | None = 128003
  130. rope_parameters: RopeParameters | dict | None = None
  131. attention_bias: bool = False
  132. attention_dropout: float | int | None = 0.0
  133. mlp_bias: bool = False
  134. head_dim: int | None = None
  135. tie_codebooks_embeddings: bool | None = True
  136. depth_decoder_config: dict | PreTrainedConfig | None = None
  137. codec_config: dict | PreTrainedConfig | None = None
  138. def __post_init__(self, **kwargs):
  139. if kwargs.pop("tie_word_embeddings", False):
  140. raise ValueError("`tie_word_embeddings=True` is not supported for CsmConfig")
  141. if self.depth_decoder_config is None:
  142. self.depth_decoder_config = CsmDepthDecoderConfig()
  143. logger.info("depth_decoder_config is None, using default depth decoder config.")
  144. elif isinstance(self.depth_decoder_config, dict):
  145. self.depth_decoder_config = CsmDepthDecoderConfig(**self.depth_decoder_config)
  146. if self.codec_config is None:
  147. self.codec_config = AutoConfig.for_model("mimi")
  148. logger.info("codec_config is None, using default audio encoder config.")
  149. elif isinstance(self.codec_config, dict):
  150. self.codec_config = AutoConfig.for_model(**self.codec_config)
  151. if self.num_key_value_heads is None:
  152. self.num_key_value_heads = self.num_attention_heads
  153. self.head_dim = self.head_dim if self.head_dim is not None else self.hidden_size // self.num_attention_heads
  154. self.tie_word_embeddings = False
  155. super().__post_init__(**kwargs)
  156. __all__ = [
  157. "CsmDepthDecoderConfig",
  158. "CsmConfig",
  159. ]