configuration_lasr.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/lasr/modular_lasr.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_lasr.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # Copyright 2025 The HuggingFace Inc. team and Google LLC. All rights reserved.
  8. #
  9. # Licensed under the Apache License, Version 2.0 (the "License");
  10. # you may not use this file except in compliance with the License.
  11. # You may obtain a copy of the License at
  12. #
  13. # http://www.apache.org/licenses/LICENSE-2.0
  14. #
  15. # Unless required by applicable law or agreed to in writing, software
  16. # distributed under the License is distributed on an "AS IS" BASIS,
  17. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  18. # See the License for the specific language governing permissions and
  19. # limitations under the License.
  20. from huggingface_hub.dataclasses import strict
  21. from ...configuration_utils import PreTrainedConfig
  22. from ...utils import auto_docstring
  23. @auto_docstring(checkpoint="google/medasr")
  24. @strict
  25. class LasrEncoderConfig(PreTrainedConfig):
  26. r"""
  27. convolution_bias (`bool`, *optional*, defaults to `False`):
  28. Whether to use bias in convolutions of the conformer's convolution module.
  29. conv_kernel_size (`int`, *optional*, defaults to 32):
  30. The kernel size of the convolution layers in the Conformer block.
  31. subsampling_conv_channels (`int`, *optional*, defaults to 256):
  32. The number of channels in the subsampling convolution layers.
  33. subsampling_conv_kernel_size (`int`, *optional*, defaults to 5):
  34. The kernel size of the subsampling convolution layers.
  35. subsampling_conv_stride (`int`, *optional*, defaults to 2):
  36. The stride of the subsampling convolution layers.
  37. dropout_positions (`float`, *optional*, defaults to 0.0):
  38. The dropout ratio for the positions in the input sequence.
  39. feed_forward_residual_weights (`tuple[float, float]`, *optional*, defaults to `[1.5, 0.5]`):
  40. The residual weights for the feed forward layers.
  41. conv_residual_weights (`tuple[float, float]`, *optional*, defaults to `[2.0, 1.0]`):
  42. The residual weights for the convolution layers.
  43. batch_norm_momentum (`float`, *optional*, defaults to 0.01):
  44. The momentum for the batch normalization layers
  45. Example:
  46. ```python
  47. >>> from transformers import LasrEncoderModel, LasrEncoderConfig
  48. >>> # Initializing a `LasrEncoder` configuration
  49. >>> configuration = LasrEncoderConfig()
  50. >>> # Initializing a model from the configuration
  51. >>> model = LasrEncoderModel(configuration)
  52. >>> # Accessing the model configuration
  53. >>> configuration = model.config
  54. ```
  55. This configuration class is based on the LasrEncoder architecture from Google Health AI. You can find more details
  56. and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
  57. """
  58. model_type = "lasr_encoder"
  59. keys_to_ignore_at_inference = ["past_key_values"]
  60. hidden_size: int = 512
  61. num_hidden_layers: int = 17
  62. num_attention_heads: int = 8
  63. intermediate_size: int = 2048
  64. hidden_act: str = "silu"
  65. attention_bias: bool = False
  66. convolution_bias: bool = False
  67. conv_kernel_size: int = 32
  68. subsampling_conv_channels: int = 256
  69. num_mel_bins: int = 128
  70. subsampling_conv_kernel_size: int = 5
  71. subsampling_conv_stride: int = 2
  72. dropout: float | int = 0.1
  73. dropout_positions: float | int = 0.0
  74. layerdrop: float | int = 0.1
  75. activation_dropout: float | int = 0.1
  76. attention_dropout: float | int = 0.1
  77. max_position_embeddings: int = 10000
  78. initializer_range: float = 0.02
  79. layer_norm_eps: float = 1e-6
  80. feed_forward_residual_weights: list[float] | tuple[float, ...] = (1.5, 0.5)
  81. conv_residual_weights: list[float] | tuple[float, ...] = (2.0, 1.0)
  82. batch_norm_momentum: float = 0.01
  83. rope_parameters: dict | None = None
  84. def __post_init__(self, **kwargs):
  85. self.num_key_value_heads = self.num_attention_heads
  86. super().__post_init__(**kwargs)
  87. @auto_docstring(checkpoint="google/medasr")
  88. @strict
  89. class LasrCTCConfig(PreTrainedConfig):
  90. r"""
  91. ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
  92. Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
  93. instance of [`LasrForCTC`].
  94. ctc_zero_infinity (`bool`, *optional*, defaults to `True`):
  95. Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
  96. occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
  97. of [`LasrForCTC`].
  98. Example:
  99. ```python
  100. >>> from transformers import LasrForCTC, LasrCTCConfig
  101. >>> # Initializing a Lasr configuration
  102. >>> configuration = LasrCTCConfig()
  103. >>> # Initializing a model from the configuration
  104. >>> model = LasrForCTC(configuration)
  105. >>> # Accessing the model configuration
  106. >>> configuration = model.config
  107. ```
  108. This configuration class is based on the Lasr CTC architecture from Google Health AI. You can find more details
  109. and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
  110. """
  111. model_type = "lasr_ctc"
  112. sub_configs = {"encoder_config": LasrEncoderConfig}
  113. vocab_size: int = 512
  114. ctc_loss_reduction: str = "mean"
  115. ctc_zero_infinity: bool = True
  116. encoder_config: dict | PreTrainedConfig | None = None
  117. pad_token_id: int = 0
  118. def __post_init__(self, **kwargs):
  119. if isinstance(self.encoder_config, dict):
  120. self.encoder_config = LasrEncoderConfig(**self.encoder_config)
  121. elif self.encoder_config is None:
  122. self.encoder_config = LasrEncoderConfig()
  123. self.initializer_range = self.encoder_config.initializer_range
  124. super().__post_init__(**kwargs)
  125. @property
  126. def inputs_to_logits_ratio(self):
  127. return self.encoder_config.subsampling_conv_stride**2
  128. __all__ = ["LasrEncoderConfig", "LasrCTCConfig"]