configuration_bloom.py 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Bloom configuration"""
  15. from huggingface_hub.dataclasses import strict
  16. from ...configuration_utils import PreTrainedConfig
  17. from ...utils import auto_docstring
@auto_docstring(checkpoint="bigscience/bloom")
@strict
class BloomConfig(PreTrainedConfig):
    r"""
    apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
        If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
    slow_but_exact (`bool`, *optional*, defaults to `False`):
        Experimental feature. Whether to use a slow but exact implementation of the attention mechanism. While
        merging the TP rank tensors, due to slicing operations the results may be slightly different between the
        model trained on Megatron and our model. Please refer to [this
        issue](https://github.com/pytorch/pytorch/issues/76232). A solution to obtain more accurate results is to
        enable this feature. Enabling this will hurt the computational time of the inference. This will probably
        be resolved in the future once the main model has been fine-tuned with TP_rank=1.

    Example:

    ```python
    >>> from transformers import BloomConfig, BloomModel

    >>> # Initializing a Bloom configuration
    >>> configuration = BloomConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = BloomModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # NOTE(review): only two fields are described in the docstring above; the remaining ones are presumably
    # documented by `auto_docstring` -- confirm before adding manual entries.

    model_type = "bloom"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Pairs each generic name (left) with the Bloom-specific field declared below (right). How the aliasing is
    # applied is defined by the base class, not here -- see `PreTrainedConfig`.
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
    }

    # Declared fields with their default values.
    vocab_size: int = 250880
    hidden_size: int = 64
    n_layer: int = 2
    n_head: int = 8
    layer_norm_epsilon: float = 1e-5
    initializer_range: float = 0.02
    use_cache: bool = True
    bos_token_id: int | None = 1
    eos_token_id: int | list[int] | None = 2
    pad_token_id: int | None = None
    apply_residual_connection_post_layernorm: bool = False
    hidden_dropout: float | int = 0.0
    attention_dropout: float | int = 0.0
    pretraining_tp: int = 1  # TP rank used when training with Megatron
    slow_but_exact: bool = False
    tie_word_embeddings: bool = True

    def __post_init__(self, **kwargs):
        """Apply the legacy `n_embed` override, then run the parent class's post-init hook.

        Args:
            **kwargs: Extra keyword arguments. `n_embed` is removed here if present; everything else is forwarded
                unchanged to `super().__post_init__`.
        """
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        # A non-None `n_embed` replaces `hidden_size`; otherwise `hidden_size` keeps its current value.
        self.hidden_size = self.hidden_size if n_embed is None else n_embed
        super().__post_init__(**kwargs)
# Public names of this module (what `from <module> import *` exports).
__all__ = ["BloomConfig"]