# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Bloom configuration"""

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring


@auto_docstring(checkpoint="bigscience/bloom")
@strict
class BloomConfig(PreTrainedConfig):
    r"""
    apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
        If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
    slow_but_exact (`bool`, *optional*, defaults to `False`):
        Experimental feature. Whether to use slow but exact implementation of the attention mechanism. While
        merging the TP rank tensors, due to slicing operations the results may be slightly different between the
        model trained on Megatron and our model. Please refer to [this
        issue](https://github.com/pytorch/pytorch/issues/76232). A solution to obtain more accurate results is to
        enable this feature. Enabling this will hurt the computational time of the inference. Will be probably
        resolved in the future once the main model has been fine-tuned with TP_rank=1.

    Example:

    ```python
    >>> from transformers import BloomConfig, BloomModel

    >>> # Initializing a Bloom configuration
    >>> configuration = BloomConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = BloomModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "bloom"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
    }

    vocab_size: int = 250880
    hidden_size: int = 64
    n_layer: int = 2
    n_head: int = 8
    layer_norm_epsilon: float = 1e-5
    initializer_range: float = 0.02
    use_cache: bool = True
    bos_token_id: int | None = 1
    eos_token_id: int | list[int] | None = 2
    pad_token_id: int | None = None
    apply_residual_connection_post_layernorm: bool = False
    hidden_dropout: float | int = 0.0
    attention_dropout: float | int = 0.0
    pretraining_tp: int = 1  # TP rank used when training with megatro
    slow_but_exact: bool = False
    tie_word_embeddings: bool = True

    def __post_init__(self, **kwargs):
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = self.hidden_size if n_embed is None else n_embed
        super().__post_init__(**kwargs)


__all__ = ["BloomConfig"]