# Copyright 2025 Westlake Representational Learning Lab (Fajie Yuan Lab) team and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evolla model configuration"""

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring, logging


# Module-level logger; `EvollaConfig.__post_init__` uses it for its informational message.
logger = logging.get_logger(__name__)


# Configuration of the protein sequence encoder. `EvollaConfig` registers this class as the type of its
# `protein_encoder_config` sub-config and instantiates it with default values when none is supplied.
@auto_docstring(checkpoint="westlake-repl/Evolla-10B-hf")
@strict
class SaProtConfig(PreTrainedConfig):
    r"""
    mask_token_id (`int`, *optional*, defaults to 4):
        The id of the *mask* token in the protein sequence model.
    position_embedding_type (`str`, *optional*, defaults to `"rotary"`):
        The type of position embedding to use in the protein sequence model. Currently only `"rotary"` is supported.
    emb_layer_norm_before (`bool`, *optional*, defaults to `False`):
        Whether to apply layer normalization before the position embedding in the protein sequence model.
    token_dropout (`bool`, *optional*, defaults to `True`):
        Whether to apply dropout to the tokens in the protein sequence model.
    """

    # Vocabulary size and special-token ids.
    vocab_size: int = 446
    mask_token_id: int = 4
    pad_token_id: int = 1
    # Transformer dimensions.
    hidden_size: int = 1280
    num_hidden_layers: int = 33
    num_attention_heads: int = 20
    intermediate_size: int = 5120
    # Dropout probabilities; the annotations accept ints alongside floats.
    hidden_dropout_prob: float | int = 0.1
    attention_probs_dropout_prob: float | int = 0.1
    max_position_embeddings: int = 1026
    initializer_range: float = 0.02
    layer_norm_eps: float = 1e-05
    # Embedding options; each one is described in the class docstring.
    position_embedding_type: str = "rotary"
    emb_layer_norm_before: bool = False
    token_dropout: bool = True
    # Decoder and cross-attention switches, both off by default.
    is_decoder: bool = False
    add_cross_attention: bool = False


@auto_docstring(checkpoint="westlake-repl/Evolla-10B-hf")
@strict
class EvollaConfig(PreTrainedConfig):
    r"""
    protein_encoder_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`SaProtConfig`]. A config instance may be
        passed instead of a dictionary. When left as `None`, a [`SaProtConfig`] with default values is created.
    aligner_ffn_mult (`int`, *optional*, defaults to 4):
        The FFN multiplier for the aligner layer.
    aligner_enable_bias (`bool`, *optional*, defaults to `True`):
        Whether to use bias in the aligner layer.
    aligner_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention probabilities in the aligner layer.
    aligner_num_add_layers (`int`, *optional*, defaults to 8):
        The number of additional layers for the aligner layer.
    resampler_depth (`int`, *optional*, defaults to 6):
        The depth of the resampler layer in the llama model.
    resampler_dim_head (`int`, *optional*, defaults to 64):
        The dimension of the heads in the resampler layer in the llama model.
    resampler_heads (`int`, *optional*, defaults to 8):
        The number of heads in the resampler layer in the llama model.
    resampler_num_latents (`int`, *optional*, defaults to 64):
        The number of latents in the resampler layer in the llama model.
    resampler_ff_mult (`int`, *optional*, defaults to 4):
        The FFN multiplier for the resampler layer.

    Example:

    ```python
    >>> from transformers import EvollaModel, EvollaConfig

    >>> # Initializing an Evolla evolla-10b style configuration
    >>> configuration = EvollaConfig()

    >>> # Initializing a model from the evolla-10b style configuration
    >>> model = EvollaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "EvollaModel"
    # Declares which config class backs the nested `protein_encoder_config` attribute.
    sub_configs = {"protein_encoder_config": SaProtConfig}
    default_theta = 500000.0

    # Protein encoder settings; `__post_init__` converts `None` or a dict into a `SaProtConfig`.
    protein_encoder_config: dict | PreTrainedConfig | None = None
    vocab_size: int = 128256  # llama vocab size
    hidden_size: int = 4096  # llama hidden size
    intermediate_size: int = 14336  # llama intermediate size
    num_hidden_layers: int = 32  # llama num layers
    num_attention_heads: int = 32  # llama num heads
    num_key_value_heads: int | None = 8  # llama num key-value heads
    hidden_act: str = "silu"  # llama activation function
    max_position_embeddings: int = 8192  # llama rope max length
    rms_norm_eps: float = 1e-05
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    attention_dropout: float | int | None = 0.0
    mlp_bias: bool = False
    # Aligner options; each one is described in the class docstring.
    aligner_ffn_mult: int | None = 4
    aligner_enable_bias: bool | None = True
    # Ints are accepted as well (e.g. a plain `0` to disable dropout), matching `attention_dropout` above;
    # a bare `float` annotation would make strict type validation reject integer values.
    aligner_attention_probs_dropout_prob: float | int | None = 0.1
    aligner_num_add_layers: int | None = 8
    # Resampler options; each one is described in the class docstring.
    resampler_depth: int | None = 6
    resampler_dim_head: int | None = 64
    resampler_heads: int | None = 8
    resampler_num_latents: int | None = 64
    resampler_ff_mult: int | None = 4
    initializer_range: float = 0.02
    # Special-token ids.
    pad_token_id: int | None = None
    bos_token_id: int | None = 128000
    eos_token_id: int | list[int] | None = 128009
    use_cache: bool = False
    tie_word_embeddings: bool = False
    is_decoder: bool | None = False
    add_cross_attention: bool | None = False

    def __post_init__(self, **kwargs):
        """Turn `protein_encoder_config` into a config object, then run the parent hook.

        `None` is replaced by a default `SaProtConfig`, and a dictionary is expanded into
        `SaProtConfig` keyword arguments. Any other value is left untouched by this method.

        Args:
            **kwargs: Forwarded unchanged to `PreTrainedConfig.__post_init__`.
        """
        if self.protein_encoder_config is None:
            self.protein_encoder_config = SaProtConfig()
            logger.info("`protein_encoder_config` is `None`. Initializing the `SaProtConfig` with default values.")
        elif isinstance(self.protein_encoder_config, dict):
            self.protein_encoder_config = SaProtConfig(**self.protein_encoder_config)
        # The sub-config is resolved first so the parent hook runs on a fully built object.
        super().__post_init__(**kwargs)


# Only `EvollaConfig` is listed as public; `SaProtConfig` is not part of `__all__`.
__all__ = ["EvollaConfig"]