| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- # Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Zamba model configuration"""
- import math
- from huggingface_hub.dataclasses import strict
- from ...configuration_utils import PreTrainedConfig
- from ...utils import auto_docstring
@auto_docstring(checkpoint="Zyphra/Zamba-7B-v1")
@strict
class ZambaConfig(PreTrainedConfig):
    r"""
    attention_hidden_size (`int`, *optional*):
        Dimension of the hidden representations of the inputs to the Attention layer. When left unset it is
        computed as `2 * hidden_size`.
    attention_head_dim (`int`, *optional*):
        Dimension of the attention head in the Transformer decoder. When left unset it is computed as
        `2 * hidden_size // num_attention_heads`.
    n_mamba_heads (`int`, *optional*, defaults to 2):
        Number of mamba heads for each mamba layer. `mamba_expand * hidden_size` must be divisible by this value.
    hidden_mamba_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the mamba layer.
    num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
        Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
        integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
        logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
        sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
        significantly.
    attn_layer_period (`int`, *optional*, defaults to 6):
        Once in this many layers, we will have a shared attention layer.
    attn_layer_offset (`int`, *optional*, defaults to 4):
        Offset of the shared attention layer.
    use_mamba_kernels (`bool`, *optional*, defaults to `True`):
        Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
        `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
        `True` and kernels are not available.
    mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
        Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
    """

    model_type = "zamba"
    keys_to_ignore_at_inference = ["past_key_values"]
    # NOTE(review): presumably lets the base class expose `layer_types` / `head_dim` as aliases of the
    # Zamba-specific attribute names on the right-hand side -- confirm against `PreTrainedConfig`.
    attribute_map = {"layer_types": "layers_block_type", "head_dim": "attention_head_dim"}

    vocab_size: int = 32000
    tie_word_embeddings: bool = True
    hidden_size: int = 3712
    # `None` means "derive from `hidden_size`"; resolved in `__post_init__`.
    attention_hidden_size: int | None = None
    intermediate_size: int = 14848
    num_hidden_layers: int = 76
    num_attention_heads: int = 16
    # `None` means "derive from `hidden_size` and `num_attention_heads`"; resolved in `__post_init__`.
    attention_head_dim: int | None = None
    num_key_value_heads: int = 16
    n_mamba_heads: int = 2
    hidden_act: str = "gelu"
    hidden_mamba_act: str = "silu"
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    # The class docstring documents `None` ("calculate all logits") as a valid value, so the declared type
    # has to admit it as well.
    num_logits_to_keep: int | None = 1
    pad_token_id: int | None = 0
    bos_token_id: int | None = 1
    eos_token_id: int | list[int] | None = 2
    max_position_embeddings: int = 4096
    attention_dropout: float | int = 0.0
    attn_layer_period: int = 6
    attn_layer_offset: int = 4
    use_mamba_kernels: bool = True
    mamba_d_state: int = 16
    mamba_d_conv: int = 4
    mamba_expand: int = 2
    # Either an explicit rank or the string "auto"; "auto" is replaced by an integer in `__post_init__`.
    mamba_dt_rank: str | int = "auto"
    time_step_min: float = 0.001
    time_step_max: float = 0.1
    time_step_floor: float = 1e-4
    mamba_conv_bias: bool = True
    mamba_proj_bias: bool = False

    def __post_init__(self, **kwargs):
        """Resolve derived defaults, then run the parent class' post-init hook.

        A falsy `attention_hidden_size` or `attention_head_dim` (`None`, but also `0`) is replaced by a value
        computed from `hidden_size`; `mamba_dt_rank="auto"` is replaced by `ceil(hidden_size / 16)`; and
        `layers_block_type` is rebuilt from `num_hidden_layers`, `attn_layer_period` and `attn_layer_offset`.

        Args:
            **kwargs: Forwarded unchanged to `super().__post_init__`.
        """
        self.attention_hidden_size = self.attention_hidden_size or 2 * self.hidden_size
        # Evaluated as `(2 * hidden_size) // num_attention_heads`.
        self.attention_head_dim = self.attention_head_dim or 2 * self.hidden_size // self.num_attention_heads
        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if self.mamba_dt_rank == "auto" else self.mamba_dt_rank
        # Always recomputed here, before the parent hook runs.
        self.layers_block_type = self._layers_block_type(
            self.num_hidden_layers, self.attn_layer_period, self.attn_layer_offset
        )
        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config.

        Raises:
            ValueError: If `mamba_expand * hidden_size` is not a multiple of `n_mamba_heads`.
        """
        if (self.mamba_expand * self.hidden_size) % self.n_mamba_heads != 0:
            # Name the quantity that is actually checked: the config's own `intermediate_size` field is a
            # different setting and is not involved in this constraint.
            raise ValueError("`mamba_expand * hidden_size` should be divisible by `n_mamba_heads`.")

    def _layers_block_type(self, num_hidden_layers, attn_layer_period, attn_layer_offset):
        """Build the per-layer block type list (`"mamba"` or `"hybrid"`).

        The first three entries are fixed to `["mamba", "mamba", "hybrid"]`. Each remaining entry is
        `"hybrid"` when its index `i` -- counted from 0 *after* that fixed prefix -- satisfies
        `i % attn_layer_period == attn_layer_offset`, and `"mamba"` otherwise.

        Args:
            num_hidden_layers: Total number of layers; `num_hidden_layers - 3` entries follow the fixed prefix.
            attn_layer_period: Modulus used to place the `"hybrid"` entries.
            attn_layer_offset: Remainder that selects the `"hybrid"` entries within each period.

        Returns:
            A list of strings. The three-entry prefix is always present, so the list has at least three
            elements even when `num_hidden_layers` is smaller than 3.
        """
        layers = [
            "mamba",
            "mamba",
            "hybrid",
        ] + ["hybrid" if i % attn_layer_period == attn_layer_offset else "mamba" for i in range(num_hidden_layers - 3)]
        return layers
- __all__ = ["ZambaConfig"]  # the configuration class is the only name exported by this module
|