# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FSMT configuration"""

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring


@auto_docstring(checkpoint="facebook/wmt19-en-ru")
@strict
class FSMTConfig(PreTrainedConfig):
    r"""
    langs (`list[str]`):
        A list with source language and target language (e.g., ['en', 'ru']).
    src_vocab_size (`int`):
        Vocabulary size of the encoder. Defines the number of different tokens that can be represented by the
        `input_ids` passed to the forward method in the encoder.
    tgt_vocab_size (`int`):
        Vocabulary size of the decoder. Defines the number of different tokens that can be represented by the
        `input_ids` passed to the forward method in the decoder.
    max_length (`int`, *optional*, defaults to 200):
        Maximum length to generate.
    num_beams (`int`, *optional*, defaults to 5):
        Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
        no beam search.
    length_penalty (`float`, *optional*, defaults to 1):
        Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
        the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
        likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
        `length_penalty` < 0.0 encourages shorter sequences.
    early_stopping (`bool`, *optional*, defaults to `False`):
        Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
        when at least `num_beams` sentences are finished per batch or not.

    Examples:

    ```python
    >>> from transformers import FSMTConfig, FSMTModel

    >>> # Initializing a FSMT facebook/wmt19-en-ru style configuration
    >>> config = FSMTConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = FSMTModel(config)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "fsmt"

    # Maps generic attribute names (keys) onto the FSMT-specific fields declared
    # below (values). Note that the generic `vocab_size` points at the *target*
    # vocabulary and `num_hidden_layers` at the *encoder* depth.
    # Presumably consumed by `PreTrainedConfig` to alias attribute access —
    # confirm against the base class.
    attribute_map = {
        "num_attention_heads": "encoder_attention_heads",
        "hidden_size": "d_model",
        "vocab_size": "tgt_vocab_size",
        "num_hidden_layers": "encoder_layers",
    }

    # --- Languages and vocabularies ---
    # NOTE(review): the default pair is en-de, while the decorator checkpoint and
    # the docstring example refer to wmt19-en-ru — confirm this is intended.
    langs: list[str] | tuple[str, ...] = ("en", "de")
    src_vocab_size: int = 42024
    tgt_vocab_size: int = 42024

    # --- Shared model dimensions ---
    activation_function: str = "relu"
    d_model: int = 1024
    max_length: int = 200
    max_position_embeddings: int = 1024

    # --- Encoder stack ---
    encoder_ffn_dim: int = 4096
    encoder_layers: int = 12
    encoder_attention_heads: int = 16
    encoder_layerdrop: float | int = 0.0

    # --- Decoder stack ---
    decoder_ffn_dim: int = 4096
    decoder_layers: int = 12
    decoder_attention_heads: int = 16
    decoder_layerdrop: float | int = 0.0

    # --- Dropout rates and initialization ---
    # The dropout/layerdrop fields are annotated `float | int`, so integer
    # values are accepted by the annotation as well as floats.
    attention_dropout: float | int = 0.0
    dropout: float | int = 0.1
    activation_dropout: float | int = 0.0
    init_std: float = 0.02

    # --- Architecture flags ---
    decoder_start_token_id: int | None = 2
    is_encoder_decoder: bool = True
    scale_embedding: bool = True
    tie_word_embeddings: bool = False

    # --- Generation defaults (see the class docstring) ---
    num_beams: int = 5
    # NOTE(review): annotated `float` only, unlike the `float | int` dropout
    # fields above — confirm whether integer values should also be accepted
    # under `@strict`.
    length_penalty: float = 1.0
    early_stopping: bool = False
    use_cache: bool = True

    # --- Special token ids ---
    pad_token_id: int | None = 1
    bos_token_id: int | None = 0
    eos_token_id: int | list[int] | None = 2
    forced_eos_token_id: int | list[int] | None = 2

    def __post_init__(self, **kwargs):
        """Discard the unused `decoder` kwarg, then delegate to the parent `__post_init__`.

        Args:
            **kwargs: Extra keyword arguments; everything except `decoder` is
                forwarded unchanged to `super().__post_init__`.
        """
        # `decoder` is not used by this class; pop it (if present) so it is not
        # forwarded to the parent hook.
        kwargs.pop("decoder", None)  # delete unused kwargs
        super().__post_init__(**kwargs)


__all__ = ["FSMTConfig"]