# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """LayoutLMv2 model configuration""" from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...utils import auto_docstring, is_detectron2_available # soft dependency if is_detectron2_available(): import detectron2 @auto_docstring(checkpoint="microsoft/layoutlmv2-base-uncased") @strict class LayoutLMv2Config(PreTrainedConfig): r""" max_2d_position_embeddings (`int`, *optional*, defaults to 1024): The maximum value that the 2D position embedding might ever be used with. Typically set this to something large just in case (e.g., 1024). max_rel_pos (`int`, *optional*, defaults to 128): The maximum number of relative positions to be used in the self-attention mechanism. rel_pos_bins (`int`, *optional*, defaults to 32): The number of relative position bins to be used in the self-attention mechanism. fast_qkv (`bool`, *optional*, defaults to `True`): Whether or not to use a single matrix for the queries, keys, values in the self-attention layers. max_rel_2d_pos (`int`, *optional*, defaults to 256): The maximum number of relative 2D positions in the self-attention mechanism. rel_2d_pos_bins (`int`, *optional*, defaults to 64): The number of 2D relative position bins in the self-attention mechanism. convert_sync_batchnorm (`bool`, *optional*, defaults to `True`): Whether or not to convert batch normalization layers to synchronized batch normalization layers. image_feature_pool_shape (`list[int]`, *optional*, defaults to `[7, 7, 256]`): The shape of the average-pooled feature map. coordinate_size (`int`, *optional*, defaults to 128): Dimension of the coordinate embeddings. shape_size (`int`, *optional*, defaults to 128): Dimension of the width and height embeddings. has_relative_attention_bias (`bool`, *optional*, defaults to `True`): Whether or not to use a relative attention bias in the self-attention mechanism. has_spatial_attention_bias (`bool`, *optional*, defaults to `True`): Whether or not to use a spatial attention bias in the self-attention mechanism. has_visual_segment_embedding (`bool`, *optional*, defaults to `False`): Whether or not to add visual segment embeddings. detectron2_config_args (`dict`, *optional*): Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to [this file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutlmv2/detectron2_config.py) for details regarding default values. Example: ```python >>> from transformers import LayoutLMv2Config, LayoutLMv2Model >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration >>> configuration = LayoutLMv2Config() >>> # Initializing a model (with random weights) from the microsoft/layoutlmv2-base-uncased style configuration >>> model = LayoutLMv2Model(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" model_type = "layoutlmv2" vocab_size: int = 30522 hidden_size: int = 768 num_hidden_layers: int = 12 num_attention_heads: int = 12 intermediate_size: int = 3072 hidden_act: str = "gelu" hidden_dropout_prob: float | int = 0.1 attention_probs_dropout_prob: float | int = 0.1 max_position_embeddings: int = 512 type_vocab_size: int = 2 initializer_range: float = 0.02 layer_norm_eps: float = 1e-12 pad_token_id: int | None = 0 max_2d_position_embeddings: int = 1024 max_rel_pos: int = 128 rel_pos_bins: int = 32 fast_qkv: bool = True max_rel_2d_pos: int = 256 rel_2d_pos_bins: int = 64 convert_sync_batchnorm: bool = True image_feature_pool_shape: list[int] | tuple[int, ...] = (7, 7, 256) coordinate_size: int = 128 shape_size: int = 128 has_relative_attention_bias: bool = True has_spatial_attention_bias: bool = True has_visual_segment_embedding: bool = False detectron2_config_args: dict | None = None def __post_init__(self, **kwargs): super().__post_init__(**kwargs) self.detectron2_config_args = ( self.detectron2_config_args if self.detectron2_config_args is not None else self.get_default_detectron2_config() ) @classmethod def get_default_detectron2_config(cls): return { "MODEL.MASK_ON": True, "MODEL.PIXEL_STD": [57.375, 57.120, 58.395], "MODEL.BACKBONE.NAME": "build_resnet_fpn_backbone", "MODEL.FPN.IN_FEATURES": ["res2", "res3", "res4", "res5"], "MODEL.ANCHOR_GENERATOR.SIZES": [[32], [64], [128], [256], [512]], "MODEL.RPN.IN_FEATURES": ["p2", "p3", "p4", "p5", "p6"], "MODEL.RPN.PRE_NMS_TOPK_TRAIN": 2000, "MODEL.RPN.PRE_NMS_TOPK_TEST": 1000, "MODEL.RPN.POST_NMS_TOPK_TRAIN": 1000, "MODEL.POST_NMS_TOPK_TEST": 1000, "MODEL.ROI_HEADS.NAME": "StandardROIHeads", "MODEL.ROI_HEADS.NUM_CLASSES": 5, "MODEL.ROI_HEADS.IN_FEATURES": ["p2", "p3", "p4", "p5"], "MODEL.ROI_BOX_HEAD.NAME": "FastRCNNConvFCHead", "MODEL.ROI_BOX_HEAD.NUM_FC": 2, "MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION": 14, "MODEL.ROI_MASK_HEAD.NAME": "MaskRCNNConvUpsampleHead", "MODEL.ROI_MASK_HEAD.NUM_CONV": 4, "MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION": 7, "MODEL.RESNETS.DEPTH": 101, "MODEL.RESNETS.SIZES": [[32], [64], [128], [256], [512]], "MODEL.RESNETS.ASPECT_RATIOS": [[0.5, 1.0, 2.0]], "MODEL.RESNETS.OUT_FEATURES": ["res2", "res3", "res4", "res5"], "MODEL.RESNETS.NUM_GROUPS": 32, "MODEL.RESNETS.WIDTH_PER_GROUP": 8, "MODEL.RESNETS.STRIDE_IN_1X1": False, } def get_detectron2_config(self): detectron2_config = detectron2.config.get_cfg() for k, v in self.detectron2_config_args.items(): attributes = k.split(".") to_set = detectron2_config for attribute in attributes[:-1]: to_set = getattr(to_set, attribute) setattr(to_set, attributes[-1], v) return detectron2_config __all__ = ["LayoutLMv2Config"]