#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/uvdoc/modular_uvdoc.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_uvdoc.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch import Tensor

from ...activations import ACT2FN
from ...backbone_utils import BackboneMixin, filter_output_hidden_states
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_uvdoc import UVDocBackboneConfig, UVDocConfig


class UVDocConvLayer(nn.Module):
    """Convolutional layer with batch normalization and activation.

    The forward pass applies `nn.Conv2d`, then `nn.BatchNorm2d`, then the
    activation, in that order.

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels of the convolution (and of the
            batch normalization).
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
        padding_mode: Padding mode forwarded to `nn.Conv2d`.
        bias: Whether the convolution has a bias term.
        dilation: Convolution dilation.
        activation: Key into `ACT2FN`, or `None` to use `nn.Identity` instead.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: int = 0,
        padding_mode: str = "zeros",
        bias: bool = False,
        dilation: int = 1,
        activation: str | None = "relu",
    ):
        super().__init__()
        self.convolution = nn.Conv2d(
            in_channels,
            out_channels,
            bias=bias,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            padding_mode=padding_mode,
            dilation=dilation,
        )
        self.normalization = nn.BatchNorm2d(out_channels)
        self.activation = ACT2FN[activation] if activation is not None else nn.Identity()

    def forward(self, input: Tensor) -> Tensor:
        """Run convolution, batch normalization and activation on `input`."""
        hidden_state = self.convolution(input)
        hidden_state = self.normalization(hidden_state)
        hidden_state = self.activation(hidden_state)
        return hidden_state


class UVDocResidualBlock(nn.Module):
    """Base residual block with dilation support.

    Two `UVDocConvLayer`s (`conv_start`, `conv_final`) form the main path. The
    shortcut path is `conv_down` when `downsample` is set, otherwise the
    identity. The two paths are summed and passed through `act_fn`.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Kernel size shared by all convolutions of the block.
        stride: Stride of `conv_start` and of the shortcut convolution.
        padding: Padding of `conv_start` and `conv_final`.
        dilation: Dilation of `conv_start` and `conv_final`.
        downsample: Whether the shortcut path is a convolution instead of the
            identity.
        activation: Key into `ACT2FN` for the final activation, or `None` to
            use `nn.Identity`.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        downsample: bool = False,
        activation: str | None = "relu",
    ):
        super().__init__()
        # Shortcut path: undilated, no activation, padded with `kernel_size // 2`
        # (independent of the `padding`/`dilation` used on the main path).
        self.conv_down = (
            UVDocConvLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=kernel_size // 2,
                bias=True,
                activation=None,
            )
            if downsample
            else nn.Identity()
        )
        self.conv_start = UVDocConvLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=True,
        )
        # No activation here: it is applied after the residual sum in `forward`.
        self.conv_final = UVDocConvLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
            bias=True,
            dilation=dilation,
            activation=None,
        )
        self.act_fn = ACT2FN[activation] if activation is not None else nn.Identity()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `act_fn(conv_final(conv_start(x)) + conv_down(x))`."""
        residual = self.conv_down(hidden_states)
        hidden_states = self.conv_start(hidden_states)
        hidden_states = self.conv_final(hidden_states)
        hidden_states = hidden_states + residual
        hidden_states = self.act_fn(hidden_states)
        return hidden_states


class UVDocResNetStage(nn.Module):
    """A ResNet stage containing multiple residual blocks.

    Args:
        config: Configuration providing `resnet_configs` and `kernel_size`.
        stage_index: Index into `config.resnet_configs`. The selected entry is
            iterated as `(in_channels, out_channels, dilation, downsample)`
            tuples, one per residual block.
    """

    def __init__(self, config, stage_index):
        super().__init__()
        stages = config.resnet_configs[stage_index]
        self.layers = nn.ModuleList([])
        for in_channels, out_channels, dilation, downsample in stages:
            self.layers.append(
                UVDocResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # Downsampling blocks halve the resolution through stride 2.
                    stride=2 if downsample else 1,
                    # NOTE(review): `dilation * 2` keeps the spatial size of a
                    # stride-1 convolution only when kernel_size == 5 -- confirm
                    # against the values of `config.kernel_size` in use.
                    padding=dilation * 2,
                    dilation=dilation,
                    downsample=downsample,
                    kernel_size=config.kernel_size,
                )
            )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the residual blocks of the stage sequentially."""
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states


class UVDocResNet(nn.Module):
    """ResNet trunk made of `resnet_head` followed by `resnet_down`.

    `resnet_head` holds one stride-2 `UVDocConvLayer` per entry of
    `config.resnet_head` (each entry is read as `[in_channels, out_channels]`).
    `resnet_down` holds one `UVDocResNetStage` per entry of
    `config.resnet_configs`.
    """

    def __init__(self, config):
        super().__init__()
        self.resnet_head = nn.ModuleList([])
        for head_config in config.resnet_head:
            self.resnet_head.append(
                UVDocConvLayer(
                    in_channels=head_config[0],
                    out_channels=head_config[1],
                    kernel_size=config.kernel_size,
                    stride=2,
                    padding=config.kernel_size // 2,
                )
            )
        self.resnet_down = nn.ModuleList([])
        for stage_index in range(len(config.resnet_configs)):
            stage = UVDocResNetStage(config, stage_index)
            self.resnet_down.append(stage)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply all head layers, then all down stages, sequentially."""
        for head in self.resnet_head:
            hidden_states = head(hidden_states)
        for stage in self.resnet_down:
            hidden_states = stage(hidden_states)
        return hidden_states


class UVDocBridgeBlock(GradientCheckpointingLayer):
    """Bridge module with dilated convolutions for long-range dependencies.

    Args:
        config: Configuration providing `stage_configs`.
        bridge_index: Index into `config.stage_configs`. The selected entry is
            iterated as `(in_channels, dilation)` pairs, one per conv layer.
    """

    def __init__(self, config, bridge_index):
        super().__init__()
        self.blocks = nn.ModuleList([])
        bridge = config.stage_configs[bridge_index]
        for in_channels, dilation in bridge:
            # Channel count is unchanged; with the default 3x3 kernel,
            # `padding == dilation` keeps the spatial size.
            self.blocks.append(UVDocConvLayer(in_channels, in_channels, padding=dilation, dilation=dilation))

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Apply the conv layers of the block sequentially (`kwargs` are unused)."""
        for block in self.blocks:
            hidden_states = block(hidden_states)
        return hidden_states


class UVDocPointPositions2D(nn.Module):
    """Module for predicting 2D point positions for document rectification.

    A `UVDocConvLayer` (`conv_down`) followed by a plain `nn.Conv2d`
    (`conv_up`, no normalization or activation). Channel sizes come from
    `config.out_point_positions2D[0]` and `config.out_point_positions2D[1]`,
    each read as `[in_channels, out_channels]`.
    """

    def __init__(self, config):
        super().__init__()
        self.conv_down = UVDocConvLayer(
            in_channels=config.out_point_positions2D[0][0],
            out_channels=config.out_point_positions2D[0][1],
            kernel_size=config.kernel_size,
            stride=1,
            padding=config.kernel_size // 2,
            padding_mode=config.padding_mode,
            activation=config.hidden_act,
        )
        self.conv_up = nn.Conv2d(
            in_channels=config.out_point_positions2D[1][0],
            out_channels=config.out_point_positions2D[1][1],
            kernel_size=config.kernel_size,
            stride=1,
            padding=config.kernel_size // 2,
            padding_mode=config.padding_mode,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply `conv_down` then `conv_up`."""
        hidden_states = self.conv_down(hidden_states)
        hidden_states = self.conv_up(hidden_states)
        return hidden_states


@auto_docstring
class UVDocPreTrainedModel(PreTrainedModel):
    """
    Base class for all UVDoc pre-trained models. Handles model initialization,
    configuration, and loading of pre-trained weights, following the Transformers library conventions.
    """

    config: UVDocConfig
    base_model_prefix = "uvdoc"
    main_input_name = "pixel_values"
    input_modalities = ("image",)
    _can_compile_fullgraph = True
    supports_gradient_checkpointing = True
    # Module class whose outputs are recorded under the "hidden_states" key.
    _can_record_outputs = {
        "hidden_states": UVDocBridgeBlock,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights."""
        super()._init_weights(module)
        # PReLU modules are additionally reset to their own default parameters.
        if isinstance(module, nn.PReLU):
            module.reset_parameters()


class UVDocBridge(UVDocPreTrainedModel):
    """Collection of `UVDocBridgeBlock`s, one per entry of `config.stage_configs`."""

    def __init__(self, config):
        super().__init__(config)
        self.bridge = nn.ModuleList([])
        for bridge_index in range(len(config.stage_configs)):
            self.bridge.append(UVDocBridgeBlock(config, bridge_index))

        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithNoAttention:
        """Run every bridge block on the same `hidden_states`.

        The blocks are not chained: each one receives the original input, and
        `last_hidden_state` is the output of the last block only.
        """
        # NOTE(review): the outputs of the other blocks are presumably collected
        # by `capture_outputs` through `_can_record_outputs` -- confirm.
        for layer in self.bridge:
            feature = layer(hidden_states)

        return BaseModelOutputWithNoAttention(last_hidden_state=feature)


@auto_docstring(
    custom_intro="""
    UVDoc backbone model for feature extraction.
    """
)
class UVDocBackbone(BackboneMixin, UVDocPreTrainedModel):
    has_attentions = False
    base_model_prefix = "backbone"

    def __init__(self, config: UVDocBackboneConfig):
        super().__init__(config)
        num_features = [config.resnet_head[-1][-1]]
        # NOTE(review): `stage[0][1]` is the element that `UVDocBridgeBlock`
        # unpacks as `dilation` (its pairs are `(in_channels, dilation)`);
        # confirm this is the intended per-stage feature count.
        for stage in config.stage_configs:
            num_features.append(stage[0][1])
        self.num_features = num_features

        self.resnet = UVDocResNet(config)
        self.bridge = UVDocBridge(config)

        self.post_init()

    @can_return_tuple
    @filter_output_hidden_states
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BackboneOutput:
        kwargs["output_hidden_states"] = True  # required to extract layers for the stages

        hidden_states = self.resnet(pixel_values)
        outputs = self.bridge(hidden_states, **kwargs)

        # Keep only the hidden states whose stage name was requested.
        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (outputs.hidden_states[idx],)

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states,
        )


class UVDocHead(nn.Module):
    """Head applying a 1x1 `bridge_connector` conv layer, then `UVDocPointPositions2D`.

    The connector expects `config.bridge_connector[0]` channels per bridge
    block, multiplied by the number of entries in
    `config.backbone_config.stage_configs`.
    """

    def __init__(self, config):
        super().__init__()
        self.num_bridge_layers = len(config.backbone_config.stage_configs)
        self.bridge_connector = UVDocConvLayer(
            in_channels=config.bridge_connector[0] * self.num_bridge_layers,
            out_channels=config.bridge_connector[1],
            kernel_size=1,
            stride=1,
            padding=0,
            dilation=1,
        )
        self.out_point_positions2D = UVDocPointPositions2D(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Apply `bridge_connector` then `out_point_positions2D` (`kwargs` are unused)."""
        hidden_states = self.bridge_connector(hidden_states)
        hidden_states = self.out_point_positions2D(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro=r"""
    The model takes raw document images (pixel values) as input, processes them through the UVDoc backbone to predict
    spatial transformation parameters, and outputs the rectified (corrected) document image tensor.
    """
)
class UVDocModel(UVDocPreTrainedModel):
    def __init__(self, config: UVDocConfig):
        super().__init__(config)
        self.backbone = UVDocBackbone(config.backbone_config)
        self.head = UVDocHead(config)
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor] | BaseModelOutputWithNoAttention:
        backbone_outputs = self.backbone(pixel_values, **kwargs)
        # Fuse the selected backbone feature maps along dim 1 before the head.
        fused_outputs = torch.cat(backbone_outputs.feature_maps, dim=1)
        last_hidden_state = self.head(fused_outputs, **kwargs)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=last_hidden_state,
            hidden_states=backbone_outputs.hidden_states,
        )


__all__ = ["UVDocBridge", "UVDocBackbone", "UVDocModel", "UVDocPreTrainedModel"]