yichael
/
image-match


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
							#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/uvdoc/modular_uvdoc.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_uvdoc.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torch.nn as nn
from torch import Tensor

from ...activations import ACT2FN
from ...backbone_utils import BackboneMixin, filter_output_hidden_states
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_uvdoc import UVDocBackboneConfig, UVDocConfig


class UVDocConvLayer(nn.Module):
    """Conv2d -> BatchNorm2d -> activation, applied in that order.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Size of the convolution kernel.
        stride: Stride of the convolution.
        padding: Padding applied by the convolution on each side.
        padding_mode: Padding mode forwarded to `nn.Conv2d`.
        bias: Whether the convolution has a learnable bias.
        dilation: Dilation rate of the convolution.
        activation: Key into `ACT2FN` selecting the activation, or `None` to
            use `nn.Identity()` (i.e. no activation).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: int = 0,
        padding_mode: str = "zeros",
        bias: bool = False,
        dilation: int = 1,
        activation: str = "relu",
    ):
        super().__init__()

        conv_options = {
            "kernel_size": kernel_size,
            "stride": stride,
            "padding": padding,
            "padding_mode": padding_mode,
            "dilation": dilation,
            "bias": bias,
        }
        self.convolution = nn.Conv2d(in_channels, out_channels, **conv_options)
        self.normalization = nn.BatchNorm2d(out_channels)

        # `None` means "no activation"; the lookup table is only consulted otherwise.
        if activation is None:
            self.activation = nn.Identity()
        else:
            self.activation = ACT2FN[activation]

    def forward(self, input: Tensor) -> Tensor:
        """Run the convolution, normalization and activation on `input`."""
        normalized = self.normalization(self.convolution(input))
        return self.activation(normalized)


class UVDocResidualBlock(nn.Module):
    """Residual block: two (optionally dilated) conv layers plus a shortcut.

    The shortcut is a strided `UVDocConvLayer` without activation when
    `downsample` is true, and an identity mapping otherwise. The activation is
    applied after the main path and the shortcut have been summed.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        downsample: bool = False,
        activation: str = "relu",
    ):
        super().__init__()

        # Shortcut branch: projects/strides the input only when downsampling.
        if downsample:
            self.conv_down = UVDocConvLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=kernel_size // 2,
                bias=True,
                activation=None,
            )
        else:
            self.conv_down = nn.Identity()

        # Settings shared by both convolutions of the main path.
        main_path_options = {
            "kernel_size": kernel_size,
            "padding": padding,
            "dilation": dilation,
            "bias": True,
        }

        # First conv carries the stride and keeps the layer's default activation.
        self.conv_start = UVDocConvLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            stride=stride,
            **main_path_options,
        )

        # Second conv is unstrided and linear; the activation comes after the sum.
        self.conv_final = UVDocConvLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            stride=1,
            activation=None,
            **main_path_options,
        )

        if activation is None:
            self.act_fn = nn.Identity()
        else:
            self.act_fn = ACT2FN[activation]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `act_fn(main_path(hidden_states) + shortcut(hidden_states))`."""
        shortcut = self.conv_down(hidden_states)
        main_path = self.conv_final(self.conv_start(hidden_states))
        return self.act_fn(main_path + shortcut)


class UVDocResNetStage(nn.Module):
    """A ResNet stage: a sequence of `UVDocResidualBlock`s applied one after another.

    Args:
        config: Model configuration. Reads `config.resnet_configs[stage_index]`,
            an iterable of `(in_channels, out_channels, dilation, downsample)`
            tuples (one per residual block), and `config.kernel_size`.
        stage_index: Index of the stage inside `config.resnet_configs`.
    """

    def __init__(self, config, stage_index):
        super().__init__()

        # Padding that keeps a dilated, odd-sized kernel size-preserving is
        # `dilation * (kernel_size // 2)`. Deriving it from the configured kernel
        # size (instead of a hard-coded 2, which is only right for kernel_size=5)
        # keeps the main path aligned with the shortcut, whose padding is
        # `kernel_size // 2`, so the residual sum has matching shapes.
        half_kernel = config.kernel_size // 2

        stages = config.resnet_configs[stage_index]
        self.layers = nn.ModuleList([])
        for in_channels, out_channels, dilation, downsample in stages:
            self.layers.append(
                UVDocResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # A downsampling block halves the resolution via stride 2.
                    stride=2 if downsample else 1,
                    padding=dilation * half_kernel,
                    dilation=dilation,
                    downsample=downsample,
                    kernel_size=config.kernel_size,
                )
            )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Feed `hidden_states` through every residual block in order."""
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states


class UVDocResNet(nn.Module):
    """Stride-2 head convolutions (`resnet_head`) followed by the residual stages (`resnet_down`)."""

    def __init__(self, config):
        super().__init__()

        # One stride-2 conv layer per (in_channels, out_channels) entry of the head spec.
        self.resnet_head = nn.ModuleList(
            [
                UVDocConvLayer(
                    in_channels=head_spec[0],
                    out_channels=head_spec[1],
                    kernel_size=config.kernel_size,
                    stride=2,
                    padding=config.kernel_size // 2,
                )
                for head_spec in config.resnet_head
            ]
        )

        # One residual stage per entry of `config.resnet_configs`.
        self.resnet_down = nn.ModuleList(
            [UVDocResNetStage(config, stage_index) for stage_index in range(len(config.resnet_configs))]
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the head convolutions, then the residual stages, sequentially."""
        features = hidden_states
        for module_group in (self.resnet_head, self.resnet_down):
            for module in module_group:
                features = module(features)
        return features


class UVDocBridgeBlock(GradientCheckpointingLayer):
    """One bridge branch: a chain of channel-preserving dilated `UVDocConvLayer`s.

    Each `(channels, dilation)` entry of `config.stage_configs[bridge_index]`
    yields one conv layer whose padding equals its dilation rate.
    """

    def __init__(self, config, bridge_index):
        super().__init__()
        self.blocks = nn.ModuleList(
            [
                UVDocConvLayer(channels, channels, padding=rate, dilation=rate)
                for channels, rate in config.stage_configs[bridge_index]
            ]
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        # Extra keyword arguments are accepted but not used by this block.
        output = hidden_states
        for conv_layer in self.blocks:
            output = conv_layer(output)
        return output


class UVDocPointPositions2D(nn.Module):
    """Prediction module producing the 2D point positions used for document rectification.

    It is a `UVDocConvLayer` (`conv_down`) followed by a plain `nn.Conv2d`
    (`conv_up`); channel sizes come from `config.out_point_positions2D`.
    """

    def __init__(self, config):
        super().__init__()

        down_spec = config.out_point_positions2D[0]
        up_spec = config.out_point_positions2D[1]

        # Options common to both convolutions: unit stride and kernel_size // 2 padding.
        common_options = {
            "kernel_size": config.kernel_size,
            "stride": 1,
            "padding": config.kernel_size // 2,
            "padding_mode": config.padding_mode,
        }

        self.conv_down = UVDocConvLayer(
            in_channels=down_spec[0],
            out_channels=down_spec[1],
            activation=config.hidden_act,
            **common_options,
        )

        # Final projection: bare convolution, no normalization or activation.
        self.conv_up = nn.Conv2d(
            in_channels=up_spec[0],
            out_channels=up_spec[1],
            **common_options,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply `conv_down` and then `conv_up` to `hidden_states`."""
        return self.conv_up(self.conv_down(hidden_states))


@auto_docstring
class UVDocPreTrainedModel(PreTrainedModel):
    """
    Base class for all UVDoc pre-trained models. Handles model initialization,
    configuration, and loading of pre-trained weights, following the Transformers library conventions.
    """

    config: UVDocConfig
    base_model_prefix = "uvdoc"
    main_input_name = "pixel_values"
    input_modalities = ("image",)
    _can_compile_fullgraph = True
    supports_gradient_checkpointing = True
    # Outputs of every `UVDocBridgeBlock` are recorded under the "hidden_states" key.
    _can_record_outputs = {
        "hidden_states": UVDocBridgeBlock,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights."""
        super()._init_weights(module)
        # PReLU modules are additionally re-initialized through their own
        # `reset_parameters()` after the parent initialization has run.
        if isinstance(module, nn.PReLU):
            module.reset_parameters()


class UVDocBridge(UVDocPreTrainedModel):
    """Collection of `UVDocBridgeBlock` branches, one per entry of `config.stage_configs`.

    Every branch is applied to the *same* input tensor (the branches are not
    chained). The returned `last_hidden_state` is the output of the last branch.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bridge = nn.ModuleList([])
        for bridge_index in range(len(config.stage_configs)):
            self.bridge.append(UVDocBridgeBlock(config, bridge_index))
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithNoAttention:
        # Each branch sees the original `hidden_states`; only the last branch's
        # result is kept in `feature`.
        # NOTE(review): the per-branch outputs are presumably collected into
        # `hidden_states` of the returned object by `capture_outputs` via
        # `_can_record_outputs` — confirm against the decorator's implementation.
        for layer in self.bridge:
            feature = layer(hidden_states)
        return BaseModelOutputWithNoAttention(last_hidden_state=feature)


@auto_docstring(
    custom_intro="""
    UVDoc backbone model for feature extraction.
    """
)
class UVDocBackbone(BackboneMixin, UVDocPreTrainedModel):
    has_attentions = False
    base_model_prefix = "backbone"

    def __init__(self, config: UVDocBackboneConfig):
        super().__init__(config)

        # Channel count per stage: the output width of the last ResNet head conv,
        # followed by one entry per bridge stage. Each bridge stage is a list of
        # (channels, dilation) pairs whose convolutions preserve the channel
        # count, so the stage's output width is the channel element (index 0) of
        # its last pair. Index 1 is the dilation rate, not a channel count.
        num_features = [config.resnet_head[-1][-1]]
        for stage in config.stage_configs:
            num_features.append(stage[-1][0])
        self.num_features = num_features

        self.resnet = UVDocResNet(config)
        self.bridge = UVDocBridge(config)

        self.post_init()

    @can_return_tuple
    @filter_output_hidden_states
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BackboneOutput:
        kwargs["output_hidden_states"] = True  # required to extract layers for the stages
        hidden_states = self.resnet(pixel_values)
        outputs = self.bridge(hidden_states, **kwargs)

        # Keep only the recorded hidden states whose stage name was requested
        # through `out_features`, preserving stage order.
        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (outputs.hidden_states[idx],)

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states,
        )


class UVDocHead(nn.Module):
    """Prediction head: a 1x1 `UVDocConvLayer` connector followed by `UVDocPointPositions2D`.

    The connector's input width is `config.bridge_connector[0]` multiplied by
    the number of bridge stages in `config.backbone_config.stage_configs`.
    """

    def __init__(self, config):
        super().__init__()
        self.num_bridge_layers = len(config.backbone_config.stage_configs)

        # 1x1 convolution reducing the widened input to `config.bridge_connector[1]` channels.
        self.bridge_connector = UVDocConvLayer(
            in_channels=config.bridge_connector[0] * self.num_bridge_layers,
            out_channels=config.bridge_connector[1],
            kernel_size=1,
            stride=1,
            padding=0,
            dilation=1,
        )

        self.out_point_positions2D = UVDocPointPositions2D(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Apply the connector and then the point-position module to `hidden_states`.

        Extra keyword arguments are accepted but not used.
        """
        hidden_states = self.bridge_connector(hidden_states)
        hidden_states = self.out_point_positions2D(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro=r"""
    The model takes raw document images (pixel values) as input, processes them through the UVDoc backbone to predict spatial transformation parameters,
    and outputs the rectified (corrected) document image tensor.
    """
)
class UVDocModel(UVDocPreTrainedModel):
    def __init__(self, config: UVDocConfig):
        super().__init__(config)

        # Feature extractor built from the nested backbone config, then the head.
        self.backbone = UVDocBackbone(config.backbone_config)
        self.head = UVDocHead(config)
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor] | BaseModelOutputWithNoAttention:
        features = self.backbone(pixel_values, **kwargs)

        # Concatenate the selected feature maps along dim 1 before handing them to the head.
        stacked_maps = torch.cat(features.feature_maps, dim=1)
        prediction = self.head(stacked_maps, **kwargs)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=prediction,
            hidden_states=features.hidden_states,
        )


# Names exported by `from <module> import *`; the internal building-block layers are not listed.
__all__ = ["UVDocBridge", "UVDocBackbone", "UVDocModel", "UVDocPreTrainedModel"]