| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710 |
- """ MobileViT
- Paper:
- V1: `MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer` - https://arxiv.org/abs/2110.02178
- V2: `Separable Self-attention for Mobile Vision Transformers` - https://arxiv.org/abs/2206.02680
- MobileVitBlock and checkpoints adapted from https://github.com/apple/ml-cvnets (original copyright below)
- License: https://github.com/apple/ml-cvnets/blob/main/LICENSE (Apple open source)
- Rest of code, ByobNet, and Transformer block hacked together by / Copyright 2022, Ross Wightman
- """
- #
- # For licensing see accompanying LICENSE file.
- # Copyright (C) 2020 Apple Inc. All Rights Reserved.
- #
- import math
- from typing import Callable, Tuple, Optional, Type
- import torch
- import torch.nn.functional as F
- from torch import nn
- from timm.layers import to_2tuple, make_divisible, GroupNorm1, ConvMlp, DropPath, is_exportable
- from ._builder import build_model_with_cfg
- from ._features_fx import register_notrace_module
- from ._registry import register_model, generate_default_cfgs, register_model_deprecations
- from .byobnet import register_block, ByoBlockCfg, ByoModelCfg, ByobNet, LayerFn, num_groups
- from .vision_transformer import Block as TransformerBlock
- __all__ = []
- def _inverted_residual_block(d, c, s, br=4.0):
- # inverted residual is a bottleneck block with bottle_ratio > 1 applied to in_chs, linear output, gs=1 (depthwise)
- return ByoBlockCfg(
- type='bottle', d=d, c=c, s=s, gs=1, br=br,
- block_kwargs=dict(bottle_in=True, linear_out=True))
- def _mobilevit_block(d, c, s, transformer_dim, transformer_depth, patch_size=4, br=4.0):
- # inverted residual + mobilevit blocks as per MobileViT network
- return (
- _inverted_residual_block(d=d, c=c, s=s, br=br),
- ByoBlockCfg(
- type='mobilevit', d=1, c=c, s=1,
- block_kwargs=dict(
- transformer_dim=transformer_dim,
- transformer_depth=transformer_depth,
- patch_size=patch_size)
- )
- )
- def _mobilevitv2_block(d, c, s, transformer_depth, patch_size=2, br=2.0, transformer_br=0.5):
- # inverted residual + mobilevit blocks as per MobileViT network
- return (
- _inverted_residual_block(d=d, c=c, s=s, br=br),
- ByoBlockCfg(
- type='mobilevit2', d=1, c=c, s=1, br=transformer_br, gs=1,
- block_kwargs=dict(
- transformer_depth=transformer_depth,
- patch_size=patch_size)
- )
- )
- def _mobilevitv2_cfg(multiplier=1.0):
- chs = (64, 128, 256, 384, 512)
- if multiplier != 1.0:
- chs = tuple([int(c * multiplier) for c in chs])
- cfg = ByoModelCfg(
- blocks=(
- _inverted_residual_block(d=1, c=chs[0], s=1, br=2.0),
- _inverted_residual_block(d=2, c=chs[1], s=2, br=2.0),
- _mobilevitv2_block(d=1, c=chs[2], s=2, transformer_depth=2),
- _mobilevitv2_block(d=1, c=chs[3], s=2, transformer_depth=4),
- _mobilevitv2_block(d=1, c=chs[4], s=2, transformer_depth=3),
- ),
- stem_chs=int(32 * multiplier),
- stem_type='3x3',
- stem_pool='',
- downsample='',
- act_layer='silu',
- )
- return cfg
- model_cfgs = dict(
- mobilevit_xxs=ByoModelCfg(
- blocks=(
- _inverted_residual_block(d=1, c=16, s=1, br=2.0),
- _inverted_residual_block(d=3, c=24, s=2, br=2.0),
- _mobilevit_block(d=1, c=48, s=2, transformer_dim=64, transformer_depth=2, patch_size=2, br=2.0),
- _mobilevit_block(d=1, c=64, s=2, transformer_dim=80, transformer_depth=4, patch_size=2, br=2.0),
- _mobilevit_block(d=1, c=80, s=2, transformer_dim=96, transformer_depth=3, patch_size=2, br=2.0),
- ),
- stem_chs=16,
- stem_type='3x3',
- stem_pool='',
- downsample='',
- act_layer='silu',
- num_features=320,
- ),
- mobilevit_xs=ByoModelCfg(
- blocks=(
- _inverted_residual_block(d=1, c=32, s=1),
- _inverted_residual_block(d=3, c=48, s=2),
- _mobilevit_block(d=1, c=64, s=2, transformer_dim=96, transformer_depth=2, patch_size=2),
- _mobilevit_block(d=1, c=80, s=2, transformer_dim=120, transformer_depth=4, patch_size=2),
- _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=3, patch_size=2),
- ),
- stem_chs=16,
- stem_type='3x3',
- stem_pool='',
- downsample='',
- act_layer='silu',
- num_features=384,
- ),
- mobilevit_s=ByoModelCfg(
- blocks=(
- _inverted_residual_block(d=1, c=32, s=1),
- _inverted_residual_block(d=3, c=64, s=2),
- _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=2, patch_size=2),
- _mobilevit_block(d=1, c=128, s=2, transformer_dim=192, transformer_depth=4, patch_size=2),
- _mobilevit_block(d=1, c=160, s=2, transformer_dim=240, transformer_depth=3, patch_size=2),
- ),
- stem_chs=16,
- stem_type='3x3',
- stem_pool='',
- downsample='',
- act_layer='silu',
- num_features=640,
- ),
- semobilevit_s=ByoModelCfg(
- blocks=(
- _inverted_residual_block(d=1, c=32, s=1),
- _inverted_residual_block(d=3, c=64, s=2),
- _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=2, patch_size=2),
- _mobilevit_block(d=1, c=128, s=2, transformer_dim=192, transformer_depth=4, patch_size=2),
- _mobilevit_block(d=1, c=160, s=2, transformer_dim=240, transformer_depth=3, patch_size=2),
- ),
- stem_chs=16,
- stem_type='3x3',
- stem_pool='',
- downsample='',
- attn_layer='se',
- attn_kwargs=dict(rd_ratio=1/8),
- num_features=640,
- ),
- mobilevitv2_050=_mobilevitv2_cfg(.50),
- mobilevitv2_075=_mobilevitv2_cfg(.75),
- mobilevitv2_125=_mobilevitv2_cfg(1.25),
- mobilevitv2_100=_mobilevitv2_cfg(1.0),
- mobilevitv2_150=_mobilevitv2_cfg(1.5),
- mobilevitv2_175=_mobilevitv2_cfg(1.75),
- mobilevitv2_200=_mobilevitv2_cfg(2.0),
- )
- @register_notrace_module
- class MobileVitBlock(nn.Module):
- """ MobileViT block
- Paper: https://arxiv.org/abs/2110.02178?context=cs.LG
- """
- def __init__(
- self,
- in_chs: int,
- out_chs: Optional[int] = None,
- kernel_size: int = 3,
- stride: int = 1,
- bottle_ratio: float = 1.0,
- group_size: Optional[int] = None,
- dilation: Tuple[int, int] = (1, 1),
- mlp_ratio: float = 2.0,
- transformer_dim: Optional[int] = None,
- transformer_depth: int = 2,
- patch_size: int = 8,
- num_heads: int = 4,
- attn_drop: float = 0.,
- drop: int = 0.,
- no_fusion: bool = False,
- drop_path_rate: float = 0.,
- layers: LayerFn = None,
- transformer_norm_layer: Type[nn.Module] = nn.LayerNorm,
- device=None,
- dtype=None,
- **kwargs, # eat unused args
- ):
- dd = {'device': device, 'dtype': dtype}
- super().__init__()
- layers = layers or LayerFn()
- groups = num_groups(group_size, in_chs)
- out_chs = out_chs or in_chs
- transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs)
- self.conv_kxk = layers.conv_norm_act(
- in_chs,
- in_chs,
- kernel_size=kernel_size,
- stride=stride,
- groups=groups,
- dilation=dilation[0],
- **dd,
- )
- self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False, **dd)
- self.transformer = nn.Sequential(*[
- TransformerBlock(
- transformer_dim,
- mlp_ratio=mlp_ratio,
- num_heads=num_heads,
- qkv_bias=True,
- attn_drop=attn_drop,
- proj_drop=drop,
- drop_path=drop_path_rate,
- act_layer=layers.act,
- norm_layer=transformer_norm_layer,
- **dd,
- )
- for _ in range(transformer_depth)
- ])
- self.norm = transformer_norm_layer(transformer_dim, **dd)
- self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1, **dd)
- if no_fusion:
- self.conv_fusion = None
- else:
- self.conv_fusion = layers.conv_norm_act(in_chs + out_chs, out_chs, kernel_size=kernel_size, stride=1, **dd)
- self.patch_size = to_2tuple(patch_size)
- self.patch_area = self.patch_size[0] * self.patch_size[1]
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- shortcut = x
- # Local representation
- x = self.conv_kxk(x)
- x = self.conv_1x1(x)
- # Unfold (feature map -> patches)
- patch_h, patch_w = self.patch_size
- B, C, H, W = x.shape
- new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w
- num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w # n_h, n_w
- num_patches = num_patch_h * num_patch_w # N
- interpolate = False
- if new_h != H or new_w != W:
- # Note: Padding can be done, but then it needs to be handled in attention function.
- x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=False)
- interpolate = True
- # [B, C, H, W] --> [B * C * n_h, n_w, p_h, p_w]
- x = x.reshape(B * C * num_patch_h, patch_h, num_patch_w, patch_w).transpose(1, 2)
- # [B * C * n_h, n_w, p_h, p_w] --> [BP, N, C] where P = p_h * p_w and N = n_h * n_w
- x = x.reshape(B, C, num_patches, self.patch_area).transpose(1, 3).reshape(B * self.patch_area, num_patches, -1)
- # Global representations
- x = self.transformer(x)
- x = self.norm(x)
- # Fold (patch -> feature map)
- # [B, P, N, C] --> [B*C*n_h, n_w, p_h, p_w]
- x = x.contiguous().view(B, self.patch_area, num_patches, -1)
- x = x.transpose(1, 3).reshape(B * C * num_patch_h, num_patch_w, patch_h, patch_w)
- # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W]
- x = x.transpose(1, 2).reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w)
- if interpolate:
- x = F.interpolate(x, size=(H, W), mode="bilinear", align_corners=False)
- x = self.conv_proj(x)
- if self.conv_fusion is not None:
- x = self.conv_fusion(torch.cat((shortcut, x), dim=1))
- return x
- class LinearSelfAttention(nn.Module):
- """
- This layer applies a self-attention with linear complexity, as described in `https://arxiv.org/abs/2206.02680`
- This layer can be used for self- as well as cross-attention.
- Args:
- embed_dim (int): :math:`C` from an expected input of size :math:`(N, C, H, W)`
- attn_drop (float): Dropout value for context scores. Default: 0.0
- bias (bool): Use bias in learnable layers. Default: True
- Shape:
- - Input: :math:`(N, C, P, N)` where :math:`N` is the batch size, :math:`C` is the input channels,
- :math:`P` is the number of pixels in the patch, and :math:`N` is the number of patches
- - Output: same as the input
- .. note::
- For MobileViTv2, we unfold the feature map [B, C, H, W] into [B, C, P, N] where P is the number of pixels
- in a patch and N is the number of patches. Because channel is the first dimension in this unfolded tensor,
- we use point-wise convolution (instead of a linear layer). This avoids a transpose operation (which may be
- expensive on resource-constrained devices) that may be required to convert the unfolded tensor from
- channel-first to channel-last format in case of a linear layer.
- """
- def __init__(
- self,
- embed_dim: int,
- attn_drop: float = 0.0,
- proj_drop: float = 0.0,
- bias: bool = True,
- device=None,
- dtype=None,
- ) -> None:
- dd = {'device': device, 'dtype': dtype}
- super().__init__()
- self.embed_dim = embed_dim
- self.qkv_proj = nn.Conv2d(
- in_channels=embed_dim,
- out_channels=1 + (2 * embed_dim),
- bias=bias,
- kernel_size=1,
- **dd,
- )
- self.attn_drop = nn.Dropout(attn_drop)
- self.out_proj = nn.Conv2d(
- in_channels=embed_dim,
- out_channels=embed_dim,
- bias=bias,
- kernel_size=1,
- **dd,
- )
- self.out_drop = nn.Dropout(proj_drop)
- def _forward_self_attn(self, x: torch.Tensor) -> torch.Tensor:
- # [B, C, P, N] --> [B, h + 2d, P, N]
- qkv = self.qkv_proj(x)
- # Project x into query, key and value
- # Query --> [B, 1, P, N]
- # value, key --> [B, d, P, N]
- query, key, value = qkv.split([1, self.embed_dim, self.embed_dim], dim=1)
- # apply softmax along N dimension
- context_scores = F.softmax(query, dim=-1)
- context_scores = self.attn_drop(context_scores)
- # Compute context vector
- # [B, d, P, N] x [B, 1, P, N] -> [B, d, P, N] --> [B, d, P, 1]
- context_vector = (key * context_scores).sum(dim=-1, keepdim=True)
- # combine context vector with values
- # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N]
- out = F.relu(value) * context_vector.expand_as(value)
- out = self.out_proj(out)
- out = self.out_drop(out)
- return out
- @torch.jit.ignore()
- def _forward_cross_attn(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
- # x --> [B, C, P, N]
- # x_prev = [B, C, P, M]
- batch_size, in_dim, kv_patch_area, kv_num_patches = x.shape
- q_patch_area, q_num_patches = x.shape[-2:]
- assert (
- kv_patch_area == q_patch_area
- ), "The number of pixels in a patch for query and key_value should be the same"
- # compute query, key, and value
- # [B, C, P, M] --> [B, 1 + d, P, M]
- qk = F.conv2d(
- x_prev,
- weight=self.qkv_proj.weight[:self.embed_dim + 1],
- bias=self.qkv_proj.bias[:self.embed_dim + 1],
- )
- # [B, 1 + d, P, M] --> [B, 1, P, M], [B, d, P, M]
- query, key = qk.split([1, self.embed_dim], dim=1)
- # [B, C, P, N] --> [B, d, P, N]
- value = F.conv2d(
- x,
- weight=self.qkv_proj.weight[self.embed_dim + 1],
- bias=self.qkv_proj.bias[self.embed_dim + 1] if self.qkv_proj.bias is not None else None,
- )
- # apply softmax along M dimension
- context_scores = F.softmax(query, dim=-1)
- context_scores = self.attn_drop(context_scores)
- # compute context vector
- # [B, d, P, M] * [B, 1, P, M] -> [B, d, P, M] --> [B, d, P, 1]
- context_vector = (key * context_scores).sum(dim=-1, keepdim=True)
- # combine context vector with values
- # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N]
- out = F.relu(value) * context_vector.expand_as(value)
- out = self.out_proj(out)
- out = self.out_drop(out)
- return out
- def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
- if x_prev is None:
- return self._forward_self_attn(x)
- else:
- return self._forward_cross_attn(x, x_prev=x_prev)
- class LinearTransformerBlock(nn.Module):
- """
- This class defines the pre-norm transformer encoder with linear self-attention in `MobileViTv2 paper <>`_
- Args:
- embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(B, C_{in}, P, N)`
- mlp_ratio (float): Inner dimension ratio of the FFN relative to embed_dim
- drop (float): Dropout rate. Default: 0.0
- attn_drop (float): Dropout rate for attention in multi-head attention. Default: 0.0
- drop_path (float): Stochastic depth rate Default: 0.0
- norm_layer (Callable): Normalization layer. Default: layer_norm_2d
- Shape:
- - Input: :math:`(B, C_{in}, P, N)` where :math:`B` is batch size, :math:`C_{in}` is input embedding dim,
- :math:`P` is number of pixels in a patch, and :math:`N` is number of patches,
- - Output: same shape as the input
- """
- def __init__(
- self,
- embed_dim: int,
- mlp_ratio: float = 2.0,
- drop: float = 0.0,
- attn_drop: float = 0.0,
- drop_path: float = 0.0,
- act_layer: Optional[Type[nn.Module]] = None,
- norm_layer: Optional[Type[nn.Module]] = None,
- device=None,
- dtype=None,
- ) -> None:
- dd = {'device': device, 'dtype': dtype}
- super().__init__()
- act_layer = act_layer or nn.SiLU
- norm_layer = norm_layer or GroupNorm1
- self.norm1 = norm_layer(embed_dim, **dd)
- self.attn = LinearSelfAttention(embed_dim=embed_dim, attn_drop=attn_drop, proj_drop=drop, **dd)
- self.drop_path1 = DropPath(drop_path)
- self.norm2 = norm_layer(embed_dim, **dd)
- self.mlp = ConvMlp(
- in_features=embed_dim,
- hidden_features=int(embed_dim * mlp_ratio),
- act_layer=act_layer,
- drop=drop,
- **dd)
- self.drop_path2 = DropPath(drop_path)
- def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
- if x_prev is None:
- # self-attention
- x = x + self.drop_path1(self.attn(self.norm1(x)))
- else:
- # cross-attention
- res = x
- x = self.norm1(x) # norm
- x = self.attn(x, x_prev) # attn
- x = self.drop_path1(x) + res # residual
- # Feed forward network
- x = x + self.drop_path2(self.mlp(self.norm2(x)))
- return x
- @register_notrace_module
- class MobileVitV2Block(nn.Module):
- """
- This class defines the `MobileViTv2 block <>`_
- """
- def __init__(
- self,
- in_chs: int,
- out_chs: Optional[int] = None,
- kernel_size: int = 3,
- bottle_ratio: float = 1.0,
- group_size: Optional[int] = 1,
- dilation: Tuple[int, int] = (1, 1),
- mlp_ratio: float = 2.0,
- transformer_dim: Optional[int] = None,
- transformer_depth: int = 2,
- patch_size: int = 8,
- attn_drop: float = 0.,
- drop: int = 0.,
- drop_path_rate: float = 0.,
- layers: LayerFn = None,
- transformer_norm_layer: Type[nn.Module] = GroupNorm1,
- device=None,
- dtype=None,
- **kwargs, # eat unused args
- ):
- dd = {'device': device, 'dtype': dtype}
- super().__init__()
- layers = layers or LayerFn()
- groups = num_groups(group_size, in_chs)
- out_chs = out_chs or in_chs
- transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs)
- self.conv_kxk = layers.conv_norm_act(
- in_chs,
- in_chs,
- kernel_size=kernel_size,
- stride=1,
- groups=groups,
- dilation=dilation[0],
- **dd,
- )
- self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False, **dd)
- self.transformer = nn.Sequential(*[
- LinearTransformerBlock(
- transformer_dim,
- mlp_ratio=mlp_ratio,
- attn_drop=attn_drop,
- drop=drop,
- drop_path=drop_path_rate,
- act_layer=layers.act,
- norm_layer=transformer_norm_layer,
- **dd,
- )
- for _ in range(transformer_depth)
- ])
- self.norm = transformer_norm_layer(transformer_dim, **dd)
- self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1, apply_act=False, **dd)
- self.patch_size = to_2tuple(patch_size)
- self.patch_area = self.patch_size[0] * self.patch_size[1]
- self.coreml_exportable = is_exportable()
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- B, C, H, W = x.shape
- patch_h, patch_w = self.patch_size
- new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w
- num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w # n_h, n_w
- num_patches = num_patch_h * num_patch_w # N
- if new_h != H or new_w != W:
- x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=True)
- # Local representation
- x = self.conv_kxk(x)
- x = self.conv_1x1(x)
- # Unfold (feature map -> patches), [B, C, H, W] -> [B, C, P, N]
- C = x.shape[1]
- if self.coreml_exportable:
- x = F.unfold(x, kernel_size=(patch_h, patch_w), stride=(patch_h, patch_w))
- else:
- x = x.reshape(B, C, num_patch_h, patch_h, num_patch_w, patch_w).permute(0, 1, 3, 5, 2, 4)
- x = x.reshape(B, C, -1, num_patches)
- # Global representations
- x = self.transformer(x)
- x = self.norm(x)
- # Fold (patches -> feature map), [B, C, P, N] --> [B, C, H, W]
- if self.coreml_exportable:
- # adopted from https://github.com/apple/ml-cvnets/blob/main/cvnets/modules/mobilevit_block.py#L609-L624
- x = x.reshape(B, C * patch_h * patch_w, num_patch_h, num_patch_w)
- x = F.pixel_shuffle(x, upscale_factor=patch_h)
- else:
- x = x.reshape(B, C, patch_h, patch_w, num_patch_h, num_patch_w).permute(0, 1, 4, 2, 5, 3)
- x = x.reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w)
- x = self.conv_proj(x)
- return x
- register_block('mobilevit', MobileVitBlock)
- register_block('mobilevit2', MobileVitV2Block)
- def _create_mobilevit(variant, cfg_variant=None, pretrained=False, **kwargs):
- return build_model_with_cfg(
- ByobNet, variant, pretrained,
- model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant],
- feature_cfg=dict(flatten_sequential=True),
- **kwargs)
- def _create_mobilevit2(variant, cfg_variant=None, pretrained=False, **kwargs):
- return build_model_with_cfg(
- ByobNet, variant, pretrained,
- model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant],
- feature_cfg=dict(flatten_sequential=True),
- **kwargs)
- def _cfg(url='', **kwargs):
- return {
- 'url': url, 'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (8, 8),
- 'crop_pct': 0.9, 'interpolation': 'bicubic',
- 'mean': (0., 0., 0.), 'std': (1., 1., 1.),
- 'first_conv': 'stem.conv', 'classifier': 'head.fc',
- 'fixed_input_size': False,
- 'license': 'cvnets-license',
- **kwargs
- }
- default_cfgs = generate_default_cfgs({
- 'mobilevit_xxs.cvnets_in1k': _cfg(hf_hub_id='timm/'),
- 'mobilevit_xs.cvnets_in1k': _cfg(hf_hub_id='timm/'),
- 'mobilevit_s.cvnets_in1k': _cfg(hf_hub_id='timm/'),
- 'mobilevitv2_050.cvnets_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_075.cvnets_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_100.cvnets_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_125.cvnets_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_150.cvnets_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_175.cvnets_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_200.cvnets_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_150.cvnets_in22k_ft_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_175.cvnets_in22k_ft_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_200.cvnets_in22k_ft_in1k': _cfg(
- hf_hub_id='timm/',
- crop_pct=0.888),
- 'mobilevitv2_150.cvnets_in22k_ft_in1k_384': _cfg(
- hf_hub_id='timm/',
- input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
- 'mobilevitv2_175.cvnets_in22k_ft_in1k_384': _cfg(
- hf_hub_id='timm/',
- input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
- 'mobilevitv2_200.cvnets_in22k_ft_in1k_384': _cfg(
- hf_hub_id='timm/',
- input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
- })
- @register_model
- def mobilevit_xxs(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevit_xxs', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevit_xs(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevit_xs', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevit_s(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevit_s', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevitv2_050(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevitv2_050', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevitv2_075(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevitv2_075', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevitv2_100(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevitv2_100', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevitv2_125(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevitv2_125', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevitv2_150(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevitv2_150', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevitv2_175(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevitv2_175', pretrained=pretrained, **kwargs)
- @register_model
- def mobilevitv2_200(pretrained=False, **kwargs) -> ByobNet:
- return _create_mobilevit('mobilevitv2_200', pretrained=pretrained, **kwargs)
- register_model_deprecations(__name__, {
- 'mobilevitv2_150_in22ft1k': 'mobilevitv2_150.cvnets_in22k_ft_in1k',
- 'mobilevitv2_175_in22ft1k': 'mobilevitv2_175.cvnets_in22k_ft_in1k',
- 'mobilevitv2_200_in22ft1k': 'mobilevitv2_200.cvnets_in22k_ft_in1k',
- 'mobilevitv2_150_384_in22ft1k': 'mobilevitv2_150.cvnets_in22k_ft_in1k_384',
- 'mobilevitv2_175_384_in22ft1k': 'mobilevitv2_175.cvnets_in22k_ft_in1k_384',
- 'mobilevitv2_200_384_in22ft1k': 'mobilevitv2_200.cvnets_in22k_ft_in1k_384',
- })
|