encoder.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. # LICENSE HEADER MANAGED BY add-license-header
  2. #
  3. # Copyright 2018 Kornia Team
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
from typing import Any, List, Optional, Tuple

import torch
from torch import nn

from kornia.core import Module, Tensor

from .vgg import vgg19_bn
  22. class VGG19(Module):
  23. def __init__(self, amp: bool = False, amp_dtype: torch.dtype = torch.float16) -> None:
  24. super().__init__()
  25. self.layers = nn.ModuleList(vgg19_bn().features[:40]) # type: ignore
  26. # Maxpool layers: 6, 13, 26, 39
  27. self.amp = amp
  28. self.amp_dtype = amp_dtype
  29. def forward(self, x: Tensor, **kwargs): # type: ignore[no-untyped-def]
  30. with torch.autocast("cuda", enabled=self.amp, dtype=self.amp_dtype):
  31. feats = []
  32. sizes = []
  33. for layer in self.layers:
  34. if isinstance(layer, nn.MaxPool2d):
  35. feats.append(x)
  36. sizes.append(x.shape[-2:])
  37. x = layer(x)
  38. return feats, sizes
  39. class FrozenDINOv2(Module):
  40. def __init__(self, amp: bool = True, amp_dtype: torch.dtype = torch.float16, dinov2_weights: Optional[Any] = None):
  41. super().__init__()
  42. if dinov2_weights is None:
  43. dinov2_weights = torch.hub.load_state_dict_from_url(
  44. "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth", map_location="cpu"
  45. )
  46. from .transformer import vit_large
  47. vit_kwargs = dict(
  48. img_size=518,
  49. patch_size=14,
  50. init_values=1.0,
  51. ffn_layer="mlp",
  52. block_chunks=0,
  53. )
  54. dinov2_vitl14 = vit_large(**vit_kwargs).eval()
  55. dinov2_vitl14.load_state_dict(dinov2_weights)
  56. self.amp = amp
  57. self.amp_dtype = amp_dtype
  58. if self.amp:
  59. dinov2_vitl14 = dinov2_vitl14.to(self.amp_dtype)
  60. self.dinov2_vitl14 = [dinov2_vitl14] # ugly hack to not show parameters to DDP
  61. def forward(self, x: Tensor): # type: ignore[no-untyped-def]
  62. B, _C, H, W = x.shape
  63. if self.dinov2_vitl14[0].device != x.device:
  64. self.dinov2_vitl14[0] = self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
  65. with torch.inference_mode():
  66. dinov2_features_16 = self.dinov2_vitl14[0].forward_features(x.to(self.amp_dtype))
  67. features_16 = dinov2_features_16["x_norm_patchtokens"].permute(0, 2, 1).reshape(B, 1024, H // 14, W // 14)
  68. return [features_16.clone()], [(H // 14, W // 14)] # clone from inference mode to use in autograd
  69. class VGG_DINOv2(Module):
  70. def __init__(self, vgg_kwargs=None, dinov2_kwargs=None): # type: ignore[no-untyped-def]
  71. if (vgg_kwargs is None) or (dinov2_kwargs is None):
  72. raise ValueError("Input kwargs please")
  73. super().__init__()
  74. self.vgg = VGG19(**vgg_kwargs)
  75. self.frozen_dinov2 = FrozenDINOv2(**dinov2_kwargs)
  76. def forward(self, x: Tensor) -> Tuple[Tensor, Tuple[int, int]]:
  77. feats_vgg, sizes_vgg = self.vgg(x)
  78. feat_dinov2, size_dinov2 = self.frozen_dinov2(x)
  79. return feats_vgg + feat_dinov2, sizes_vgg + size_dinov2