configuration_dpt.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """DPT model configuration"""
  15. from huggingface_hub.dataclasses import strict
  16. from ...backbone_utils import consolidate_backbone_kwargs_to_config
  17. from ...configuration_utils import PreTrainedConfig
  18. from ...utils import auto_docstring
  19. from ..auto.configuration_auto import AutoConfig
  20. @auto_docstring(checkpoint="Intel/dpt-large")
  21. @strict
  22. class DPTConfig(PreTrainedConfig):
  23. r"""
  24. is_hybrid (`bool`, *optional*, defaults to `False`):
  25. Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models.
  26. backbone_out_indices (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
  27. Indices of the intermediate hidden states to use from backbone.
  28. readout_type (`str`, *optional*, defaults to `"project"`):
  29. The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of
  30. the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`].
  31. - "ignore" simply ignores the CLS token.
  32. - "add" passes the information from the CLS token to all other tokens by adding the representations.
  33. - "project" passes information to the other tokens by concatenating the readout to all other tokens before
  34. projecting the
  35. representation to the original feature dimension D using a linear layer followed by a GELU non-linearity.
  36. reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
  37. The up/downsampling factors of the reassemble layers.
  38. neck_hidden_sizes (`list[str]`, *optional*, defaults to `[96, 192, 384, 768]`):
  39. The hidden sizes to project to for the feature maps of the backbone.
  40. fusion_hidden_size (`int`, *optional*, defaults to 256):
  41. The number of channels before fusion.
  42. head_in_index (`int`, *optional*, defaults to -1):
  43. The index of the features to use in the heads.
  44. use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
  45. Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
  46. use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
  47. Whether to use bias in the pre-activate residual units of the fusion blocks.
  48. add_projection (`bool`, *optional*, defaults to `False`):
  49. Whether to add a projection layer before the depth estimation head.
  50. use_auxiliary_head (`bool`, *optional*, defaults to `True`):
  51. Whether to use an auxiliary head during training.
  52. auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
  53. Weight of the cross-entropy loss of the auxiliary head.
  54. semantic_classifier_dropout (`float`, *optional*, defaults to 0.1):
  55. The dropout ratio for the semantic classification head.
  56. backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
  57. Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
  58. neck_ignore_stages (`list[int]`, *optional*, defaults to `[0, 1]`):
  59. Used only for the `hybrid` embedding type. The stages of the readout layers to ignore.
  60. pooler_output_size (`int`, *optional*):
  61. Dimensionality of the pooler layer. If None, defaults to `hidden_size`.
  62. pooler_act (`str`, *optional*, defaults to `"tanh"`):
  63. The activation function to be used by the pooler.
  64. Example:
  65. ```python
  66. >>> from transformers import DPTModel, DPTConfig
  67. >>> # Initializing a DPT dpt-large style configuration
  68. >>> configuration = DPTConfig()
  69. >>> # Initializing a model from the dpt-large style configuration
  70. >>> model = DPTModel(configuration)
  71. >>> # Accessing the model configuration
  72. >>> configuration = model.config
  73. ```"""
  74. model_type = "dpt"
  75. sub_configs = {"backbone_config": AutoConfig}
  76. # NOTE: some values are typed as `None` on purpose
  77. # DPT creates one of: backbone or the general model only
  78. # so official checkpoint saved them as `None`
  79. hidden_size: int = 768
  80. num_hidden_layers: None | int = 12
  81. num_attention_heads: int | None = 12
  82. intermediate_size: int | None = 3072
  83. hidden_act: str = "gelu"
  84. hidden_dropout_prob: float | int | None = 0.0
  85. attention_probs_dropout_prob: float | int | None = 0.0
  86. initializer_range: float = 0.02
  87. layer_norm_eps: float | None = 1e-12
  88. image_size: int | list[int] | tuple[int, int] | None = 384
  89. patch_size: int | list[int] | tuple[int, int] | None = 16
  90. num_channels: int | None = 3
  91. is_hybrid: bool = False
  92. qkv_bias: bool | None = True
  93. backbone_out_indices: list[int] | tuple[int, ...] | None = (2, 5, 8, 11)
  94. readout_type: str = "project"
  95. reassemble_factors: list[int | float] | tuple[int | float, ...] = (4, 2, 1, 0.5)
  96. neck_hidden_sizes: list[int] | tuple[int, ...] = (96, 192, 384, 768)
  97. fusion_hidden_size: int = 256
  98. head_in_index: int = -1
  99. use_batch_norm_in_fusion_residual: bool | None = False
  100. use_bias_in_fusion_residual: bool | None = None
  101. add_projection: bool = False
  102. use_auxiliary_head: bool | None = True
  103. auxiliary_loss_weight: float = 0.4
  104. semantic_loss_ignore_index: int = 255
  105. semantic_classifier_dropout: float | int = 0.1
  106. backbone_featmap_shape: list[int] | tuple[int, ...] | None = (1, 1024, 24, 24)
  107. neck_ignore_stages: list[int] | tuple[int, ...] = (0, 1)
  108. backbone_config: dict | PreTrainedConfig | None = None
  109. pooler_output_size: int | None = None
  110. pooler_act: str = "tanh"
  111. def __post_init__(self, **kwargs):
  112. if self.readout_type not in ["ignore", "add", "project"]:
  113. raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']")
  114. if self.is_hybrid:
  115. if isinstance(self.backbone_config, dict):
  116. self.backbone_config.setdefault("model_type", "bit")
  117. self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
  118. backbone_config=self.backbone_config,
  119. default_config_type="bit",
  120. default_config_kwargs={
  121. "global_padding": "same",
  122. "layer_type": "bottleneck",
  123. "depths": [3, 4, 9],
  124. "out_features": ["stage1", "stage2", "stage3"],
  125. "embedding_dynamic_padding": True,
  126. },
  127. **kwargs,
  128. )
  129. if self.readout_type != "project":
  130. raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.")
  131. elif kwargs.get("backbone") is not None or self.backbone_config is not None:
  132. self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
  133. backbone_config=self.backbone_config,
  134. **kwargs,
  135. )
  136. self.backbone_out_indices = None
  137. self.backbone_featmap_shape = self.backbone_featmap_shape if self.is_hybrid else None
  138. self.neck_ignore_stages = self.neck_ignore_stages if self.is_hybrid else []
  139. self.pooler_output_size = self.pooler_output_size if self.pooler_output_size else self.hidden_size
  140. super().__post_init__(**kwargs)
# Names exported by a star-import of this module.
__all__ = ["DPTConfig"]