| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349 |
- # Copyright 2024 Intel Labs and The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """PyTorch ZoeDepth model."""
- import math
- from dataclasses import dataclass
- import torch
- from torch import nn
- from ... import initialization as init
- from ...activations import ACT2FN
- from ...backbone_utils import load_backbone
- from ...modeling_outputs import DepthEstimatorOutput
- from ...modeling_utils import PreTrainedModel
- from ...utils import ModelOutput, auto_docstring, logging
- from .configuration_zoedepth import ZoeDepthConfig
- logger = logging.get_logger(__name__)
@dataclass
@auto_docstring(
    custom_intro="""
    Extension of `DepthEstimatorOutput` to include domain logits (ZoeDepth specific).
    """
)
class ZoeDepthDepthEstimatorOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    domain_logits (`torch.FloatTensor` of shape `(batch_size, num_domains)`):
        Logits for each domain (e.g. NYU and KITTI) in case multiple metric heads are used.
    """

    # NOTE: the field order below is kept exactly as declared; every field is optional and defaults to `None`.

    # Only populated when `labels` is provided (see the docstring entry above).
    loss: torch.FloatTensor | None = None
    # Depth prediction of the model.
    # NOTE(review): the exact shape is not established in this part of the file - confirm against the model forward.
    predicted_depth: torch.FloatTensor | None = None
    # ZoeDepth-specific addition: one logit per domain (see the docstring entry above).
    domain_logits: torch.FloatTensor | None = None
    # Presumably the usual optional per-layer outputs, as in `DepthEstimatorOutput` - TODO confirm.
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
class ZoeDepthReassembleStage(nn.Module):
    """
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type` (`"project"`: concatenate and project, `"add"`: add to every patch token, any other
       value: drop the readout token).
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.readout_type = config.readout_type
        self.layers = nn.ModuleList()

        for neck_hidden_size, factor in zip(config.neck_hidden_sizes, config.reassemble_factors):
            self.layers.append(ZoeDepthReassembleLayer(config, channels=neck_hidden_size, factor=factor))

        if config.readout_type == "project":
            # one (Linear + activation) projection per stage, fed with [patch token ; readout token]
            self.readout_projects = nn.ModuleList()
            hidden_size = config.backbone_hidden_size
            for _ in config.neck_hidden_sizes:
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )

    def forward(self, hidden_states: list[torch.Tensor], patch_height, patch_width) -> list[torch.Tensor]:
        """
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone. The first token of every sequence is the readout token.
            patch_height (`int`):
                Number of patches along the height; `patch_height * patch_width` must equal `sequence_length`.
            patch_width (`int`):
                Number of patches along the width.

        Returns:
            `list[torch.Tensor]`: one feature map per stage, each produced by the matching reassemble layer.
        """
        batch_size = hidden_states[0].shape[0]

        # stack along batch dimension
        # shape (batch_size*num_stages, sequence_length + 1, hidden_size)
        hidden_states = torch.cat(hidden_states, dim=0)

        cls_token, hidden_states = hidden_states[:, 0], hidden_states[:, 1:]

        # reshape hidden_states to (batch_size*num_stages, num_channels, height, width)
        total_batch_size, _, num_channels = hidden_states.shape
        hidden_states = hidden_states.reshape(total_batch_size, patch_height, patch_width, num_channels)
        hidden_states = hidden_states.permute(0, 3, 1, 2).contiguous()

        if self.readout_type == "project":
            # reshape to (batch_size*num_stages, height*width, num_channels)
            hidden_states = hidden_states.flatten(2).permute((0, 2, 1))
            readout = cls_token.unsqueeze(dim=1).expand_as(hidden_states)
            # concatenate the readout token to the hidden states
            # to get (batch_size*num_stages, height*width, 2*num_channels)
            hidden_states = torch.cat((hidden_states, readout), -1)
        elif self.readout_type == "add":
            # cls_token is (batch_size*num_stages, num_channels): add two trailing singleton axes so that it
            # broadcasts over height and width of the (batch, channels, height, width) feature map
            hidden_states = hidden_states + cls_token[:, :, None, None]

        out = []
        for stage_idx, hidden_state in enumerate(hidden_states.split(batch_size, dim=0)):
            if self.readout_type == "project":
                hidden_state = self.readout_projects[stage_idx](hidden_state)
                # only this path is still token-shaped:
                # reshape back to (batch_size, num_channels, height, width)
                hidden_state = hidden_state.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width)

            hidden_state = self.layers[stage_idx](hidden_state)
            out.append(hidden_state)

        return out
class ZoeDepthReassembleLayer(nn.Module):
    """
    Projects a backbone feature map to `channels` channels with a 1x1 convolution and then rescales it spatially.

    Args:
        config:
            Model configuration; only `config.backbone_hidden_size` (input channel count) is read here.
        channels (`int`):
            Number of output channels.
        factor (`int` or `float`):
            Spatial scaling factor: `> 1` upsamples with a transposed convolution, `1` keeps the resolution and
            `< 1` downsamples with a strided 3x3 convolution.
    """

    def __init__(self, config, channels, factor):
        super().__init__()

        # channel projection (registered first so parameter order is projection -> resize)
        self.projection = nn.Conv2d(
            in_channels=config.backbone_hidden_size,
            out_channels=channels,
            kernel_size=1,
        )

        # spatial resampling, chosen from the scaling factor
        if factor == 1:
            # resolution already matches
            self.resize = nn.Identity()
        elif factor > 1:
            # learned upsampling by exactly `factor`
            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
        elif factor < 1:
            # learned downsampling: the stride is the inverse of the factor
            downsampling_stride = int(1 / factor)
            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=downsampling_stride, padding=1)

    # Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward with DPT->ZoeDepth
    def forward(self, hidden_state):
        hidden_state = self.projection(hidden_state)
        hidden_state = self.resize(hidden_state)

        return hidden_state
- # Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->ZoeDepth
class ZoeDepthFeatureFusionStage(nn.Module):
    """
    Chain of `ZoeDepthFeatureFusionLayer`s that merges a list of feature maps, starting from the last one.

    One fusion layer is created per entry of `config.neck_hidden_sizes`.

    NOTE: this class is kept textually in sync with its DPT counterpart (see the "Copied from" marker above it), so
    only documentation is touched here.
    """

    def __init__(self, config: ZoeDepthConfig):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(len(config.neck_hidden_sizes)):
            self.layers.append(ZoeDepthFeatureFusionLayer(config))

    def forward(self, hidden_states):
        """
        Args:
            hidden_states (sequence of `torch.Tensor`):
                Feature maps to fuse. The sequence is reversed internally, so the last entry is processed first.

        Returns:
            `list[torch.Tensor]`: the output of every fusion layer, in the order the layers were applied (the
            first element is computed from the last input only).
        """
        # reversing the hidden_states, we start from the last
        hidden_states = hidden_states[::-1]

        fused_hidden_states = []
        # running fused map; `None` marks "no layer has been applied yet"
        fused_hidden_state = None

        for hidden_state, layer in zip(hidden_states, self.layers):
            if fused_hidden_state is None:
                # first layer only uses the last hidden_state
                fused_hidden_state = layer(hidden_state)
            else:
                # later layers merge the running fused map with the next feature map (passed as the residual)
                fused_hidden_state = layer(fused_hidden_state, hidden_state)
            fused_hidden_states.append(fused_hidden_state)

        return fused_hidden_states
- # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer with DPT->ZoeDepth
class ZoeDepthPreActResidualLayer(nn.Module):
    """
    ResidualConvUnit, pre-activate residual unit.

    Applies (ReLU -> 3x3 conv -> optional batch norm) twice and adds the input back to the result. Both
    convolutions keep the channel count at `config.fusion_hidden_size` and, with stride 1 and padding 1, keep the
    spatial size.

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
    """

    # Ignore copy
    def __init__(self, config):
        super().__init__()

        self.use_batch_norm = config.use_batch_norm_in_fusion_residual
        # When the config leaves the bias setting unset (`None`), use a bias exactly when batch norm is disabled.
        use_bias_in_fusion_residual = (
            config.use_bias_in_fusion_residual
            if config.use_bias_in_fusion_residual is not None
            else not self.use_batch_norm
        )

        self.activation1 = nn.ReLU()
        self.convolution1 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        self.activation2 = nn.ReLU()
        self.convolution2 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        # the batch norm modules only exist when batch norm is enabled
        if self.use_batch_norm:
            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps)
            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_state (`torch.Tensor`):
                Input feature map with `config.fusion_hidden_size` channels.

        Returns:
            `torch.Tensor`: the transformed feature map plus the unmodified input (same shape as the input).
        """
        # keep the input for the skip connection
        residual = hidden_state
        hidden_state = self.activation1(hidden_state)

        hidden_state = self.convolution1(hidden_state)

        if self.use_batch_norm:
            hidden_state = self.batch_norm1(hidden_state)

        hidden_state = self.activation2(hidden_state)
        hidden_state = self.convolution2(hidden_state)

        if self.use_batch_norm:
            hidden_state = self.batch_norm2(hidden_state)

        return hidden_state + residual
- # Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer with DPT->ZoeDepth
class ZoeDepthFeatureFusionLayer(nn.Module):
    """Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    """

    def __init__(self, config: ZoeDepthConfig, align_corners: bool = True):
        super().__init__()

        self.align_corners = align_corners

        # 1x1 convolution applied after the x2 upsampling in `forward`
        self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)

        self.residual_layer1 = ZoeDepthPreActResidualLayer(config)
        self.residual_layer2 = ZoeDepthPreActResidualLayer(config)

    def forward(self, hidden_state: torch.Tensor, residual: torch.Tensor | None = None) -> torch.Tensor:
        """
        Args:
            hidden_state (`torch.Tensor`):
                Feature map to refine, indexed as `(batch, channels, height, width)` below.
            residual (`torch.Tensor`, *optional*):
                Second feature map to merge in. It is resized to the spatial size of `hidden_state` when the shapes
                differ, passed through the first residual unit and added to `hidden_state`.

        Returns:
            `torch.Tensor`: the fused feature map, upsampled by a factor of 2 and projected with a 1x1 convolution.
        """
        if residual is not None:
            if hidden_state.shape != residual.shape:
                # NOTE: this resize always uses `align_corners=False`; `self.align_corners` only applies to the
                # x2 upsampling further down
                residual = nn.functional.interpolate(
                    residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
                )
            hidden_state = hidden_state + self.residual_layer1(residual)

        hidden_state = self.residual_layer2(hidden_state)
        hidden_state = nn.functional.interpolate(
            hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners
        )
        hidden_state = self.projection(hidden_state)

        return hidden_state
class ZoeDepthNeck(nn.Module):
    """
    ZoeDepthNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of
    tensors as input and produces the fused feature maps plus the last pre-fusion feature map as output. For
    ZoeDepth, it includes 2 stages:

    * ZoeDepthReassembleStage
    * ZoeDepthFeatureFusionStage.

    Args:
        config (`ZoeDepthConfig`):
            Model configuration class defining the model architecture.
    """

    # Copied from transformers.models.dpt.modeling_dpt.DPTNeck.__init__ with DPT->ZoeDepth
    def __init__(self, config: ZoeDepthConfig):
        super().__init__()
        self.config = config

        # postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT)
        if config.backbone_config is not None and config.backbone_config.model_type == "swinv2":
            self.reassemble_stage = None
        else:
            self.reassemble_stage = ZoeDepthReassembleStage(config)

        self.convs = nn.ModuleList()
        for channel in config.neck_hidden_sizes:
            self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))

        # fusion
        self.fusion_stage = ZoeDepthFeatureFusionStage(config)

    def forward(
        self, hidden_states: list[torch.Tensor], patch_height, patch_width
    ) -> tuple[list[torch.Tensor], torch.Tensor]:
        """
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
            patch_height (`int`):
                Number of patches along the height, forwarded to the reassemble stage (unused when there is none).
            patch_width (`int`):
                Number of patches along the width, forwarded to the reassemble stage (unused when there is none).

        Returns:
            `tuple(list[torch.Tensor], torch.Tensor)`: the outputs of the fusion stage, and the last feature map
            produced by the per-stage convolutions (i.e. before fusion).

        Raises:
            TypeError: if `hidden_states` is neither a tuple nor a list.
            ValueError: if the number of hidden states differs from `len(config.neck_hidden_sizes)`.
        """
        if not isinstance(hidden_states, (tuple, list)):
            raise TypeError("hidden_states should be a tuple or list of tensors")

        if len(hidden_states) != len(self.config.neck_hidden_sizes):
            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

        # postprocess hidden states
        if self.reassemble_stage is not None:
            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)

        # bring every stage to `fusion_hidden_size` channels; there is exactly one conv per stage
        features = [conv(feature) for conv, feature in zip(self.convs, hidden_states)]

        # fusion blocks
        output = self.fusion_stage(features)

        return output, features[-1]
class ZoeDepthRelativeDepthEstimationHead(nn.Module):
    """
    Relative depth estimation head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in DPT's paper's
    supplementary material).

    Args:
        config:
            Model configuration. Reads `head_in_index`, `add_projection`, `fusion_hidden_size` and
            `num_relative_features`.
    """

    def __init__(self, config):
        super().__init__()

        self.head_in_index = config.head_in_index
        features = config.fusion_hidden_size

        self.projection = None
        if config.add_projection:
            # The projection runs on the selected fused feature map, so it must use the fusion width. This is
            # identical to the previously hard-coded 256 channels whenever `fusion_hidden_size == 256`.
            self.projection = nn.Conv2d(features, features, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

        self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1)
        self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
        self.conv2 = nn.Conv2d(features // 2, config.num_relative_features, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(config.num_relative_features, 1, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_states: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            hidden_states (`list[torch.Tensor]`):
                Feature maps; only the entry at index `head_in_index` is used.

        Returns:
            `tuple(torch.Tensor, torch.Tensor)`: the predicted relative depth with the channel dimension squeezed
            out, and the intermediate features taken after the second convolution + ReLU.
        """
        # use last features
        hidden_states = hidden_states[self.head_in_index]

        if self.projection is not None:
            hidden_states = self.projection(hidden_states)
            hidden_states = nn.functional.relu(hidden_states)

        hidden_states = self.conv1(hidden_states)
        hidden_states = self.upsample(hidden_states)
        hidden_states = self.conv2(hidden_states)
        hidden_states = nn.functional.relu(hidden_states)
        # we need the features here (after second conv + ReLu)
        features = hidden_states
        hidden_states = self.conv3(hidden_states)
        # the final ReLU keeps the predicted depth non-negative
        hidden_states = nn.functional.relu(hidden_states)

        predicted_depth = hidden_states.squeeze(dim=1)

        return predicted_depth, features
def log_binom(n, k, eps=1e-7):
    """Approximate log(nCk) as `n*log(n) - k*log(k) - (n-k)*log(n-k)` (leading Stirling terms).

    `eps` is added to `n`, to `k` and inside the last logarithm so that no logarithm is evaluated at zero.
    """
    shifted_n = n + eps
    shifted_k = k + eps
    remainder = shifted_n - shifted_k

    n_term = shifted_n * torch.log(shifted_n)
    k_term = shifted_k * torch.log(shifted_k)
    remainder_term = remainder * torch.log(remainder + eps)
    return n_term - k_term - remainder_term
class LogBinomialSoftmax(nn.Module):
    def __init__(self, n_classes=256, act=torch.softmax):
        """Turn a per-pixel probability into a distribution over `n_classes` bins via a log binomial.

        Args:
            n_classes (`int`, *optional*, defaults to 256):
                Number of output classes.
            act (`Callable`, *optional*, defaults to `torch.softmax`):
                Normalisation applied to the log binomial scores; it is called as `act(scores, dim=1)`.
        """
        super().__init__()
        self.k = n_classes
        self.act = act

        # Non-persistent buffers, shaped to broadcast against (batch_size, 1, height, width) inputs:
        # the class indices 0..n_classes-1 and the number of trials n_classes-1.
        class_indices = torch.arange(0, n_classes).view(1, -1, 1, 1)
        num_trials = torch.tensor([self.k - 1]).view(1, -1, 1, 1)
        self.register_buffer("k_idx", class_indices, persistent=False)
        self.register_buffer("k_minus_1", num_trials, persistent=False)

    def forward(self, probabilities, temperature=1.0, eps=1e-4):
        """Compute the log binomial distribution for probabilities.

        Args:
            probabilities (`torch.Tensor`):
                Per-pixel probability. A 3-dimensional input gets a singleton dimension inserted at position 1
                before it is broadcast against the class indices.
            temperature (`float` or `torch.Tensor`, *optional*, defaults to 1):
                Temperature the scores are divided by before `act` is applied.
            eps (`float`, *optional*, defaults to 1e-4):
                Lower clamp for `probabilities` and `1 - probabilities`, for numerical stability.

        Returns:
            `torch.Tensor`: `act(logbinomial(p) / temperature, dim=1)`, with dimension 1 indexing the classes.
        """
        if probabilities.ndim == 3:
            # insert the dimension that is broadcast over the classes
            probabilities = probabilities.unsqueeze(1)

        # clamp both p and (1 - p) away from zero before taking logarithms
        complement = torch.clamp(1 - probabilities, eps, 1)
        probabilities = torch.clamp(probabilities, eps, 1)

        # log C(K-1, k) + k * log(p) + (K-1-k) * log(1-p)
        log_coefficient = log_binom(self.k_minus_1, self.k_idx)
        success_term = self.k_idx * torch.log(probabilities)
        failure_term = (self.k_minus_1 - self.k_idx) * torch.log(complement)
        scores = log_coefficient + success_term + failure_term

        return self.act(scores / temperature, dim=1)
class ZoeDepthConditionalLogBinomialSoftmax(nn.Module):
    def __init__(
        self,
        config,
        in_features,
        condition_dim,
        n_classes=256,
        bottleneck_factor=2,
    ):
        """Per-pixel MLP followed by a Conditional Log Binomial softmax.

        Args:
            config:
                Model configuration; `max_temp` and `min_temp` are read from it.
            in_features (`int`):
                Number of input channels in the main feature.
            condition_dim (`int`):
                Number of input channels in the condition feature.
            n_classes (`int`, *optional*, defaults to 256):
                Number of classes.
            bottleneck_factor (`int`, *optional*, defaults to 2):
                Hidden dim factor: the hidden width is `(in_features + condition_dim) // bottleneck_factor`.
        """
        super().__init__()

        concatenated_channels = in_features + condition_dim
        hidden_channels = concatenated_channels // bottleneck_factor

        # 1x1 convolutions act as a per-pixel MLP; Softplus keeps all four outputs positive
        self.mlp = nn.Sequential(
            nn.Conv2d(concatenated_channels, hidden_channels, kernel_size=1, stride=1, padding=0),
            nn.GELU(),
            # 2 for probabilities linear norm, 2 for temperature linear norm
            nn.Conv2d(hidden_channels, 2 + 2, kernel_size=1, stride=1, padding=0),
            nn.Softplus(),
        )

        self.p_eps = 1e-4
        self.max_temp = config.max_temp
        self.min_temp = config.min_temp

        self.log_binomial_transform = LogBinomialSoftmax(n_classes, act=torch.softmax)

    def forward(self, main_feature, condition_feature):
        """
        Args:
            main_feature (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Main feature.
            condition_feature (torch.Tensor of shape `(batch_size, num_channels, height, width)`):
                Condition feature.

        Returns:
            `torch.Tensor`:
                Output log binomial distribution
        """
        mlp_output = self.mlp(torch.concat((main_feature, condition_feature), dim=1))

        # channels 0-1 parameterise the probability, channels 2-3 the temperature
        probability_pair = mlp_output[:, :2, ...] + self.p_eps
        temperature_pair = mlp_output[:, 2:, ...] + self.p_eps

        # linear norm: first channel over the sum of both gives a value in (0, 1)
        probabilities = probability_pair[:, 0, ...] / (probability_pair[:, 0, ...] + probability_pair[:, 1, ...])
        temperature = temperature_pair[:, 0, ...] / (temperature_pair[:, 0, ...] + temperature_pair[:, 1, ...])

        # restore the channel dimension and rescale the temperature into [min_temp, max_temp]
        temperature = temperature.unsqueeze(1)
        temperature = (self.max_temp - self.min_temp) * temperature + self.min_temp

        return self.log_binomial_transform(probabilities, temperature)
class ZoeDepthSeedBinRegressor(nn.Module):
    def __init__(self, config, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
        """Bin center regressor network.

        Can be "normed" or "unnormed". If "normed", bin centers are bounded on the (min_depth, max_depth) interval.

        Args:
            config:
                Model configuration; `bottleneck_features` and `bin_centers_type` are read from it.
            n_bins (`int`, *optional*, defaults to 16):
                Number of bin centers.
            mlp_dim (`int`, *optional*, defaults to 256):
                Hidden dimension.
            min_depth (`float`, *optional*, defaults to 1e-3):
                Min depth value.
            max_depth (`float`, *optional*, defaults to 10):
                Max depth value.
        """
        super().__init__()

        self.in_features = config.bottleneck_features
        self.bin_centers_type = config.bin_centers_type
        self.min_depth = min_depth
        self.max_depth = max_depth

        # per-pixel MLP built from 1x1 convolutions
        self.conv1 = nn.Conv2d(self.in_features, mlp_dim, kernel_size=1, stride=1, padding=0)
        self.act1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(mlp_dim, n_bins, kernel_size=1, stride=1, padding=0)
        if self.bin_centers_type == "normed":
            self.act2 = nn.ReLU(inplace=True)
        else:
            self.act2 = nn.Softplus()

    def forward(self, x):
        """
        Returns tensor of bin_width vectors (centers). One vector b for every pixel
        """
        activations = self.act2(self.conv2(self.act1(self.conv1(x))))

        if self.bin_centers_type != "normed":
            # unnormed: the activations are the bin centers; returned twice to mirror the normed signature
            return activations, activations

        # normed: turn the activations into strictly positive widths that sum to 1 over the bin dimension
        activations = activations + 1e-3
        bin_widths_normed = activations / activations.sum(dim=1, keepdim=True)

        # shape (batch_size, num_channels, height, width)
        bin_widths = (self.max_depth - self.min_depth) * bin_widths_normed
        # pad has the form (left, right, top, bottom, front, back): prepend `min_depth` along the bin dimension
        # so that the cumulative sum below starts at `min_depth`
        bin_widths = nn.functional.pad(bin_widths, (0, 0, 0, 0, 1, 0), mode="constant", value=self.min_depth)
        # shape (batch_size, num_channels, height, width)
        bin_edges = torch.cumsum(bin_widths, dim=1)

        # centers are the midpoints of consecutive edges
        lower_edges = bin_edges[:, :-1, ...]
        upper_edges = bin_edges[:, 1:, ...]
        bin_centers = 0.5 * (lower_edges + upper_edges)
        return bin_widths_normed, bin_centers
@torch.jit.script
def inv_attractor(dx, alpha: float = 300, gamma: int = 2):
    """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
    This is the default one according to the accompanying paper.

    Args:
        dx (`torch.Tensor`):
            The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
        alpha (`float`, *optional*, defaults to 300):
            Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction.
        gamma (`int`, *optional*, defaults to 2):
            Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected.
            Lower gamma = farther reach.

    Returns:
        torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc
    """
    # the shift is damped by a factor that grows with the distance raised to `gamma`
    damping = 1 + alpha * dx.pow(gamma)
    return dx / damping
class ZoeDepthAttractorLayer(nn.Module):
    def __init__(
        self,
        config,
        n_bins,
        n_attractors=16,
        min_depth=1e-3,
        max_depth=10,
        memory_efficient=False,
    ):
        """
        Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth)

        Args:
            config:
                Model configuration; `attractor_alpha`, `attractor_gamma`, `attractor_kind` and
                `bin_embedding_dim` are read from it.
            n_bins (`int`):
                Number of bins.
            n_attractors (`int`, *optional*, defaults to 16):
                Number of attractor points predicted per pixel.
            min_depth (`float`, *optional*, defaults to 1e-3):
                Min depth value.
            max_depth (`float`, *optional*, defaults to 10):
                Max depth value.
            memory_efficient (`bool`, *optional*, defaults to `False`):
                Whether to accumulate the attractor shifts one attractor at a time instead of in one broadcast.
        """
        super().__init__()
        self.alpha = config.attractor_alpha
        self.gamma = config.attractor_gamma
        # Backward-compatible alias: the attribute used to be (mis)spelled `gemma`.
        self.gemma = self.gamma
        self.kind = config.attractor_kind

        self.n_attractors = n_attractors
        self.n_bins = n_bins
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.memory_efficient = memory_efficient

        # MLP to predict attractor points
        in_features = mlp_dim = config.bin_embedding_dim
        self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0)
        self.act1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(mlp_dim, n_attractors * 2, 1, 1, 0)  # x2 for linear norm
        self.act2 = nn.ReLU(inplace=True)

    def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True):
        """
        The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
        and the attractor points (the latter are predicted by the MLP).

        Args:
            x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Feature block.
            prev_bin (`torch.Tensor` of shape `(batch_size, prev_number_of_bins, height, width)`):
                Previous bin centers normed.
            prev_bin_embedding (`torch.Tensor`, *optional*):
                Optional previous bin embeddings.
            interpolate (`bool`, *optional*, defaults to `True`):
                Whether to interpolate the previous bin embeddings to the size of the input features.

        Returns:
            `tuple[`torch.Tensor`, `torch.Tensor`]:
                New bin centers normed and scaled.
        """
        if prev_bin_embedding is not None:
            if interpolate:
                prev_bin_embedding = nn.functional.interpolate(
                    prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True
                )
            x = x + prev_bin_embedding

        x = self.conv1(x)
        x = self.act1(x)
        x = self.conv2(x)
        attractors = self.act2(x)

        # keep the attractors strictly positive
        attractors = attractors + 1e-3
        batch_size, _, height, width = attractors.shape
        attractors = attractors.view(batch_size, self.n_attractors, 2, height, width)
        # batch_size, num_attractors, 2, height, width

        # note: original repo had a bug here: https://github.com/isl-org/ZoeDepth/blame/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/layers/attractor.py#L105C9-L106C50
        # we include the bug to maintain compatibility with the weights
        attractors_normed = attractors[:, :, 0, ...]  # batch_size, num_attractors, height, width

        bin_centers = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True)

        # note: only the inverse attractor (`inv_attractor`) is implemented here, since no checkpoints were
        # released with other attractor types.
        # NOTE: `inv_attractor` is called with its own default strength parameters; `self.alpha` / `self.gamma`
        # are stored on the module but not forwarded, so passing them would change the numerical output.
        if not self.memory_efficient:
            func = {"mean": torch.mean, "sum": torch.sum}[self.kind]
            # shape (batch_size, num_bins, height, width)
            delta_c = func(inv_attractor(attractors_normed.unsqueeze(2) - bin_centers.unsqueeze(1)), dim=1)
        else:
            # accumulate one attractor at a time to avoid the (batch, attractors, bins, height, width) tensor
            delta_c = torch.zeros_like(bin_centers)
            for i in range(self.n_attractors):
                # shape (batch_size, num_bins, height, width)
                delta_c += inv_attractor(attractors_normed[:, i, ...].unsqueeze(1) - bin_centers)

            if self.kind == "mean":
                delta_c = delta_c / self.n_attractors

        bin_new_centers = bin_centers + delta_c
        # scale the normed centers to the depth range, then enforce ordering and bounds
        bin_centers = (self.max_depth - self.min_depth) * bin_new_centers + self.min_depth
        bin_centers, _ = torch.sort(bin_centers, dim=1)
        bin_centers = torch.clip(bin_centers, self.min_depth, self.max_depth)
        return bin_new_centers, bin_centers
class ZoeDepthAttractorLayerUnnormed(nn.Module):
    def __init__(
        self,
        config,
        n_bins,
        n_attractors=16,
        min_depth=1e-3,
        max_depth=10,
        memory_efficient=True,
    ):
        """
        Attractor layer for bin centers. Bin centers are unbounded

        Args:
            config:
                Model configuration; `attractor_alpha`, `attractor_gamma`, `attractor_kind` and
                `bin_embedding_dim` are read from it.
            n_bins (`int`):
                Number of bins.
            n_attractors (`int`, *optional*, defaults to 16):
                Number of attractor points predicted per pixel.
            min_depth (`float`, *optional*, defaults to 1e-3):
                Min depth value (stored, not used to bound the centers).
            max_depth (`float`, *optional*, defaults to 10):
                Max depth value (stored, not used to bound the centers).
            memory_efficient (`bool`, *optional*, defaults to `True`):
                Whether to accumulate the attractor shifts one attractor at a time instead of in one broadcast.
        """
        super().__init__()

        self.n_attractors = n_attractors
        self.n_bins = n_bins
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.alpha = config.attractor_alpha
        # read gamma from its own config field (it previously copied `attractor_alpha` by mistake)
        self.gamma = config.attractor_gamma
        self.kind = config.attractor_kind
        self.memory_efficient = memory_efficient

        # MLP to predict attractor points
        in_features = mlp_dim = config.bin_embedding_dim
        self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0)
        self.act1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0)
        self.act2 = nn.Softplus()

    def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True):
        """
        The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
        and the attractor points (the latter are predicted by the MLP).

        Args:
            x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Feature block.
            prev_bin (`torch.Tensor` of shape `(batch_size, prev_num_bins, height, width)`):
                Previous bin centers normed.
            prev_bin_embedding (`torch.Tensor`, *optional*):
                Optional previous bin embeddings.
            interpolate (`bool`, *optional*, defaults to `True`):
                Whether to interpolate the previous bin embeddings to the size of the input features.

        Returns:
            `tuple[`torch.Tensor`, `torch.Tensor`]:
                New bin centers unbounded. Two outputs just to keep the API consistent with the normed version.
        """
        if prev_bin_embedding is not None:
            if interpolate:
                prev_bin_embedding = nn.functional.interpolate(
                    prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True
                )
            x = x + prev_bin_embedding

        x = self.conv1(x)
        x = self.act1(x)
        x = self.conv2(x)
        attractors = self.act2(x)

        height, width = attractors.shape[-2:]

        bin_centers = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True)

        # NOTE: `inv_attractor` is called with its own default strength parameters; `self.alpha` / `self.gamma`
        # are stored on the module but not forwarded, so passing them would change the numerical output.
        if not self.memory_efficient:
            func = {"mean": torch.mean, "sum": torch.sum}[self.kind]
            # shape batch_size, num_bins, height, width
            delta_c = func(inv_attractor(attractors.unsqueeze(2) - bin_centers.unsqueeze(1)), dim=1)
        else:
            # accumulate one attractor at a time to avoid the (batch, attractors, bins, height, width) tensor
            delta_c = torch.zeros_like(bin_centers)
            for i in range(self.n_attractors):
                # shape batch_size, num_bins, height, width
                delta_c += inv_attractor(attractors[:, i, ...].unsqueeze(1) - bin_centers)

            if self.kind == "mean":
                delta_c = delta_c / self.n_attractors

        bin_new_centers = bin_centers + delta_c
        # unbounded variant: no rescaling, sorting or clipping, so both outputs are the same tensor
        bin_centers = bin_new_centers

        return bin_new_centers, bin_centers
class ZoeDepthProjector(nn.Module):
    def __init__(self, in_features, out_features, mlp_dim=128):
        """Projector MLP: two 1x1 convolutions with a ReLU in between, applied independently at every pixel.

        Args:
            in_features (`int`):
                Number of input channels.
            out_features (`int`):
                Number of output channels.
            mlp_dim (`int`, *optional*, defaults to 128):
                Hidden dimension.
        """
        super().__init__()

        self.conv1 = nn.Conv2d(in_features, mlp_dim, kernel_size=1, stride=1, padding=0)
        self.act = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(mlp_dim, out_features, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Map `hidden_state` from `in_features` to `out_features` channels; the spatial size is unchanged."""
        hidden = self.act(self.conv1(hidden_state))
        return self.conv2(hidden)
- # Copied from transformers.models.grounding_dino.modeling_grounding_dino.GroundingDinoMultiheadAttention with GroundingDino->ZoeDepth
- class ZoeDepthMultiheadAttention(nn.Module):
- """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`."""
- # Ignore copy
- def __init__(self, hidden_size, num_attention_heads, dropout):
- super().__init__()
- if hidden_size % num_attention_heads != 0:
- raise ValueError(
- f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
- f"heads ({num_attention_heads})"
- )
- self.num_attention_heads = num_attention_heads
- self.attention_head_size = int(hidden_size / num_attention_heads)
- self.all_head_size = self.num_attention_heads * self.attention_head_size
- self.query = nn.Linear(hidden_size, self.all_head_size)
- self.key = nn.Linear(hidden_size, self.all_head_size)
- self.value = nn.Linear(hidden_size, self.all_head_size)
- self.out_proj = nn.Linear(hidden_size, hidden_size)
- self.dropout = nn.Dropout(dropout)
- def forward(
- self,
- queries: torch.Tensor,
- keys: torch.Tensor,
- values: torch.Tensor,
- attention_mask: torch.FloatTensor | None = None,
- output_attentions: bool | None = False,
- ) -> tuple[torch.Tensor]:
- batch_size, seq_length, _ = queries.shape
- query_layer = (
- self.query(queries)
- .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
- .transpose(1, 2)
- )
- key_layer = (
- self.key(keys).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
- )
- value_layer = (
- self.value(values).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
- )
- # Take the dot product between "query" and "key" to get the raw attention scores.
- attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
- attention_scores = attention_scores / math.sqrt(self.attention_head_size)
- if attention_mask is not None:
- # Apply the attention mask is (precomputed for all layers in ZoeDepthModel forward() function)
- attention_scores = attention_scores + attention_mask
- # Normalize the attention scores to probabilities.
- attention_probs = nn.functional.softmax(attention_scores, dim=-1)
- # This is actually dropping out entire tokens to attend to, which might
- # seem a bit unusual, but is taken from the original Transformer paper.
- attention_probs = self.dropout(attention_probs)
- context_layer = torch.matmul(attention_probs, value_layer)
- context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
- new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
- context_layer = context_layer.view(new_context_layer_shape)
- context_layer = self.out_proj(context_layer)
- outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
- return outputs
- class ZoeDepthTransformerEncoderLayer(nn.Module):
- def __init__(self, config, dropout=0.1, activation="relu"):
- super().__init__()
- hidden_size = config.patch_transformer_hidden_size
- intermediate_size = config.patch_transformer_intermediate_size
- num_attention_heads = config.patch_transformer_num_attention_heads
- self.self_attn = ZoeDepthMultiheadAttention(hidden_size, num_attention_heads, dropout=dropout)
- self.linear1 = nn.Linear(hidden_size, intermediate_size)
- self.dropout = nn.Dropout(dropout)
- self.linear2 = nn.Linear(intermediate_size, hidden_size)
- self.norm1 = nn.LayerNorm(hidden_size)
- self.norm2 = nn.LayerNorm(hidden_size)
- self.dropout1 = nn.Dropout(dropout)
- self.dropout2 = nn.Dropout(dropout)
- self.activation = ACT2FN[activation]
- def forward(
- self,
- src,
- src_mask: torch.Tensor | None = None,
- ):
- queries = keys = src
- src2 = self.self_attn(queries=queries, keys=keys, values=src, attention_mask=src_mask)[0]
- src = src + self.dropout1(src2)
- src = self.norm1(src)
- src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
- src = src + self.dropout2(src2)
- src = self.norm2(src)
- return src
class ZoeDepthPatchTransformerEncoder(nn.Module):
    def __init__(self, config):
        """ViT-like transformer block

        Args:
            config (`ZoeDepthConfig`):
                Model configuration class defining the model architecture.
        """
        super().__init__()

        in_channels = config.bottleneck_features

        self.transformer_encoder = nn.ModuleList(
            [ZoeDepthTransformerEncoderLayer(config) for _ in range(config.num_patch_transformer_layers)]
        )

        # 1x1 convolution projecting every spatial location to the transformer hidden size
        self.embedding_convPxP = nn.Conv2d(
            in_channels, config.patch_transformer_hidden_size, kernel_size=1, stride=1, padding=0
        )

    def positional_encoding_1d(self, batch_size, sequence_length, embedding_dim, device="cpu", dtype=torch.float32):
        """Generate sinusoidal positional encodings.

        The sine terms fill the first half of the embedding dimension and the cosine terms the second half
        (they are concatenated, not interleaved).

        Args:
            batch_size (int): Number of times the encoding is repeated along the leading dimension.
            sequence_length (int): Sequence length
            embedding_dim (int): Embedding dimension
            device: Device on which the encoding is created.
            dtype: Data type of the position and frequency index tensors.

        Returns:
            torch.Tensor: Positional encodings, repeated `batch_size` times along the first dimension.
        """
        position = torch.arange(0, sequence_length, dtype=dtype, device=device).unsqueeze(1)
        index = torch.arange(0, embedding_dim, 2, dtype=dtype, device=device).unsqueeze(0)
        div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim))
        pos_encoding = position * div_term
        pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1)
        pos_encoding = pos_encoding.unsqueeze(dim=0).repeat(batch_size, 1, 1)
        return pos_encoding

    def forward(self, x):
        """Forward pass

        Args:
            x (torch.Tensor - NCHW): Input feature tensor

        Returns:
            torch.Tensor - Transformer output embeddings of shape (batch_size, sequence_length, embedding_dim)
        """
        embeddings = self.embedding_convPxP(x).flatten(2)  # shape (batch_size, num_channels, sequence_length)
        # add an extra special CLS token at the start for global accumulation
        embeddings = nn.functional.pad(embeddings, (1, 0))

        embeddings = embeddings.permute(0, 2, 1)
        batch_size, sequence_length, embedding_dim = embeddings.shape
        embeddings = embeddings + self.positional_encoding_1d(
            batch_size, sequence_length, embedding_dim, device=embeddings.device, dtype=embeddings.dtype
        )

        # Run every configured layer. The count comes from `config.num_patch_transformer_layers`, so it
        # must not be hard-coded here: fewer layers would raise IndexError, extra layers would be skipped.
        for layer in self.transformer_encoder:
            embeddings = layer(embeddings)

        return embeddings
class ZoeDepthMLPClassifier(nn.Module):
    """Two-layer MLP classifier whose hidden width equals its input width."""

    def __init__(self, in_features, out_features) -> None:
        super().__init__()

        # Layers: in_features -> in_features -> out_features, with a ReLU in between.
        self.linear1 = nn.Linear(in_features, in_features)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(in_features, out_features)

    def forward(self, hidden_state):
        # Returns unnormalized logits; no softmax is applied here.
        return self.linear2(self.activation(self.linear1(hidden_state)))
class ZoeDepthMultipleMetricDepthEstimationHeads(nn.Module):
    """
    Multiple metric depth estimation heads. A MLP classifier is used to route between 2 different heads.
    """

    def __init__(self, config):
        """
        Args:
            config (`ZoeDepthConfig`):
                Model configuration. Its `bin_configurations` entries provide the `name`, `n_bins`,
                `min_depth` and `max_depth` of each head.

        Raises:
            ValueError: If `config.bin_centers_type` is neither `"normed"` nor `"softplus"`.
        """
        super().__init__()

        bin_embedding_dim = config.bin_embedding_dim
        n_attractors = config.num_attractors
        self.bin_configurations = config.bin_configurations
        self.bin_centers_type = config.bin_centers_type

        # Resolve the attractor implementation first, so an unsupported type fails with a clear
        # message instead of leaving `Attractor` unbound further down.
        if self.bin_centers_type == "normed":
            Attractor = ZoeDepthAttractorLayer
        elif self.bin_centers_type == "softplus":
            Attractor = ZoeDepthAttractorLayerUnnormed
        else:
            raise ValueError(
                f"Unsupported bin_centers_type {self.bin_centers_type!r}; expected 'normed' or 'softplus'"
            )

        # Bottleneck convolution
        bottleneck_features = config.bottleneck_features
        self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0)

        # Transformer classifier on the bottleneck
        self.patch_transformer = ZoeDepthPatchTransformerEncoder(config)
        # MLP classifier. Its input is the CLS embedding produced by the patch transformer, so its
        # width has to follow the transformer hidden size rather than a fixed constant.
        self.mlp_classifier = ZoeDepthMLPClassifier(in_features=config.patch_transformer_hidden_size, out_features=2)

        # We have bins for each bin configuration
        # Create a map (ModuleDict) of 'name' -> seed_bin_regressor
        self.seed_bin_regressors = nn.ModuleDict(
            {
                conf["name"]: ZoeDepthSeedBinRegressor(
                    config,
                    n_bins=conf["n_bins"],
                    mlp_dim=bin_embedding_dim // 2,
                    min_depth=conf["min_depth"],
                    max_depth=conf["max_depth"],
                )
                for conf in config.bin_configurations
            }
        )

        self.seed_projector = ZoeDepthProjector(
            in_features=bottleneck_features, out_features=bin_embedding_dim, mlp_dim=bin_embedding_dim // 2
        )
        self.projectors = nn.ModuleList(
            [
                ZoeDepthProjector(
                    in_features=config.fusion_hidden_size,
                    out_features=bin_embedding_dim,
                    mlp_dim=bin_embedding_dim // 2,
                )
                for _ in range(4)
            ]
        )

        # Create a map (ModuleDict) of 'name' -> attractors (ModuleList)
        # NOTE(review): the per-level attractor count is passed as `n_bins` here, unlike the single-head
        # variant which passes it as `n_attractors`. Left untouched because changing it would alter the
        # parameter shapes - confirm against the attractor layer signature and the released checkpoints.
        self.attractors = nn.ModuleDict(
            {
                configuration["name"]: nn.ModuleList(
                    [
                        Attractor(
                            config,
                            n_bins=n_attractors[i],
                            min_depth=configuration["min_depth"],
                            max_depth=configuration["max_depth"],
                        )
                        for i in range(len(n_attractors))
                    ]
                )
                for configuration in config.bin_configurations
            }
        )

        last_in = config.num_relative_features
        # conditional log binomial for each bin configuration
        self.conditional_log_binomial = nn.ModuleDict(
            {
                configuration["name"]: ZoeDepthConditionalLogBinomialSoftmax(
                    config,
                    last_in,
                    bin_embedding_dim,
                    configuration["n_bins"],
                    bottleneck_factor=4,
                )
                for configuration in config.bin_configurations
            }
        )

    def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth):
        """Route the batch to one bin configuration and predict metric depth with its modules.

        Args:
            outconv_activation: Feature map conditioning the final per-pixel bin probabilities.
            bottleneck: Bottleneck feature map fed to the routing classifier and the seed bin regressor.
            feature_blocks: Iterable of feature maps, consumed in lockstep with the projectors/attractors.
            relative_depth: Unused by this head; accepted so both metric heads share one call signature.

        Returns:
            Tuple `(out, domain_logits)`: the depth map (sum of bin probabilities times bin centers, with
            a singleton channel dimension) and the routing logits of shape (batch_size, 2).
        """
        x = self.conv2(bottleneck)

        # Predict which path to take
        # Embedding is of shape (batch_size, hidden_size)
        embedding = self.patch_transformer(x)[:, 0, :]

        # MLP classifier to get logits of shape (batch_size, 2)
        domain_logits = self.mlp_classifier(embedding)
        # Logits are summed over the batch, so a single configuration is chosen for the whole batch.
        domain_vote = torch.softmax(domain_logits.sum(dim=0, keepdim=True), dim=-1)

        # Get the path
        names = [configuration["name"] for configuration in self.bin_configurations]
        bin_configurations_name = names[torch.argmax(domain_vote, dim=-1).squeeze().item()]

        # First configuration carrying the selected name
        conf = next(
            (
                configuration
                for configuration in self.bin_configurations
                if configuration["name"] == bin_configurations_name
            ),
            None,
        )
        if conf is None:
            raise ValueError(f"bin_configurations_name {bin_configurations_name} not found in bin_configurationss")

        min_depth = conf["min_depth"]
        max_depth = conf["max_depth"]

        seed_bin_regressor = self.seed_bin_regressors[bin_configurations_name]
        _, seed_bin_centers = seed_bin_regressor(x)
        if self.bin_centers_type in ["normed", "hybrid2"]:
            # Rescale the seed centers to [0, 1] relative to the configured depth range
            prev_bin = (seed_bin_centers - min_depth) / (max_depth - min_depth)
        else:
            prev_bin = seed_bin_centers
        prev_bin_embedding = self.seed_projector(x)

        attractors = self.attractors[bin_configurations_name]
        for projector, attractor, feature in zip(self.projectors, attractors, feature_blocks):
            bin_embedding = projector(feature)
            next_bin, bin_centers = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True)
            prev_bin = next_bin
            prev_bin_embedding = bin_embedding

        last = outconv_activation

        # Bring bin centers and bin embedding to the resolution of the conditioning feature map
        bin_centers = nn.functional.interpolate(bin_centers, last.shape[-2:], mode="bilinear", align_corners=True)
        bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[-2:], mode="bilinear", align_corners=True)

        conditional_log_binomial = self.conditional_log_binomial[bin_configurations_name]
        x = conditional_log_binomial(last, bin_embedding)

        # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
        out = torch.sum(x * bin_centers, dim=1, keepdim=True)

        return out, domain_logits
class ZoeDepthMetricDepthEstimationHead(nn.Module):
    """Single metric depth estimation head, built from the first entry of `config.bin_configurations`."""

    def __init__(self, config):
        """
        Args:
            config (`ZoeDepthConfig`):
                Model configuration. Only `bin_configurations[0]` is used for `n_bins`, `min_depth`
                and `max_depth`.

        Raises:
            ValueError: If `config.bin_centers_type` is neither `"normed"` nor `"softplus"`.
        """
        super().__init__()

        bin_configuration = config.bin_configurations[0]
        n_bins = bin_configuration["n_bins"]
        min_depth = bin_configuration["min_depth"]
        max_depth = bin_configuration["max_depth"]
        bin_embedding_dim = config.bin_embedding_dim
        n_attractors = config.num_attractors
        bin_centers_type = config.bin_centers_type

        self.min_depth = min_depth
        self.max_depth = max_depth
        self.bin_centers_type = bin_centers_type

        # Resolve the attractor implementation first, so an unsupported type fails with a clear
        # message instead of leaving `Attractor` unbound further down.
        if self.bin_centers_type == "normed":
            Attractor = ZoeDepthAttractorLayer
        elif self.bin_centers_type == "softplus":
            Attractor = ZoeDepthAttractorLayerUnnormed
        else:
            raise ValueError(
                f"Unsupported bin_centers_type {self.bin_centers_type!r}; expected 'normed' or 'softplus'"
            )

        # Bottleneck convolution
        bottleneck_features = config.bottleneck_features
        self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0)

        # Regressor and attractor
        self.seed_bin_regressor = ZoeDepthSeedBinRegressor(
            config, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth
        )
        self.seed_projector = ZoeDepthProjector(in_features=bottleneck_features, out_features=bin_embedding_dim)

        # One projector and one attractor per feature level (four levels)
        self.projectors = nn.ModuleList(
            [
                ZoeDepthProjector(in_features=config.fusion_hidden_size, out_features=bin_embedding_dim)
                for _ in range(4)
            ]
        )
        self.attractors = nn.ModuleList(
            [
                Attractor(
                    config,
                    n_bins=n_bins,
                    n_attractors=n_attractors[i],
                    min_depth=min_depth,
                    max_depth=max_depth,
                )
                for i in range(4)
            ]
        )

        last_in = config.num_relative_features + 1  # +1 for relative depth

        # use log binomial instead of softmax
        self.conditional_log_binomial = ZoeDepthConditionalLogBinomialSoftmax(
            config,
            last_in,
            bin_embedding_dim,
            n_classes=n_bins,
        )

    def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth):
        """Predict metric depth from the decoder features and the relative depth map.

        Args:
            outconv_activation: Feature map that is concatenated with the relative depth to condition
                the final per-pixel bin probabilities.
            bottleneck: Bottleneck feature map fed to the seed bin regressor and seed projector.
            feature_blocks: Iterable of feature maps, consumed in lockstep with the projectors/attractors.
            relative_depth: Relative depth map without a channel dimension (one is inserted at dim 1).

        Returns:
            Tuple `(out, None)`: the depth map (sum of bin probabilities times bin centers, with a
            singleton channel dimension); the second item is always `None` because this head has no
            routing classifier.
        """
        x = self.conv2(bottleneck)
        _, seed_bin_centers = self.seed_bin_regressor(x)

        if self.bin_centers_type in ["normed", "hybrid2"]:
            # Rescale the seed centers to [0, 1] relative to the configured depth range
            prev_bin = (seed_bin_centers - self.min_depth) / (self.max_depth - self.min_depth)
        else:
            prev_bin = seed_bin_centers

        prev_bin_embedding = self.seed_projector(x)

        # unroll this loop for better performance
        for projector, attractor, feature in zip(self.projectors, self.attractors, feature_blocks):
            bin_embedding = projector(feature)
            next_bin, bin_centers = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True)
            prev_bin = next_bin.clone()
            prev_bin_embedding = bin_embedding.clone()

        last = outconv_activation

        # concatenate relative depth with last. First interpolate relative depth to last size
        relative_conditioning = relative_depth.unsqueeze(1)
        relative_conditioning = nn.functional.interpolate(
            relative_conditioning, size=last.shape[2:], mode="bilinear", align_corners=True
        )
        last = torch.cat([last, relative_conditioning], dim=1)

        bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[-2:], mode="bilinear", align_corners=True)
        x = self.conditional_log_binomial(last, bin_embedding)

        # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor
        bin_centers = nn.functional.interpolate(bin_centers, x.shape[-2:], mode="bilinear", align_corners=True)
        out = torch.sum(x * bin_centers, dim=1, keepdim=True)

        return out, None
- # Modified from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->ZoeDepth,dpt->zoedepth
- # avoiding sdpa and flash_attn_2 support, it's done in the backend
@auto_docstring
class ZoeDepthPreTrainedModel(PreTrainedModel):
    config: ZoeDepthConfig
    base_model_prefix = "zoedepth"
    main_input_name = "pixel_values"
    input_modalities = ("image",)
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        # Default initialization first, then refill the index tensors of `LogBinomialSoftmax` modules.
        super()._init_weights(module)
        if not isinstance(module, LogBinomialSoftmax):
            return

        num_classes = module.k
        # `k_idx` receives 0..k-1 and `k_minus_1` receives k-1, both reshaped to (1, -1, 1, 1).
        init.copy_(module.k_idx, torch.arange(0, num_classes).view(1, -1, 1, 1))
        init.copy_(module.k_minus_1, torch.tensor([num_classes - 1]).view(1, -1, 1, 1))
@auto_docstring(
    custom_intro="""
    ZoeDepth model with one or multiple metric depth estimation head(s) on top.
    """
)
class ZoeDepthForDepthEstimation(ZoeDepthPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.backbone = load_backbone(config)

        # The neck needs the backbone's hidden size, and `forward` needs its patch size.
        backbone_config = self.backbone.config
        if not (hasattr(backbone_config, "hidden_size") and hasattr(backbone_config, "patch_size")):
            raise ValueError(
                "ZoeDepth assumes the backbone's config to have `hidden_size` and `patch_size` attributes"
            )
        config.backbone_hidden_size = self.backbone.config.hidden_size
        self.patch_size = self.backbone.config.patch_size

        self.neck = ZoeDepthNeck(config)
        self.relative_head = ZoeDepthRelativeDepthEstimationHead(config)

        # Several bin configurations -> routed multi-head variant, otherwise the single metric head.
        if len(config.bin_configurations) > 1:
            self.metric_head = ZoeDepthMultipleMetricDepthEstimationHeads(config)
        else:
            self.metric_head = ZoeDepthMetricDepthEstimationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple[torch.Tensor] | DepthEstimatorOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti")
        >>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     source_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```"""
        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not implemented yet")

        # Fall back to the configuration for every flag the caller left unset.
        if return_dict is None:
            return_dict = self.config.return_dict
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if output_attentions is None:
            output_attentions = self.config.output_attentions

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )
        backbone_feature_maps = outputs.feature_maps

        # Size of the patch grid derived from the input resolution
        _, _, height, width = pixel_values.shape
        patch_height, patch_width = height // self.patch_size, width // self.patch_size

        fused_features, bottleneck = self.neck(backbone_feature_maps, patch_height, patch_width)
        # Bottleneck first, followed by a copy of the neck's feature list
        head_inputs = [bottleneck] + fused_features

        relative_depth, outconv_activation = self.relative_head(fused_features)

        metric_depth, domain_logits = self.metric_head(
            outconv_activation=outconv_activation,
            bottleneck=head_inputs[0],
            feature_blocks=head_inputs[1:],
            relative_depth=relative_depth,
        )
        # Drop the singleton channel dimension
        metric_depth = metric_depth.squeeze(dim=1)

        if not return_dict:
            # Domain logits are only included when the metric head produced them.
            leading = (metric_depth,) if domain_logits is None else (metric_depth, domain_logits)
            output = leading + outputs[1:]
            return output if loss is None else (loss,) + output

        return ZoeDepthDepthEstimatorOutput(
            loss=loss,
            predicted_depth=metric_depth,
            domain_logits=domain_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
- __all__ = ["ZoeDepthForDepthEstimation", "ZoeDepthPreTrainedModel"]
|