| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361 |
- # Copyright 2023 The Meta AI Authors and The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """PyTorch SAM model."""
- import collections
- from collections.abc import Callable
- from dataclasses import dataclass
- import numpy as np
- import torch
- import torch.nn.functional as F
- from torch import Tensor, nn
- from ... import initialization as init
- from ...activations import ACT2FN
- from ...modeling_layers import GradientCheckpointingLayer
- from ...modeling_outputs import BaseModelOutput
- from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
- from ...processing_utils import Unpack
- from ...utils import ModelOutput, auto_docstring, logging
- from ...utils.generic import TransformersKwargs, merge_with_config_defaults
- from ...utils.output_capturing import OutputRecorder, capture_outputs
- from .configuration_sam import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig
# Module-level logger, named after this module via `__name__`.
logger = logging.get_logger(__name__)
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for sam vision model's outputs that also contains image embeddings obtained by applying the projection
    layer to the pooler_output.
    """
)
class SamVisionEncoderOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    # Every field defaults to `None`, so an instance can be built with any subset of them populated.
    image_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    # Tuples of tensors; presumably one entry per encoder layer -- confirm against the code that fills them.
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Segment-Anything model's output
    """
)
class SamImageSegmentationOutput(ModelOutput):
    r"""
    iou_scores (`torch.FloatTensor` of shape `(batch_size, num_masks)`):
        The iou scores of the predicted masks.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_masks, height, width)`):
        The predicted low resolutions masks. Needs to be post-processed by the processor
    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the vision model at the output of each layer plus the optional initial embedding outputs.
    vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    mask_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    # Mask-decoder results. Every field defaults to `None`.
    iou_scores: torch.FloatTensor | None = None
    pred_masks: torch.FloatTensor | None = None
    # Optional diagnostics; per the docstring above they are only returned on request.
    vision_hidden_states: tuple[torch.FloatTensor, ...] | None = None
    vision_attentions: tuple[torch.FloatTensor, ...] | None = None
    mask_decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
class SamPatchEmbeddings(nn.Module):
    """
    Patch-embedding stem.

    Projects `pixel_values` of shape `(batch_size, num_channels, height, width)` with one convolution whose kernel
    and stride both equal the patch size, then moves the channel axis last. The result has shape
    `(batch_size, height // patch_height, width // patch_width, hidden_size)`.
    """

    def __init__(self, config):
        super().__init__()

        def as_pair(value):
            # A scalar becomes (value, value); anything iterable is kept as given.
            return value if isinstance(value, collections.abc.Iterable) else (value, value)

        self.image_size = as_pair(config.image_size)
        self.patch_size = as_pair(config.patch_size)
        self.num_channels = config.num_channels

        patches_along_width = self.image_size[1] // self.patch_size[1]
        patches_along_height = self.image_size[0] // self.patch_size[0]
        self.num_patches = patches_along_width * patches_along_height

        self.projection = nn.Conv2d(
            self.num_channels,
            config.hidden_size,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

    def forward(self, pixel_values):
        """
        Embed a batch of images.

        Raises:
            ValueError: if the channel count or the spatial size differs from the configured values.
        """
        _, channels, height, width = pixel_values.shape

        if channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        size_matches = height == self.image_size[0] and width == self.image_size[1]
        if not size_matches:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )

        projected = self.projection(pixel_values)
        # (batch, hidden, rows, cols) -> (batch, rows, cols, hidden)
        return projected.permute(0, 2, 3, 1)
class SamMLPBlock(nn.Module):
    """Two-layer feed-forward block: `hidden_size -> mlp_dim -> hidden_size` with `config.hidden_act` in between."""

    def __init__(self, config):
        super().__init__()
        self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
        self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
        # The activation is looked up by name in the ACT2FN table.
        self.act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply expand -> activation -> project and return the result."""
        expanded = self.lin1(hidden_states)
        activated = self.act(expanded)
        return self.lin2(activated)
- # Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->Sam
class SamLayerNorm(nn.LayerNorm):
    r"""Layer normalization over the channel axis for either memory layout.

    `data_format="channels_last"` (default) expects inputs shaped (batch_size, height, width, channels);
    `data_format="channels_first"` expects (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
        super().__init__(normalized_shape, eps=eps, **kwargs)
        supported_formats = ("channels_last", "channels_first")
        if data_format not in supported_formats:
            raise NotImplementedError(f"Unsupported data format: {data_format}")
        self.data_format = data_format

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        """
        if self.data_format != "channels_first":
            return super().forward(features)

        # nn.LayerNorm normalizes trailing dims, so move channels last, normalize, then move them back.
        channels_last = features.permute(0, 2, 3, 1)
        normalized = super().forward(channels_last)
        return normalized.permute(0, 3, 1, 2)
- def eager_attention_forward(
- module: nn.Module,
- query: torch.Tensor,
- key: torch.Tensor,
- value: torch.Tensor,
- attention_mask: torch.Tensor | None,
- scaling: float,
- dropout: float = 0.0,
- **kwargs,
- ):
- attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
- if attention_mask is not None:
- attn_weights = attn_weights + attention_mask
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
- attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
- attn_output = torch.matmul(attn_weights, value)
- attn_output = attn_output.transpose(1, 2).contiguous()
- return attn_output, attn_weights
class SamAttention(nn.Module):
    """
    SAM's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
    values.

    The projections map `hidden_size -> internal_dim` with `internal_dim = hidden_size // downsample_rate`, and the
    output projection maps back to `hidden_size`.
    """

    def __init__(self, config, downsample_rate=None):
        """
        Args:
            config: Configuration providing `hidden_size`, `num_attention_heads`, `attention_downsample_rate` and
                `_attn_implementation`.
            downsample_rate (`int`, *optional*): Overrides `config.attention_downsample_rate` when given.

        Raises:
            ValueError: if `hidden_size // downsample_rate` is not a multiple of `num_attention_heads`.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size

        downsample_rate = config.attention_downsample_rate if downsample_rate is None else downsample_rate

        self.internal_dim = config.hidden_size // downsample_rate
        self.num_attention_heads = config.num_attention_heads
        if self.internal_dim % config.num_attention_heads != 0:
            # The check is on the (possibly downsampled) internal dim, so the message reports that value.
            raise ValueError(
                f"num_attention_heads ({config.num_attention_heads}) must divide "
                f"hidden_size // downsample_rate ({self.internal_dim})."
            )
        self.scaling = (self.internal_dim // config.num_attention_heads) ** -0.5

        self.q_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.k_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.v_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, self.hidden_size)

        self.is_causal = False

    def _separate_heads(self, hidden_states: Tensor, num_attention_heads: int) -> Tensor:
        """Reshape `(batch, point_batch, tokens, channel)` to `(batch * point_batch, heads, tokens, channel // heads)`."""
        batch, point_batch_size, n_tokens, channel = hidden_states.shape
        c_per_head = channel // num_attention_heads
        hidden_states = hidden_states.reshape(batch * point_batch_size, n_tokens, num_attention_heads, c_per_head)
        return hidden_states.transpose(1, 2)

    def _recombine_heads(self, hidden_states: Tensor, point_batch_size: int) -> Tensor:
        """Reshape `(batch * point_batch, tokens, heads, c_per_head)` to `(batch, point_batch, tokens, heads * c_per_head)`."""
        batch, n_tokens, n_heads, c_per_head = hidden_states.shape
        return hidden_states.reshape(batch // point_batch_size, point_batch_size, n_tokens, n_heads * c_per_head)

    def forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        attention_similarity: Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[Tensor, Tensor | None]:
        """
        Args:
            query, key, value: 4D tensors `(batch, point_batch, tokens, hidden_size)`.
            attention_similarity: Optional tensor forwarded to the attention function as `attention_mask`.
            **kwargs: Forwarded unchanged to the attention function.

        Returns:
            `(attn_output, attn_weights)`: `attn_output` is `(batch, point_batch, query_tokens, hidden_size)`.
            `attn_weights` is whatever the selected attention function returns as its second value (the eager
            implementation returns the attention probabilities).
        """
        # Input projections
        query = self.q_proj(query)
        key = self.k_proj(key)
        value = self.v_proj(value)

        point_batch_size = query.shape[1]
        # Separate into heads
        query = self._separate_heads(query, self.num_attention_heads)
        key = self._separate_heads(key, self.num_attention_heads)
        value = self._separate_heads(value, self.num_attention_heads)

        # Pick the attention function by `_attn_implementation`, with the eager function passed as the default.
        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            query,
            key,
            value,
            attention_mask=attention_similarity,
            dropout=0.0,
            scaling=self.scaling,
            is_causal=self.is_causal,
            **kwargs,
        )

        attn_output = self._recombine_heads(attn_output, point_batch_size)
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
class SamTwoWayAttentionBlock(nn.Module):
    """
    A transformer block with four steps: (1) self-attention over the sparse (token) inputs, (2) cross-attention from
    tokens to the dense (image) inputs, (3) an MLP on the tokens, (4) cross-attention from the image inputs to the
    tokens. Each step is followed by a residual add and a LayerNorm.
    """

    def __init__(self, config, attention_downsample_rate: int = 2, skip_first_layer_pe: bool = False):
        """
        Arguments:
            config (`SamMaskDecoderConfig`):
                The configuration file used to instantiate the block
            attention_downsample_rate (*optional*, int, defaults to 2):
                The downsample ratio used to reduce the inner dim of the two cross-attention layers.
            skip_first_layer_pe (*optional*, bool, defaults to `False`):
                Whether or not to skip the addition of the query_point_embedding in the self-attention step.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_norm_eps = config.layer_norm_eps

        def build_norm():
            return nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)

        # Submodules are created in the same order as before, so parameter order and init RNG use are unchanged.
        self.self_attn = SamAttention(config, downsample_rate=1)
        self.layer_norm1 = build_norm()

        self.cross_attn_token_to_image = SamAttention(config, downsample_rate=attention_downsample_rate)
        self.layer_norm2 = build_norm()

        self.mlp = SamMLPBlock(config)
        self.layer_norm3 = build_norm()

        self.layer_norm4 = build_norm()
        self.cross_attn_image_to_token = SamAttention(config, downsample_rate=attention_downsample_rate)
        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self,
        queries: Tensor,
        keys: Tensor,
        query_point_embedding: Tensor,
        key_point_embedding: Tensor,
        attention_similarity: Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ):
        """
        Returns:
            `(queries, keys, attn_out)`: the updated tokens, the updated image embeddings, and the raw output of the
            final image-to-token cross-attention. `**kwargs` is accepted but not forwarded to the attention layers.
        """
        # Step 1: self-attention on the tokens.
        if self.skip_first_layer_pe:
            # No positional embedding and no residual: the attention output replaces the tokens.
            queries, _ = self.self_attn(query=queries, key=queries, value=queries)
        else:
            tokens_with_pe = queries + query_point_embedding
            self_attn_out, _ = self.self_attn(query=tokens_with_pe, key=tokens_with_pe, value=queries)
            queries = queries + self_attn_out
        queries = self.layer_norm1(queries)

        # `keys` is not modified until step 4, so its positional sum is shared by steps 2 and 4.
        image_with_pe = keys + key_point_embedding

        # Step 2: tokens attend to the image embedding.
        token_to_image_out, _ = self.cross_attn_token_to_image(
            query=queries + query_point_embedding,
            key=image_with_pe,
            value=keys,
            attention_similarity=attention_similarity,
        )
        queries = self.layer_norm2(queries + token_to_image_out)

        # Step 3: MLP on the tokens.
        queries = self.layer_norm3(queries + self.mlp(queries))

        # Step 4: image embedding attends to the tokens.
        attn_out, _ = self.cross_attn_image_to_token(
            query=image_with_pe,
            key=queries + query_point_embedding,
            value=queries,
        )
        keys = self.layer_norm4(keys + attn_out)

        return queries, keys, attn_out
class SamTwoWayTransformer(nn.Module):
    """
    Stack of `config.num_hidden_layers` two-way attention blocks, followed by one final token-to-image attention and
    a LayerNorm on the tokens.
    """

    def __init__(self, config: SamMaskDecoderConfig):
        super().__init__()
        self.config = config

        self.num_hidden_layers = config.num_hidden_layers
        # Only the first block skips the positional embedding in its self-attention.
        self.layers = nn.ModuleList(
            SamTwoWayAttentionBlock(config, skip_first_layer_pe=(layer_index == 0))
            for layer_index in range(self.num_hidden_layers)
        )

        self.final_attn_token_to_image = SamAttention(config)
        self.layer_norm_final_attn = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        point_embeddings: Tensor,
        image_embeddings: Tensor,
        image_positional_embeddings: Tensor,
        attention_similarity: Tensor,
        target_embedding=None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutput:
        """
        Args:
            point_embeddings: Token embeddings; used both as the initial queries and as their positional term.
            image_embeddings: Image features; flattened from dim 2 on, permuted to tokens-first and given a new
                dim 1 before use.
            image_positional_embeddings: Positional encoding, reshaped the same way as `image_embeddings`.
            attention_similarity: Forwarded to every block.
            target_embedding: Optional tensor added to the queries before every block.

        Returns:
            `(queries, keys)`: the final token embeddings and the final image embeddings.

        Raises:
            ValueError: if `image_embeddings` is `None`.
        """
        if image_embeddings is None:
            raise ValueError("You have to specify an image_embedding")

        keys = image_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1)
        image_positional_embeddings = image_positional_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1)

        queries = point_embeddings

        for block in self.layers:
            if target_embedding is not None:
                # NOTE(review): in-place add. On the first iteration `queries` is the same tensor object as
                # `point_embeddings`, so that tensor is modified too. Kept as-is to preserve behavior.
                queries += target_embedding

            queries, keys, _ = block(
                queries=queries,
                keys=keys,
                query_point_embedding=point_embeddings,
                key_point_embedding=image_positional_embeddings,
                attention_similarity=attention_similarity,
                **kwargs,
            )

        # Final attention from the tokens to the image, followed by residual add and LayerNorm.
        final_attn_out, _ = self.final_attn_token_to_image(
            query=queries + point_embeddings,
            key=keys + image_positional_embeddings,
            value=keys,
        )
        queries = self.layer_norm_final_attn(queries + final_attn_out)
        return queries, keys
class SamFeedForward(nn.Module):
    """
    MLP with ReLU activations: `proj_in`, then `num_layers - 2` hidden `Linear(hidden_dim, hidden_dim)` layers, then
    `proj_out`, with an optional sigmoid on the output.
    """

    def __init__(
        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, sigmoid_output: bool = False
    ):
        super().__init__()
        self.num_layers = num_layers
        self.activation = nn.ReLU()
        self.proj_in = nn.Linear(input_dim, hidden_dim)
        self.proj_out = nn.Linear(hidden_dim, output_dim)
        # The input and output projections account for two of the `num_layers` layers.
        hidden_layer_count = num_layers - 2
        self.layers = nn.ModuleList(nn.Linear(hidden_dim, hidden_dim) for _ in range(hidden_layer_count))
        self.sigmoid_output = sigmoid_output

    def forward(self, hidden_states):
        """Run the MLP; ReLU follows every layer except `proj_out`."""
        features = self.activation(self.proj_in(hidden_states))
        for hidden_layer in self.layers:
            features = self.activation(hidden_layer(features))

        output = self.proj_out(features)
        return F.sigmoid(output) if self.sigmoid_output else output
class SamMaskDecoder(nn.Module):
    """
    Predicts masks and mask-quality (IoU) scores from image embeddings and prompt embeddings, using a two-way
    transformer, a two-stage transposed-convolution upscaler and one small MLP per mask token.
    """

    def __init__(self, config: SamMaskDecoderConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size

        self.num_multimask_outputs = config.num_multimask_outputs
        # One extra token on top of the multimask outputs; index 0 is the one returned when
        # `multimask_output=False`.
        self.num_mask_tokens = config.num_multimask_outputs + 1

        self.iou_token = nn.Embedding(1, self.hidden_size)
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, self.hidden_size)

        self.transformer = SamTwoWayTransformer(config)

        # Upscaler: hidden -> hidden // 4 -> hidden // 8 channels, each transposed conv using stride 2.
        quarter_dim = self.hidden_size // 4
        eighth_dim = self.hidden_size // 8
        self.upscale_conv1 = nn.ConvTranspose2d(self.hidden_size, quarter_dim, kernel_size=2, stride=2)
        self.upscale_conv2 = nn.ConvTranspose2d(quarter_dim, eighth_dim, kernel_size=2, stride=2)
        self.upscale_layer_norm = SamLayerNorm(quarter_dim, data_format="channels_first")
        self.activation = nn.GELU()

        # One 3-layer MLP per mask token.
        self.output_hypernetworks_mlps = nn.ModuleList(
            [
                SamFeedForward(self.hidden_size, self.hidden_size, eighth_dim, 3)
                for _ in range(self.num_mask_tokens)
            ]
        )

        self.iou_prediction_head = SamFeedForward(
            self.hidden_size, config.iou_head_hidden_dim, self.num_mask_tokens, config.iou_head_depth
        )

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_positional_embeddings: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        attention_similarity: torch.Tensor | None = None,
        target_embedding: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (`torch.Tensor`):
                Image-encoder embeddings of shape `(batch_size, num_channels, height, width)`.
            image_positional_embeddings (`torch.Tensor`):
                Positional encoding for `image_embeddings`.
            sparse_prompt_embeddings (`torch.Tensor`):
                Embeddings of the points and boxes, or `None`. Dim 1 is the point batch size and dim 2 the token
                axis.
            dense_prompt_embeddings (`torch.Tensor`):
                Embeddings of the mask inputs; added to `image_embeddings`.
            multimask_output (bool):
                If `True`, return every mask except the first; otherwise return only the first.
            attention_similarity (`torch.Tensor`, *optional*):
                Forwarded to the transformer.
            target_embedding (`torch.Tensor`, *optional*):
                Forwarded to the transformer.

        Returns:
            `(masks, iou_pred)` with shapes `(batch_size, point_batch_size, num_selected, H, W)` and
            `(batch_size, point_batch_size, num_selected)`, where `H`/`W` are the upscaler's output size.
        """
        batch_size, num_channels, height, width = image_embeddings.shape
        has_sparse_prompts = sparse_prompt_embeddings is not None
        point_batch_size = sparse_prompt_embeddings.shape[1] if has_sparse_prompts else 1

        # Learned output tokens: the IoU token first, then the mask tokens.
        output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
        output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1)
        if has_sparse_prompts:
            tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=2)
        else:
            tokens = output_tokens
        point_embeddings = tokens.to(self.iou_token.weight.dtype)

        # Expand per-image data along the batch axis so there is one copy per point batch entry.
        image_embeddings = (image_embeddings + dense_prompt_embeddings).repeat_interleave(point_batch_size, 0)
        image_positional_embeddings = image_positional_embeddings.repeat_interleave(point_batch_size, 0)

        point_embedding, image_embeddings = self.transformer(
            point_embeddings=point_embeddings,
            image_embeddings=image_embeddings,
            image_positional_embeddings=image_positional_embeddings,
            attention_similarity=attention_similarity,
            target_embedding=target_embedding,
        )
        iou_token_out = point_embedding[:, :, 0, :]
        mask_tokens_out = point_embedding[:, :, 1 : (1 + self.num_mask_tokens), :]

        # Back to a channels-first image layout, then upscale.
        image_embeddings = image_embeddings.transpose(2, 3).reshape(
            batch_size * point_batch_size, num_channels, height, width
        )
        upscaled_embedding = self.upscale_conv1(image_embeddings)
        upscaled_embedding = self.activation(self.upscale_layer_norm(upscaled_embedding))
        upscaled_embedding = self.activation(self.upscale_conv2(upscaled_embedding))

        # Each mask token goes through its own MLP; results are stacked along the mask axis.
        hyper_in = torch.stack(
            [
                self.output_hypernetworks_mlps[token_index](mask_tokens_out[:, :, token_index, :])
                for token_index in range(self.num_mask_tokens)
            ],
            dim=2,
        )

        _, num_channels, height, width = upscaled_embedding.shape
        upscaled_embedding = upscaled_embedding.reshape(batch_size, point_batch_size, num_channels, height * width)
        masks = (hyper_in @ upscaled_embedding).reshape(batch_size, point_batch_size, -1, height, width)

        # Mask quality predictions come from the IoU token.
        iou_pred = self.iou_prediction_head(iou_token_out)

        mask_slice = slice(1, None) if multimask_output else slice(0, 1)
        return masks[:, :, mask_slice, :, :], iou_pred[:, :, mask_slice]
class SamPositionalEmbedding(nn.Module):
    """
    Positional encoding of 2D coordinates: points are projected by a `(2, num_pos_feats)` matrix initialized from
    `scale * randn`, and the sine and cosine of the projection are concatenated.
    """

    def __init__(self, config):
        super().__init__()
        self.scale = config.scale
        self.positional_embedding = nn.Parameter(self.scale * torch.randn((2, config.num_pos_feats)))

    def forward(self, input_coords, input_shape=None):
        """
        Positionally encode points that are normalized to [0,1].

        When `input_shape` is given, coordinate 0 is first divided by `input_shape[1]` and coordinate 1 by
        `input_shape[0]`; that path indexes the input as a 4D tensor. The input tensor is not modified.
        The output's last dimension is `2 * num_pos_feats`.
        """
        coords = input_coords.clone()

        if input_shape is not None:
            first_extent, second_extent = input_shape[0], input_shape[1]
            coords[:, :, :, 0] = coords[:, :, :, 0] / second_extent
            coords[:, :, :, 1] = coords[:, :, :, 1] / first_extent

        # Map [0, 1] to [-1, 1], then match the dtype of the projection matrix.
        coords = (2 * coords - 1).to(self.positional_embedding.dtype)
        angles = 2 * np.pi * (coords @ self.positional_embedding)

        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
class SamMaskEmbedding(nn.Module):
    """
    Embeds a single-channel mask with two stride-2 convolutions (each followed by a channels-first LayerNorm and the
    configured activation) and a final 1x1 convolution to `config.hidden_size` channels.
    """

    def __init__(self, config: SamPromptEncoderConfig):
        super().__init__()
        # Channel count after the first convolution: a quarter of the configured mask channels.
        self.mask_input_channels = config.mask_input_channels // 4
        self.activation = ACT2FN[config.hidden_act]
        self.conv1 = nn.Conv2d(1, self.mask_input_channels, kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(self.mask_input_channels, config.mask_input_channels, kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(config.mask_input_channels, config.hidden_size, kernel_size=1)
        self.layer_norm1 = SamLayerNorm(
            self.mask_input_channels, eps=config.layer_norm_eps, data_format="channels_first"
        )
        # layer_norm2 normalizes the output of conv2, so it must use conv2's channel count.
        # `(mask_input_channels // 4) * 4` only equals that count when it is a multiple of 4.
        self.layer_norm2 = SamLayerNorm(
            config.mask_input_channels, eps=config.layer_norm_eps, data_format="channels_first"
        )

    def forward(self, masks):
        """
        Args:
            masks: Tensor with a single channel on dim 1 (required by `conv1`).

        Returns:
            Dense embeddings with `config.hidden_size` channels; the two stride-2 convolutions reduce each spatial
            dimension by a factor of 4.
        """
        hidden_states = self.conv1(masks)
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = self.conv2(hidden_states)
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.activation(hidden_states)
        dense_embeddings = self.conv3(hidden_states)
        return dense_embeddings
class SamPromptEncoder(nn.Module):
    """
    Encodes point, box and mask prompts into sparse embeddings (points and box corners) and dense embeddings (masks,
    or a learned "no mask" embedding broadcast over the image-embedding grid).
    """

    def __init__(self, config: SamConfig):
        super().__init__()
        # The positional embedding is built from the vision config; everything else uses the prompt-encoder config.
        self.shared_embedding = SamPositionalEmbedding(config.vision_config)
        config = config.prompt_encoder_config
        self.mask_embed = SamMaskEmbedding(config)
        self.no_mask_embed = nn.Embedding(1, config.hidden_size)

        self.image_embedding_size = (config.image_embedding_size, config.image_embedding_size)
        self.input_image_size = config.image_size

        # This class indexes entries 0 and 1 for point labels 0/1 and entries 2 and 3 for the two box corners.
        self.point_embed = nn.ModuleList(
            [nn.Embedding(1, config.hidden_size) for _ in range(config.num_point_embeddings)]
        )
        self.hidden_size = config.hidden_size
        self.not_a_point_embed = nn.Embedding(1, config.hidden_size)

    def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor:
        """
        Embeds point prompts.

        Args:
            points: 4D tensor of coordinates; dim 2 is the point axis and the last dim holds the coordinates.
            labels: 3D tensor of labels aligned with the first three dims of `points`.
            pad: If `True`, one extra point at the origin with label -1 is appended along dim 2.
        """
        points = points + 0.5  # Shift to center of pixel
        if pad:
            target_point_shape = (points.shape[0], points.shape[1], 1, points.shape[-1])
            target_labels_shape = (points.shape[0], points.shape[1], 1)
            padding_point = torch.zeros(target_point_shape, device=points.device)
            padding_label = -torch.ones(target_labels_shape, device=labels.device)
            points = torch.cat([points, padding_point], dim=2)
            labels = torch.cat([labels, padding_label], dim=2)
        input_shape = (self.input_image_size, self.input_image_size)
        point_embedding = self.shared_embedding(points, input_shape)

        # torch.where and expanding the labels tensor is required by the ONNX export
        # Label -1: replace the positional encoding with the "not a point" embedding.
        point_embedding = torch.where(labels[..., None] == -1, self.not_a_point_embed.weight, point_embedding)

        # This is required for the ONNX export. The dtype, device need to be explicitly
        # specified as otherwise torch.onnx.export interprets as double
        # Label -10: the embedding is zeroed out.
        point_embedding = torch.where(labels[..., None] != -10, point_embedding, torch.zeros_like(point_embedding))

        # Labels 0 and 1 each add their own learned embedding on top of the positional encoding.
        point_embedding = torch.where(
            (labels == 0)[:, :, :, None],
            point_embedding + self.point_embed[0].weight[None, None, :, :],
            point_embedding,
        )

        point_embedding = torch.where(
            (labels == 1)[:, :, :, None],
            point_embedding + self.point_embed[1].weight[None, None, :, :],
            point_embedding,
        )

        return point_embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """
        Embeds box prompts.

        Each box is reshaped into two 2D corner points, which are positionally encoded and then offset by
        `point_embed[2]` (first corner) and `point_embed[3]` (second corner).
        """
        boxes = boxes + 0.5  # Shift to center of pixel
        batch_size, nb_boxes = boxes.shape[:2]
        coords = boxes.reshape(batch_size, nb_boxes, 2, 2)
        input_shape = (self.input_image_size, self.input_image_size)
        corner_embedding = self.shared_embedding(coords, input_shape)
        corner_embedding[:, :, 0, :] += self.point_embed[2].weight
        corner_embedding[:, :, 1, :] += self.point_embed[3].weight
        return corner_embedding

    def forward(
        self,
        input_points: torch.Tensor | None,
        input_labels: torch.Tensor | None,
        input_boxes: torch.Tensor | None,
        input_masks: torch.Tensor | None,
    ) -> tuple[torch.Tensor | None, torch.Tensor]:
        """
        Embeds different types of prompts, returning both sparse and dense embeddings.

        Args:
            input_points (`torch.Tensor`, *optional*):
                Point coordinates to embed.
            input_labels (`torch.Tensor`, *optional*):
                Labels of the points; required whenever `input_points` is given.
            input_boxes (`torch.Tensor`, *optional*):
                Boxes to embed.
            input_masks (`torch.Tensor`, *optional*):
                Masks to embed.

        Returns:
            `(sparse_embeddings, dense_embeddings)`. `sparse_embeddings` is `None` when neither points nor boxes are
            given; when both are given, the box embeddings are concatenated after the point embeddings on dim 2.

        Raises:
            ValueError: if `input_points` is given without `input_labels`.
        """
        sparse_embeddings = None
        batch_size = 1
        if input_points is not None:
            batch_size = input_points.shape[0]
            if input_labels is None:
                raise ValueError("If points are provided, labels must also be provided.")
            # A padding point is appended only when no boxes are supplied.
            point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None))
            sparse_embeddings = point_embeddings
        if input_boxes is not None:
            batch_size = input_boxes.shape[0]
            box_embeddings = self._embed_boxes(input_boxes)
            if sparse_embeddings is None:
                sparse_embeddings = box_embeddings
            else:
                sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=2)
        if input_masks is not None:
            dense_embeddings = self.mask_embed(input_masks)
        else:
            # No mask given: broadcast the learned "no mask" vector over the image-embedding grid.
            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                batch_size, -1, self.image_embedding_size[0], self.image_embedding_size[1]
            )

        return sparse_embeddings, dense_embeddings
class SamVisionAttention(nn.Module):
    """
    Multi-head self-attention over a 2D token grid with optional decomposed relative position embeddings.

    Args:
        config:
            configuration object; this block reads `image_size`, `patch_size`, `num_attention_heads`,
            `hidden_size`, `attention_dropout`, `qkv_bias` and `use_rel_pos` from it.
        window_size (`int`):
            side length of the attention window. `0` selects the full `image_size // patch_size` grid.
    """

    def __init__(self, config, window_size):
        super().__init__()
        # The relative-position tables are sized from the grid the attention runs on: the whole patch grid
        # when `window_size == 0`, a single window otherwise.
        if window_size == 0:
            grid_side = config.image_size // config.patch_size
            input_size = (grid_side, grid_side)
        else:
            input_size = (window_size, window_size)

        self.num_attention_heads = config.num_attention_heads
        head_dim = config.hidden_size // config.num_attention_heads
        self.scale = head_dim**-0.5
        self.dropout = config.attention_dropout

        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)

        self.use_rel_pos = config.use_rel_pos
        if self.use_rel_pos:
            # `input_size` is always a tuple at this point, so no "missing input size" guard is required.
            # One learnable row per possible relative offset along each axis.
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
        """
        Get relative positional embeddings according to the relative positions of
        query and key sizes.

        Args:
            q_size (int):
                size of the query.
            k_size (int):
                size of key k.
            rel_pos (`torch.Tensor`):
                relative position embeddings (L, channel).

        Returns:
            Extracted positional embeddings according to relative positions, with shape
            (q_size, k_size, channel).
        """
        max_rel_dist = int(2 * max(q_size, k_size) - 1)
        # Interpolate rel pos along its length axis so that it has exactly `max_rel_dist` rows.
        rel_pos_resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)

        # Scale the coords with short length if shapes for q and k are different.
        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
        # Shift so that the smallest relative offset maps to row 0 of the table.
        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

        return rel_pos_resized[relative_coords.long()]

    def get_decomposed_rel_pos(
        self,
        query: torch.Tensor,
        rel_pos_h: torch.Tensor,
        rel_pos_w: torch.Tensor,
        q_size: tuple[int, int],
        k_size: tuple[int, int],
    ) -> torch.Tensor:
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

        Args:
            query (`torch.Tensor`):
                query q in the attention layer with shape (batch, query_height * query_width, channel). The
                `forward` of this class passes the heads folded into the leading dimension.
            rel_pos_h (`torch.Tensor`):
                relative position embeddings (Lh, channel) for height axis.
            rel_pos_w (`torch.Tensor`):
                relative position embeddings (Lw, channel) for width axis.
            q_size (tuple):
                spatial sequence size of query q with (query_height, query_width).
            k_size (tuple):
                spatial sequence size of key k with (key_height, key_width).

        Returns:
            decomposed_rel_pos (`torch.Tensor`):
                decomposed relative position embeddings with shape
                (batch, query_height, query_width, key_height, key_width).
        """
        query_height, query_width = q_size
        key_height, key_width = k_size
        relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
        relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)

        batch_size, _, dim = query.shape
        reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
        rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
        rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)

        # Broadcast the per-axis terms against each other to cover every (key_height, key_width) pair.
        decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
        return decomposed_rel_pos

    def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            hidden_states (`torch.Tensor`):
                input tokens with shape (batch_size, height, width, channel).
            output_attentions:
                accepted for interface compatibility; it is not read here and the weights are always returned.

        Returns:
            The projected attention output with shape (batch_size, height, width, channel) and the
            post-softmax, pre-dropout attention weights with shape
            (batch_size * nHead, height * width, height * width).
        """
        batch_size, height, width, _ = hidden_states.shape
        # qkv with shape (3, batch_size, nHead, height * width, channel)
        qkv = (
            self.qkv(hidden_states)
            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
            .permute(2, 0, 3, 1, 4)
        )
        # q, k, v with shape (batch_size * nHead, height * width, channel)
        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)

        attn_weights = (query * self.scale) @ key.transpose(-2, -1)

        if self.use_rel_pos:
            decomposed_rel_pos = self.get_decomposed_rel_pos(
                query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )
            decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights)
            attn_weights = attn_weights + decomposed_rel_pos

        # Softmax is computed in float32 and cast back to the query dtype.
        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)

        attn_output = self.proj(attn_output)
        return attn_output, attn_weights
class SamVisionSdpaAttention(SamVisionAttention):
    """
    Multi-head Attention block with relative position embeddings.
    Using SDPA instead of the default attention.

    The constructor is inherited unchanged from `SamVisionAttention`.
    """

    def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> tuple[torch.Tensor, None]:
        """
        Args:
            hidden_states (`torch.Tensor`):
                input tokens with shape (batch_size, height, width, channel).
            output_attentions (`bool`):
                attention weights cannot be returned by this implementation; a truthy value only triggers a
                one-time warning.

        Returns:
            The projected attention output with shape (batch_size, height, width, channel) and `None` in place
            of the attention weights.
        """
        if output_attentions:
            logger.warning_once(
                f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will "
                "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model."
            )

        batch_size, height, width, _ = hidden_states.shape
        # qkv with shape (3, B, nHead, H * W, C)
        qkv = (
            self.qkv(hidden_states)
            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
            .permute(2, 0, 3, 1, 4)
        )
        # q, k, v with shape (B * nHead, H * W, C)
        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)

        # The relative position term is passed to SDPA as an additive attention mask.
        attn_bias = None
        if self.use_rel_pos:
            decomposed_rel_pos = self.get_decomposed_rel_pos(
                query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )
            attn_bias = decomposed_rel_pos.reshape(
                batch_size, self.num_attention_heads, height * width, height * width
            )

        # SDPA expects (B, nHead, H * W, C).
        query = query.view(batch_size, self.num_attention_heads, height * width, -1)
        key = key.view(batch_size, self.num_attention_heads, height * width, -1)
        value = value.view(batch_size, self.num_attention_heads, height * width, -1)

        # Apply attention dropout only in training mode, matching what the eager implementation does with
        # `nn.functional.dropout(..., training=self.training)`.
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query,
            key,
            value,
            attn_mask=attn_bias,
            dropout_p=self.dropout if self.training else 0.0,
        )

        attn_output = (
            attn_output.view(batch_size, self.num_attention_heads, height, width, -1)
            .permute(0, 2, 3, 1, 4)
            .reshape(batch_size, height, width, -1)
        )

        attn_output = self.proj(attn_output)
        return attn_output, None
# Lookup from the configured attention implementation name to the vision attention class to instantiate.
SAM_VISION_ATTENTION_CLASSES = dict(
    eager=SamVisionAttention,
    sdpa=SamVisionSdpaAttention,
)
class SamVisionLayer(GradientCheckpointingLayer):
    """
    One vision block: layer norm, (optionally windowed) attention, residual add, layer norm, MLP, residual add.

    Args:
        config: configuration object forwarded to the attention and MLP sub-modules.
        window_size (`int`): attention window side length; `0` disables window partitioning.
    """

    def __init__(self, config, window_size):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attn = SAM_VISION_ATTENTION_CLASSES[config._attn_implementation](config, window_size)
        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SamMLPBlock(config)
        self.window_size = window_size

    def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
        """
        Partition into non-overlapping windows with padding if needed.

        Args:
            hidden_states (tensor):
                input tokens with [batch_size, height, width, channel].
            window_size (int):
                window size.

        Returns:
            windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel].
            (pad_height, pad_width): padded height and width before partition
        """
        batch_size, height, width, channel = hidden_states.shape

        # Amount needed to reach the next multiple of `window_size` (0 when already aligned).
        pad_h = (window_size - height % window_size) % window_size
        pad_w = (window_size - width % window_size) % window_size
        # `F.pad` takes pairs from the last dimension backwards: channel, width, height.
        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
        pad_height, pad_width = height + pad_h, width + pad_w

        hidden_states = hidden_states.reshape(
            batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel
        )
        windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel)
        return windows, (pad_height, pad_width)

    def window_unpartition(
        self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int]
    ) -> torch.Tensor:
        """
        Window unpartition into original sequences and removing padding.

        Args:
            windows (tensor):
                input tokens with [batch_size * num_windows, window_size, window_size, channel].
            window_size (int):
                window size.
            padding_shape (Tuple):
                padded height and width (pad_height, pad_width).
            original_shape (Tuple):
                original height and width (height, width) before padding.

        Returns:
            hidden_states: unpartitioned sequences with [batch_size, height, width, channel].
        """
        pad_height, pad_width = padding_shape
        height, width = original_shape
        # Recover the batch size from the window count per padded image.
        batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size)
        hidden_states = windows.reshape(
            batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1
        )
        hidden_states = (
            hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1)
        )

        # Crop away the rows/columns that were added as padding.
        hidden_states = hidden_states[:, :height, :width, :].contiguous()
        return hidden_states

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.Tensor`):
                input tokens with [batch_size, height, width, channel].

        Returns:
            `torch.Tensor`: the block output, with the same layout as the input.
        """
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)

        # Window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size)

        # The attention weights returned by the attention module are not used by this layer.
        hidden_states, _ = self.attn(
            hidden_states=hidden_states,
        )

        # Reverse window partition
        if self.window_size > 0:
            hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width))

        hidden_states = residual + hidden_states
        layernorm_output = self.layer_norm2(hidden_states)
        hidden_states = hidden_states + self.mlp(layernorm_output)
        return hidden_states
class SamVisionNeck(nn.Module):
    """Projection head applied after the vision layers: 1x1 conv, norm, 3x3 conv, norm."""

    def __init__(self, config: SamVisionConfig):
        super().__init__()
        self.config = config
        out_channels = config.output_channels

        self.conv1 = nn.Conv2d(config.hidden_size, out_channels, kernel_size=1, bias=False)
        self.layer_norm1 = SamLayerNorm(out_channels, data_format="channels_first")
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.layer_norm2 = SamLayerNorm(out_channels, data_format="channels_first")

    def forward(self, hidden_states):
        # Move the last dimension to position 1 so the convolutions see it as the channel axis.
        channels_first = hidden_states.permute(0, 3, 1, 2)
        projected = self.layer_norm1(self.conv1(channels_first))
        return self.layer_norm2(self.conv2(projected))
@auto_docstring
class SamPreTrainedModel(PreTrainedModel):
    config: SamConfig
    base_model_prefix = "sam"
    main_input_name = "pixel_values"
    input_modalities = ("image",)
    _no_split_modules = ["SamVisionAttention"]
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    @torch.no_grad()
    def _init_weights(self, module: nn.Module):
        """
        Initialize the SAM-specific parameters of `module` after the generic parent initialization.

        Relative position tables and the absolute position embedding are zeroed; the shared positional
        embedding is drawn from a normal distribution with standard deviation `module.scale`.
        """
        super()._init_weights(module)
        if isinstance(module, SamVisionAttention):
            if module.use_rel_pos:
                init.zeros_(module.rel_pos_h)
                init.zeros_(module.rel_pos_w)
        elif isinstance(module, SamVisionEncoder):
            # The encoder only allocates `pos_embed` when `use_abs_pos` is set and leaves it as `None`
            # otherwise, so checking the parameter itself is equivalent to reading the flag and does not
            # depend on which model's config `self` happens to carry.
            if module.pos_embed is not None:
                init.zeros_(module.pos_embed)
        elif isinstance(module, SamPositionalEmbedding):
            init.normal_(module.positional_embedding, std=module.scale)
class SamVisionEncoder(SamPreTrainedModel):
    _can_record_outputs = {"hidden_states": SamVisionLayer, "attentions": SamVisionAttention}

    def __init__(self, config: SamVisionConfig):
        super().__init__(config)
        self.config = config
        self.image_size = config.image_size

        self.patch_embed = SamPatchEmbeddings(config)

        if config.use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            grid_side = config.image_size // config.patch_size
            self.pos_embed = nn.Parameter(torch.zeros(1, grid_side, grid_side, config.hidden_size))
        else:
            self.pos_embed = None

        # Layers listed in `global_attn_indexes` attend globally (window size 0); the rest use windows.
        self.layers = nn.ModuleList(
            [
                SamVisionLayer(
                    config,
                    window_size=0 if layer_idx in config.global_attn_indexes else config.window_size,
                )
                for layer_idx in range(config.num_hidden_layers)
            ]
        )

        self.neck = SamVisionNeck(config)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.patch_embed

    @merge_with_config_defaults
    @capture_outputs(tie_last_hidden_states=False)
    def forward(
        self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
    ) -> tuple | SamVisionEncoderOutput:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embeddings = self.patch_embed(pixel_values)
        if self.pos_embed is not None:
            embeddings = embeddings + self.pos_embed

        for vision_layer in self.layers:
            embeddings = vision_layer(embeddings)

        return SamVisionEncoderOutput(last_hidden_state=self.neck(embeddings))
@auto_docstring(
    custom_intro="""
    The vision model from Sam without any head or projection on top.
    """
)
class SamVisionModel(SamPreTrainedModel):
    config: SamVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: SamVisionConfig):
        super().__init__(config)
        # All the work is delegated to the wrapped encoder.
        self.vision_encoder = SamVisionEncoder(config)
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        encoder = self.vision_encoder
        return encoder.patch_embed

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | SamVisionEncoderOutput:
        encoder_outputs = self.vision_encoder(pixel_values, **kwargs)
        return encoder_outputs
@auto_docstring(
    custom_intro="""
    Segment Anything Model (SAM) for generating segmentation masks, given an input image and
    input points and labels, boxes, or masks.
    """
)
class SamModel(SamPreTrainedModel):
    input_modalities = ("image", "text")
    _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(SamTwoWayAttentionBlock, index=2)}
    _tied_weights_keys = {
        "prompt_encoder.shared_embedding.positional_embedding": "shared_image_embedding.positional_embedding"
    }

    def __init__(self, config: SamConfig):
        super().__init__(config)
        self.shared_image_embedding = SamPositionalEmbedding(config.vision_config)

        self.vision_encoder = SamVisionEncoder(config.vision_config)
        self.prompt_encoder = SamPromptEncoder(config)
        # The module using it is not a PreTrainedModel subclass so we need this
        config.mask_decoder_config._attn_implementation = config._attn_implementation
        self.mask_decoder = SamMaskDecoder(config.mask_decoder_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.vision_encoder.get_input_embeddings()

    def get_image_wide_positional_embeddings(self):
        """
        Build the positional embedding for every cell of the square image embedding grid.

        Cell centers are normalized to (0, 1) on both axes and passed through `shared_image_embedding`.
        The result has a leading singleton batch dimension followed by channel x height x width.
        """
        size = self.config.prompt_encoder_config.image_embedding_size
        target_device = self.shared_image_embedding.positional_embedding.device
        target_dtype = self.shared_image_embedding.positional_embedding.dtype
        grid = torch.ones((size, size), device=target_device, dtype=target_dtype)
        # Cumulative sums of ones give 1..size; subtracting 0.5 moves each coordinate to its cell center.
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / size
        x_embed = x_embed / size

        positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1))
        return positional_embedding.permute(2, 0, 1).unsqueeze(0)  # channel x height x width

    @torch.no_grad()
    def get_image_embeddings(self, pixel_values, **kwargs: Unpack[TransformersKwargs]):
        r"""
        Returns the image embeddings by passing the pixel values through the vision encoder.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Input pixel values
        """
        vision_output = self.vision_encoder(
            pixel_values,
            **kwargs,
        )
        image_embeddings = vision_output[0]
        return image_embeddings

    @torch.no_grad()
    def get_prompt_embeddings(
        self,
        input_points: torch.FloatTensor | None = None,
        input_labels: torch.LongTensor | None = None,
        input_boxes: torch.FloatTensor | None = None,
        input_masks: torch.LongTensor | None = None,
    ):
        r"""
        Returns the prompt embeddings by passing the input points, labels, boxes and masks through the prompt encoder.

        Args:
            input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points_per_image, 2)`):
                Optional input points for the prompt encoder. The padding of the point is automatically done by the
                processor. `point_batch_size` refers to the number of masks that we want the model to predict per
                point. The model will output `point_batch_size` times 3 masks in total.
            input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points_per_image)`):
                Optional input labels for the prompt encoder. The padding of the labels is automatically done by the
                processor, or can be fed by the user.
            input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes_per_image, 4)`):
                Optional input boxes for the prompt encoder. The padding of the boxes is automatically done by the
                processor. users can also pass manually the input boxes.
            input_masks (`torch.LongTensor` of shape `(batch_size, image_size, image_size)`):
                Optional input masks for the prompt encoder.
        """
        prompt_output = self.prompt_encoder(
            input_points=input_points,
            input_labels=input_labels,
            input_boxes=input_boxes,
            input_masks=input_masks,
        )
        return prompt_output

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        input_points: torch.FloatTensor | None = None,
        input_labels: torch.LongTensor | None = None,
        input_boxes: torch.FloatTensor | None = None,
        input_masks: torch.LongTensor | None = None,
        image_embeddings: torch.FloatTensor | None = None,
        multimask_output: bool = True,
        attention_similarity: torch.FloatTensor | None = None,
        target_embedding: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SamImageSegmentationOutput:
        r"""
        input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points, 2)`):
            Input 2D spatial points, this is used by the prompt encoder to encode the prompt. Generally yields to much
            better results. The points can be obtained by passing a list of list of list to the processor that will
            create corresponding `torch` tensors of dimension 4. The first dimension is the image batch size, the
            second dimension is the point batch size (i.e. how many segmentation masks do we want the model to predict
            per input point), the third dimension is the number of points per segmentation mask (it is possible to pass
            multiple points for a single mask), and the last dimension is the x (vertical) and y (horizontal)
            coordinates of the point. If a different number of points is passed either for each image, or for each
            mask, the processor will create "PAD" points that will correspond to the (0, 0) coordinate, and the
            computation of the embedding will be skipped for these points using the labels.
        input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points)`):
            Input labels for the points, this is used by the prompt encoder to encode the prompt. According to the
            official implementation, there are 3 types of labels

            - `1`: the point is a point that contains the object of interest
            - `0`: the point is a point that does not contain the object of interest
            - `-1`: the point corresponds to the background

            We added the label:

            - `-10`: the point is a padding point, thus should be ignored by the prompt encoder

            The padding labels should be automatically done by the processor.
        input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
            Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to
            much better generated masks. The boxes can be obtained by passing a list of list of list to the processor,
            that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch
            size, the number of boxes per image and the coordinates of the top left and bottom right point of the box.
            In the order (`x1`, `y1`, `x2`, `y2`):

            - `x1`: the x coordinate of the top left point of the input box
            - `y1`: the y coordinate of the top left point of the input box
            - `x2`: the x coordinate of the bottom right point of the input box
            - `y2`: the y coordinate of the bottom right point of the input box
        input_masks (`torch.FloatTensor` of shape `(batch_size, image_size, image_size)`):
            SAM model also accepts segmentation masks as input. The mask will be embedded by the prompt encoder to
            generate a corresponding embedding, that will be fed later on to the mask decoder. These masks needs to be
            manually fed by the user, and they need to be of shape (`batch_size`, `image_size`, `image_size`).
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_channels, window_size, window_size)`):
            Image embeddings, this is used by the mask decoder to generate masks and iou scores. For more memory
            efficient computation, users can first retrieve the image embeddings using the `get_image_embeddings`
            method, and then feed them to the `forward` method instead of feeding the `pixel_values`.
        multimask_output (`bool`, *optional*):
            In the original implementation and paper, the model always outputs 3 masks per image (or per point / per
            bounding box if relevant). However, it is possible to just output a single mask, that corresponds to the
            "best" mask, by specifying `multimask_output=False`.
        attention_similarity (`torch.FloatTensor`, *optional*):
            Attention similarity tensor, to be provided to the mask decoder for target-guided attention in case the
            model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048).
        target_embedding (`torch.FloatTensor`, *optional*):
            Embedding of the target concept, to be provided to the mask decoder for target-semantic prompting in case
            the model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048).

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoModel, AutoProcessor

        >>> model = AutoModel.from_pretrained("facebook/sam-vit-base")
        >>> processor = AutoProcessor.from_pretrained("facebook/sam-vit-base")

        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
        >>> with httpx.stream("GET", url) as response:
        ...     raw_image = Image.open(BytesIO(response.read())).convert("RGB")
        >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
        >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")

        >>> # Get segmentation mask
        >>> outputs = model(**inputs)

        >>> # Postprocess masks
        >>> masks = processor.post_process_masks(
        ...     outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
        ... )
        ```
        """
        if pixel_values is None and image_embeddings is None:
            raise ValueError("Either pixel_values or image_embeddings must be provided.")

        if pixel_values is not None and image_embeddings is not None:
            raise ValueError("Only one of pixel_values and image_embeddings can be provided.")

        # Each error below is built as ONE string: passing several positional strings to `ValueError` would
        # make the exception render as a tuple of fragments instead of a readable sentence.
        if input_points is not None and len(input_points.shape) != 4:
            raise ValueError(
                "The input_points must be a 4D tensor. Of shape `batch_size`, `point_batch_size`, "
                f"`nb_points_per_image`, `2`. Got {input_points.shape}."
            )
        if input_boxes is not None and len(input_boxes.shape) != 3:
            raise ValueError(
                f"The input_boxes must be a 3D tensor. Of shape `batch_size`, `nb_boxes`, `4`. Got {input_boxes.shape}."
            )
        if input_points is not None and input_boxes is not None:
            point_batch_size = input_points.shape[1]
            box_batch_size = input_boxes.shape[1]
            if point_batch_size != box_batch_size:
                raise ValueError(
                    f"You should provide as many bounding boxes as input points per box. Got {point_batch_size} and {box_batch_size}."
                )

        image_positional_embeddings = self.get_image_wide_positional_embeddings()
        # repeat with batch size
        batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeddings.shape[0]
        image_positional_embeddings = image_positional_embeddings.repeat(batch_size, 1, 1, 1)

        # Only populated when the vision encoder actually runs in this call.
        vision_attentions = None
        vision_hidden_states = None

        if pixel_values is not None:
            vision_outputs: SamVisionEncoderOutput = self.vision_encoder(pixel_values, **kwargs)
            image_embeddings = vision_outputs.last_hidden_state
            vision_hidden_states = vision_outputs.hidden_states
            vision_attentions = vision_outputs.attentions

        # Points given without labels are all assigned label 1.
        if input_points is not None and input_labels is None:
            input_labels = torch.ones_like(input_points[:, :, :, 0], dtype=torch.int, device=input_points.device)

        if input_points is not None and image_embeddings.shape[0] != input_points.shape[0]:
            raise ValueError(
                "The batch size of the image embeddings and the input points must be the same. "
                f"Got {image_embeddings.shape[0]} and {input_points.shape[0]} respectively. "
                "If you want to pass multiple points for the same image, make sure that you passed "
                "input_points of shape (batch_size, point_batch_size, num_points_per_image, 2) and "
                "input_labels of shape (batch_size, point_batch_size, num_points_per_image)."
            )

        sparse_embeddings, dense_embeddings = self.prompt_encoder(
            input_points=input_points,
            input_labels=input_labels,
            input_boxes=input_boxes,
            input_masks=input_masks,
        )

        low_res_masks, iou_predictions = self.mask_decoder(
            image_embeddings=image_embeddings,
            image_positional_embeddings=image_positional_embeddings,
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
            attention_similarity=attention_similarity,
            target_embedding=target_embedding,
        )

        return SamImageSegmentationOutput(
            iou_scores=iou_predictions,
            pred_masks=low_res_masks,
            vision_hidden_states=vision_hidden_states,
            vision_attentions=vision_attentions,
        )
# Names exported by `from <module> import *`.
__all__ = [
    "SamVisionModel",
    "SamModel",
    "SamPreTrainedModel",
]
|