| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287 |
- # Copyright 2025 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import torch
- import torch.nn as nn
- from ..image_transforms import center_to_corners_format
- from ..utils import is_scipy_available
- from .loss_for_object_detection import HungarianMatcher, ImageLoss, _set_aux_loss, generalized_box_iou
- if is_scipy_available():
- from scipy.optimize import linear_sum_assignment
- # Similar to the one used in `DeformableDetr` but we reduce with sum and normalize by num_boxes
- # instead of mean.
def sigmoid_focal_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_boxes: int,
    alpha: float = 0.25,
    gamma: float = 2,
):
    """
    Sigmoid focal loss (RetinaNet, https://huggingface.co/papers/1708.02002), reduced with a sum and divided by
    `num_boxes`.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            Raw (pre-sigmoid) predictions for each element.
        targets (`torch.FloatTensor` with the same shape as `inputs`):
            Binary labels for every element of `inputs` (0 for the negative class, 1 for the positive class).
        num_boxes (`int`):
            Total number of boxes in the batch; the summed loss is divided by this value.
        alpha (`float`, *optional*, defaults to 0.25):
            Weight in the range (0, 1) balancing positive vs. negative elements. The weighting is skipped when
            `alpha` is negative.
        gamma (`float`, *optional*, defaults to 2):
            Exponent of the modulating factor `(1 - p_t)` that balances easy vs. hard elements.

    Returns:
        `torch.Tensor`: Scalar tensor holding the summed focal loss divided by `num_boxes`.
    """
    use_alpha_weighting = alpha >= 0

    probabilities = inputs.sigmoid()
    element_bce = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")

    # Probability the model assigns to the true label of each element
    true_label_prob = probabilities * targets + (1 - probabilities) * (1 - targets)
    # Modulating factor: confident (easy) elements are down-weighted
    focal_term = element_bce * ((1 - true_label_prob) ** gamma)

    if use_alpha_weighting:
        # `alpha` for positives, `1 - alpha` for negatives
        balance = alpha * targets + (1 - alpha) * (1 - targets)
        focal_term = balance * focal_term

    return focal_term.sum() / num_boxes
class GroundingDinoHungarianMatcher(HungarianMatcher):
    """
    Matcher that assigns predictions to ground-truth boxes by solving a linear sum assignment over a cost built
    from a focal-style classification term, an L1 box term and a generalized IoU term.
    """

    @torch.no_grad()
    def forward(self, outputs, targets):
        """
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
                * "label_maps": Tuple of tensors of dim [num_classes, hidden_dim].
            targets (`list[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        logits = outputs["logits"]
        batch_size, num_queries = logits.shape[:2]

        # Collapse batch and query dimensions so that all cost terms are computed in a single pass.
        # The last dimension of `flat_prob` has to match the last dimension of the label maps (see the matmul below).
        flat_prob = logits.flatten(0, 1).sigmoid()
        flat_boxes = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # One label-map row per ground-truth box: each image indexes its own label map, then everything is stacked
        rows_per_image = [
            label_map[target["class_labels"]] for label_map, target in zip(outputs["label_maps"], targets)
        ]
        target_maps = torch.cat(rows_per_image)
        # Each row is divided by its own sum
        target_maps = target_maps / target_maps.sum(dim=-1, keepdim=True)

        gt_boxes = torch.cat([target["boxes"] for target in targets])

        # Focal-style classification cost: positive term minus negative term, projected onto the target rows
        alpha = 0.25
        gamma = 2.0
        negative_term = (1 - alpha) * (flat_prob**gamma) * (-(1 - flat_prob + 1e-8).log())
        positive_term = alpha * ((1 - flat_prob) ** gamma) * (-(flat_prob + 1e-8).log())
        class_cost = (positive_term - negative_term) @ target_maps.t()

        # L1 distance between every predicted box and every target box
        bbox_cost = torch.cdist(flat_boxes, gt_boxes, p=1)

        # Negated generalized IoU, computed on corner-format boxes
        giou_cost = -generalized_box_iou(center_to_corners_format(flat_boxes), center_to_corners_format(gt_boxes))

        # Weighted sum of the three terms, reshaped per image and moved to CPU for the assignment solver
        total_cost = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
        total_cost = total_cost.view(batch_size, num_queries, -1).cpu()

        # The last dimension concatenates the targets of all images; split it back and, for image `i`,
        # keep only the slice that pairs image `i`'s queries with image `i`'s targets.
        boxes_per_image = [len(target["boxes"]) for target in targets]
        assignments = []
        for image_idx, cost_chunk in enumerate(total_cost.split(boxes_per_image, -1)):
            assignments.append(linear_sum_assignment(cost_chunk[image_idx]))

        matches = []
        for query_idx, target_idx in assignments:
            matches.append(
                (torch.as_tensor(query_idx, dtype=torch.int64), torch.as_tensor(target_idx, dtype=torch.int64))
            )
        return matches
class GroundingDinoImageLoss(ImageLoss):
    """
    This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we
    compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of
    matched ground-truth / prediction (supervise class and box).

    Args:
        matcher (`GroundingDinoHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`list[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    """

    def __init__(self, matcher, focal_alpha, losses):
        # Only the `nn.Module` initializer is run here; `ImageLoss.__init__` is not invoked, so the three
        # attributes below are the only state set by this constructor.
        nn.Module.__init__(self)
        self.matcher = matcher
        self.focal_alpha = focal_alpha
        self.losses = losses

    @torch.no_grad()
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """
        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.

        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.

        Returns:
            `dict`: `{"cardinality_error": tensor}` with the L1 error between predicted and target box counts.
        """
        logits = outputs["logits"]
        device = logits.device
        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (sigmoid > 0.5 threshold)
        card_pred = (logits.sigmoid().max(-1).values > 0.5).sum(1)
        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
        return {"cardinality_error": card_err}

    def _get_target_classes_one_hot(self, outputs, targets, indices):
        """
        Build the classification target: a zero tensor shaped like the logits in which every matched query
        receives the label-map row of the class it was assigned to.

        Args:
            outputs (`dict`): Must contain "logits" and "label_maps" (one label map per image).
            targets (`list[dict]`): One dict per image, each with a "class_labels" tensor.
            indices (`list[tuple]`): Per-image `(prediction_indices, target_indices)` pairs from the matcher.

        Returns:
            `torch.LongTensor` with the same shape as `outputs["logits"]`.
        """
        logits = outputs["logits"]
        per_image_label_maps = outputs["label_maps"]

        # The label maps of all images are concatenated along dim 0 below, so the class labels of image `i`
        # must be shifted by the total number of rows contributed by images `0..i-1` (a running offset),
        # not by the length of image `i`'s own label map.
        shifted_labels = []
        row_offset = 0
        for i, (target, (_, target_idx)) in enumerate(zip(targets, indices)):
            shifted_labels.append(target["class_labels"][target_idx] + row_offset)
            row_offset += len(per_image_label_maps[i])
        class_labels = torch.cat(shifted_labels)

        label_maps = torch.cat(per_image_label_maps, dim=0)

        idx = self._get_source_permutation_idx(indices)
        target_classes_onehot = torch.zeros_like(logits, device=logits.device, dtype=torch.long)
        target_classes_onehot[idx] = label_maps[class_labels].to(torch.long)

        return target_classes_onehot

    def loss_labels(self, outputs, targets, indices, num_boxes):
        """
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
        of dim [nb_target_boxes]

        Raises:
            KeyError: If "logits" or "text_mask" is missing from `outputs`.

        Returns:
            `dict`: `{"loss_ce": tensor}` with the focal loss summed over the selected entries and divided by
            `num_boxes`.
        """
        if "logits" not in outputs:
            raise KeyError("No logits were found in the outputs")
        if "text_mask" not in outputs:
            raise KeyError("No text_mask were found in the outputs")

        target_classes_onehot = self._get_target_classes_one_hot(outputs, targets, indices)
        source_logits = outputs["logits"]
        text_mask = outputs["text_mask"]

        # Select only valid logits: both predictions and targets are reduced with the same mask,
        # so the two flattened tensors stay aligned element by element.
        source_logits = torch.masked_select(source_logits, text_mask)
        target_classes_onehot = torch.masked_select(target_classes_onehot, text_mask)

        target_classes_onehot = target_classes_onehot.float()
        loss_ce = sigmoid_focal_loss(
            inputs=source_logits,
            targets=target_classes_onehot,
            num_boxes=num_boxes,
            alpha=self.focal_alpha,
            gamma=2,
        )

        return {"loss_ce": loss_ce}
def GroundingDinoForObjectDetectionLoss(
    logits,
    labels,
    device,
    pred_boxes,
    config,
    label_maps,
    text_mask,
    outputs_class=None,
    outputs_coord=None,
    encoder_logits=None,
    encoder_pred_boxes=None,
):
    """
    Build the matcher and criterion from `config`, evaluate the individual losses and combine them into one
    weighted total.

    Args:
        logits: Classification logits of the final predictions.
        labels: Targets forwarded unchanged to the criterion.
        device: Device the criterion is moved to.
        pred_boxes: Predicted boxes of the final predictions.
        config: Object providing `class_cost`, `bbox_cost`, `giou_cost`, `focal_alpha`, `auxiliary_loss`,
            `two_stage`, `bbox_loss_coefficient`, `giou_loss_coefficient` and (when `auxiliary_loss` is set)
            `decoder_layers`.
        label_maps: Label maps attached to every set of outputs passed to the criterion.
        text_mask: Text mask attached to every set of outputs passed to the criterion.
        outputs_class, outputs_coord: Inputs to `_set_aux_loss`; only used when `config.auxiliary_loss` is set.
        encoder_logits, encoder_pred_boxes: Encoder predictions; only used when `config.two_stage` is set.

    Returns:
        `tuple`: `(loss, loss_dict, auxiliary_outputs)` where `loss` is the weighted sum of the entries of
        `loss_dict` that have a weight, and `auxiliary_outputs` is `None` unless `config.auxiliary_loss` is set.
    """
    # Matcher and criterion
    criterion = GroundingDinoImageLoss(
        matcher=GroundingDinoHungarianMatcher(
            class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost
        ),
        focal_alpha=config.focal_alpha,
        losses=["labels", "boxes", "cardinality"],
    )
    criterion.to(device)

    # Outputs handed to the criterion for the final predictions
    outputs_loss = {
        "logits": logits,
        "pred_boxes": pred_boxes,
        "label_maps": label_maps,
        "text_mask": text_mask,
    }

    auxiliary_outputs = None
    if config.auxiliary_loss:
        auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord)
        # Every auxiliary output needs the same text information as the final predictions
        for aux_output in auxiliary_outputs:
            aux_output["label_maps"] = label_maps
            aux_output["text_mask"] = text_mask
        outputs_loss["auxiliary_outputs"] = auxiliary_outputs

    loss_dict = criterion(outputs_loss, labels)

    if config.two_stage:
        # Encoder predictions go through the same criterion; their entries are stored with an "_enc" suffix
        encoder_outputs_loss = {
            "logits": encoder_logits,
            "pred_boxes": encoder_pred_boxes,
            "label_maps": label_maps,
            "text_mask": text_mask,
        }
        encoder_loss_dict = criterion(encoder_outputs_loss, labels)
        for name, value in encoder_loss_dict.items():
            loss_dict[name + "_enc"] = value

    # Weights of the individual terms in the total loss
    weight_dict = {
        "loss_ce": 2.0,
        "loss_bbox": config.bbox_loss_coefficient,
        "loss_giou": config.giou_loss_coefficient,
    }
    if config.two_stage:
        weight_dict.update({name + "_enc": weight for name, weight in weight_dict.items()})
    if config.auxiliary_loss:
        # Replicate every weight known so far once per auxiliary layer index
        base_weights = dict(weight_dict)
        for layer_idx in range(config.decoder_layers - 1):
            for name, weight in base_weights.items():
                weight_dict[name + f"_{layer_idx}"] = weight

    # Entries of `loss_dict` without a weight are left out of the total
    loss = sum(value * weight_dict[name] for name, value in loss_dict.items() if name in weight_dict)
    return loss, loss_dict, auxiliary_outputs
|