| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071 |
- # Copyright 2022 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from collections import defaultdict
- from collections.abc import Collection, Iterable
- from math import ceil
- from typing import Optional, Union
- import numpy as np
- from .image_utils import (
- ChannelDimension,
- ImageInput,
- get_channel_dimension_axis,
- get_image_size,
- infer_channel_dimension_format,
- )
- from .utils import ExplicitEnum, TensorType, is_torch_tensor
- from .utils.import_utils import (
- is_torch_available,
- is_vision_available,
- requires_backends,
- )
- if is_vision_available():
- import PIL
- from .image_utils import PILImageResampling
- if is_torch_available():
- import torch
def to_channel_dimension_format(
    image: np.ndarray,
    channel_dim: ChannelDimension | str,
    input_channel_dim: ChannelDimension | str | None = None,
) -> np.ndarray:
    """
    Returns `image` laid out in the channel dimension format given by `channel_dim`.

    Any number of leading dimensions is allowed; only the last three axes are permuted.

    Args:
        image (`numpy.ndarray`):
            The image whose channel axis should be moved.
        channel_dim (`ChannelDimension`):
            The channel dimension format to produce.
        input_channel_dim (`ChannelDimension`, *optional*):
            The channel dimension format of `image`. Inferred from the image when not provided.

    Returns:
        `np.ndarray`: The image with its channel axis positioned according to `channel_dim`. The input is returned
        unchanged when it is already in the requested format.

    Raises:
        TypeError: If `image` is not a `np.ndarray`.
        ValueError: If the requested format is neither channels-first nor channels-last.
    """
    if not isinstance(image, np.ndarray):
        raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}")

    source_format = infer_channel_dimension_format(image) if input_channel_dim is None else input_channel_dim
    target_format = ChannelDimension(channel_dim)
    if source_format == target_format:
        return image

    # Positions of the three trailing axes; everything before them is left in place.
    first, second, third = image.ndim - 3, image.ndim - 2, image.ndim - 1
    leading_axes = list(range(first))
    if target_format == ChannelDimension.FIRST:
        trailing_axes = [third, first, second]
    elif target_format == ChannelDimension.LAST:
        trailing_axes = [second, third, first]
    else:
        raise ValueError(f"Unsupported channel dimension format: {channel_dim}")
    return image.transpose(leading_axes + trailing_axes)
def rescale(
    image: np.ndarray,
    scale: float,
    data_format: ChannelDimension | None = None,
    dtype: np.dtype = np.float32,
    input_data_format: str | ChannelDimension | None = None,
) -> np.ndarray:
    """
    Multiplies every value of `image` by `scale`.

    Args:
        image (`np.ndarray`):
            The image to rescale.
        scale (`float`):
            The factor the pixel values are multiplied by.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output. When not provided the layout of the input is kept.
        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
            The dtype of the returned image. Kept for backwards compatibility with feature extractors.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. Inferred from the image when not provided.

    Returns:
        `np.ndarray`: The rescaled image.

    Raises:
        TypeError: If `image` is not a `np.ndarray`.
    """
    if not isinstance(image, np.ndarray):
        raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}")

    # Upcast before multiplying (NumPy type promotion has changed), and only cast to `dtype` at the very end.
    scaled = image.astype(np.float64) * scale
    if data_format is not None:
        scaled = to_channel_dimension_format(scaled, data_format, input_data_format)
    return scaled.astype(dtype)
- def _rescale_for_pil_conversion(image):
- """
- Detects whether or not the image needs to be rescaled before being converted to a PIL image.
- The assumption is that if the image is of type `np.float` and all values are between 0 and 1, it needs to be
- rescaled.
- """
- if image.dtype == np.uint8:
- do_rescale = False
- elif np.allclose(image, image.astype(int)):
- if np.all(image >= 0) and np.all(image <= 255):
- do_rescale = False
- else:
- raise ValueError(
- "The image to be converted to a PIL image contains values outside the range [0, 255], "
- f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
- )
- elif np.all(image >= 0) and np.all(image <= 1):
- do_rescale = True
- else:
- raise ValueError(
- "The image to be converted to a PIL image contains values outside the range [0, 1], "
- f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
- )
- return do_rescale
def to_pil_image(
    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor"],
    do_rescale: bool | None = None,
    image_mode: str | None = None,
    input_data_format: str | ChannelDimension | None = None,
) -> "PIL.Image.Image":
    """
    Converts `image` to a PIL Image, moving the channel axis to the end and rescaling the values when needed.

    Args:
        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
            The image to convert. A PIL image is returned as is.
        do_rescale (`bool`, *optional*):
            Whether to multiply the values by 255 before casting to `uint8`. When unset, the decision is made by
            `_rescale_for_pil_conversion`.
        image_mode (`str`, *optional*):
            The mode to use for the PIL image. If unset, the default mode for the input image type is used.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. Inferred from the image when not provided.

    Returns:
        `PIL.Image.Image`: The converted image.

    Raises:
        ValueError: If `image` is not a PIL image, a torch tensor or a numpy array.
    """
    requires_backends(to_pil_image, ["vision"])

    if isinstance(image, PIL.Image.Image):
        return image

    # Tensors go through numpy on their way to PIL.
    if is_torch_tensor(image):
        image = image.numpy()
    elif not isinstance(image, np.ndarray):
        raise ValueError(f"Input image type not supported: {type(image)}")

    # PIL expects the channel axis last.
    image = to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format)

    # A trailing channel axis of size one is dropped, as PIL can't handle it otherwise.
    if image.shape[-1] == 1:
        image = np.squeeze(image, axis=-1)

    # PIL.Image can only store uint8 values, so bring the values into [0, 255] when needed.
    if do_rescale is None:
        do_rescale = _rescale_for_pil_conversion(image)
    if do_rescale:
        image = rescale(image, 255)

    return PIL.Image.fromarray(image.astype(np.uint8), mode=image_mode)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
    """
    Computes the output `(height, width)` obtained by matching the shorter edge of the image to `size` while
    keeping the aspect ratio, optionally capping the longer edge at `max_size`.

    Args:
        image_size (`tuple[int, int]`):
            The input image size as `(height, width)`.
        size (`int`):
            The desired size of the shorter edge.
        max_size (`int`, *optional*):
            The maximum allowed size of the longer edge.

    Returns:
        `tuple[int, int]`: The output `(height, width)`.
    """
    height, width = image_size

    # When the cap applies, keep the unrounded target so the other edge is scaled from the exact value.
    unrounded_size = None
    if max_size is not None:
        shorter_edge = float(min((height, width)))
        longer_edge = float(max((height, width)))
        if longer_edge / shorter_edge * size > max_size:
            unrounded_size = max_size * shorter_edge / longer_edge
            size = int(round(unrounded_size))

    shorter_edge_already_matches = (height <= width and height == size) or (width <= height and width == size)
    if shorter_edge_already_matches:
        return (height, width)

    scale_reference = size if unrounded_size is None else unrounded_size
    if width < height:
        return (int(scale_reference * height / width), size)
    return (size, int(scale_reference * width / height))
- # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366
def get_resize_output_image_size(
    input_image: np.ndarray,
    size: int | tuple[int, int] | list[int] | tuple[int, ...],
    default_to_square: bool = True,
    max_size: int | None = None,
    input_data_format: str | ChannelDimension | None = None,
) -> tuple:
    """
    Computes the target `(height, width)` of an image after resizing, given the image and the requested size.

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        size (`int` or `tuple[int, int]` or `list[int]` or `tuple[int]`):
            The requested size. A two-element sequence `(h, w)` is returned as is. A single value (an int or a
            one-element sequence) is expanded to `(size, size)` when `default_to_square` is `True`; otherwise the
            smaller edge of the image is matched to it, i.e. if height > width the output is
            `(size * height / width, size)`.
        default_to_square (`bool`, *optional*, defaults to `True`):
            How a single-valued `size` is interpreted. If `False`, replicates
            [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
            with support for resizing only the smallest edge and providing an optional `max_size`.
        max_size (`int`, *optional*):
            Upper bound for the longer edge of the output. If the longer edge exceeds it after matching the smaller
            edge to `size`, both edges are scaled down so that the longer edge equals `max_size`; the smaller edge
            may then be shorter than `size`. Only used if `default_to_square` is `False`.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. Inferred from the image when not provided.

    Returns:
        `tuple`: The target `(height, width)` of the output image.

    Raises:
        ValueError: If `size` is a sequence with neither 1 nor 2 elements, or if `max_size` is not strictly greater
            than the requested smaller edge.
    """
    if isinstance(size, (tuple, list)):
        if len(size) == 2:
            return tuple(size)
        if len(size) != 1:
            raise ValueError("size must have 1 or 2 elements if it is a list or tuple")
        # A one-element sequence is handled exactly like a plain int.
        size = size[0]

    if default_to_square:
        return (size, size)

    height, width = get_image_size(input_image, input_data_format)
    width_is_short_edge = width <= height
    short, long = (width, height) if width_is_short_edge else (height, width)

    new_short = size
    new_long = int(size * long / short)

    if max_size is not None:
        if max_size <= size:
            raise ValueError(
                f"max_size = {max_size} must be strictly greater than the requested "
                f"size for the smaller edge size = {size}"
            )
        if new_long > max_size:
            new_short, new_long = int(max_size * new_short / new_long), max_size

    return (new_long, new_short) if width_is_short_edge else (new_short, new_long)
def resize(
    image: np.ndarray,
    size: tuple[int, int],
    resample: Optional["PILImageResampling"] = None,
    reducing_gap: int | None = None,
    data_format: ChannelDimension | None = None,
    return_numpy: bool = True,
    input_data_format: str | ChannelDimension | None = None,
) -> np.ndarray:
    """
    Resizes `image` to the `(height, width)` given by `size`, using the PIL library.

    Args:
        image (`np.ndarray`):
            The image to resize.
        size (`tuple[int, int]`):
            The target `(height, width)`.
        resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            The filter to use for resampling.
        reducing_gap (`int`, *optional*):
            Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result
            to the fair resampling. See corresponding Pillow documentation for more details.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output image. Defaults to the format of the input image.
        return_numpy (`bool`, *optional*, defaults to `True`):
            Whether to return the result as a numpy array. If `False`, a `PIL.Image.Image` object is returned.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. Inferred from the image when not provided.

    Returns:
        `np.ndarray`: The resized image.

    Raises:
        ValueError: If `size` does not have exactly 2 elements.
    """
    requires_backends(resize, ["vision"])

    if resample is None:
        resample = PILImageResampling.BILINEAR

    if len(size) != 2:
        raise ValueError("size must have 2 elements")

    # Unless told otherwise, the output keeps the layout of the input. PIL always hands back channels-last
    # data, so the input layout has to be known before converting.
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    if data_format is None:
        data_format = input_data_format

    # Resizing goes through pillow to stay backwards compatible with earlier image feature extractors.
    do_rescale = False
    if not isinstance(image, PIL.Image.Image):
        do_rescale = _rescale_for_pil_conversion(image)
        image = to_pil_image(image, do_rescale=do_rescale, input_data_format=input_data_format)

    height, width = size
    # PIL takes sizes as (width, height).
    resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap)
    if not return_numpy:
        return resized_image

    resized_image = np.array(resized_image)
    # A channel axis of size one is dropped when converting to PIL, so it is added back if it is missing.
    if resized_image.ndim == 2:
        resized_image = np.expand_dims(resized_image, axis=-1)
    # Data coming back from PIL is always channels-last.
    resized_image = to_channel_dimension_format(resized_image, data_format, input_channel_dim=ChannelDimension.LAST)
    # Undo the [0, 255] scaling that was applied for the PIL conversion.
    if do_rescale:
        resized_image = rescale(resized_image, 1 / 255)
    return resized_image
def normalize(
    image: np.ndarray,
    mean: float | Collection[float],
    std: float | Collection[float],
    data_format: ChannelDimension | None = None,
    input_data_format: str | ChannelDimension | None = None,
) -> np.ndarray:
    """
    Normalizes `image` with the given per-channel `mean` and `std`.

    image = (image - mean) / std

    Args:
        image (`np.ndarray`):
            The image to normalize.
        mean (`float` or `Collection[float]`):
            The mean to subtract. A single value is applied to every channel.
        std (`float` or `Collection[float]`):
            The standard deviation to divide by. A single value is applied to every channel.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output image. When unset the layout of the input is kept.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. Inferred from the image when not provided.

    Returns:
        `np.ndarray`: The normalized image.

    Raises:
        TypeError: If `image` is not a numpy array.
        ValueError: If `mean` or `std` is a collection whose length differs from the number of channels.
    """
    if not isinstance(image, np.ndarray):
        raise TypeError("image must be a numpy array")

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
    num_channels = image.shape[channel_axis]

    # Non-float inputs are cast to float32 to avoid errors when subtracting uint8 values.
    # Float inputs keep their dtype so that float16 is not upcast.
    if not np.issubdtype(image.dtype, np.floating):
        image = image.astype(np.float32)

    if not isinstance(mean, Collection):
        mean = [mean] * num_channels
    elif len(mean) != num_channels:
        raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}")
    mean = np.array(mean, dtype=image.dtype)

    if not isinstance(std, Collection):
        std = [std] * num_channels
    elif len(std) != num_channels:
        raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}")
    std = np.array(std, dtype=image.dtype)

    if input_data_format == ChannelDimension.LAST:
        image = (image - mean) / std
    else:
        # Transposing puts the channel axis last so the per-channel statistics broadcast, then it is undone.
        image = ((image.T - mean) / std).T

    if data_format is not None:
        image = to_channel_dimension_format(image, data_format, input_data_format)
    return image
def center_crop(
    image: np.ndarray,
    size: tuple[int, int],
    data_format: str | ChannelDimension | None = None,
    input_data_format: str | ChannelDimension | None = None,
) -> np.ndarray:
    """
    Takes a center crop of `image` with the given `size`. If the image is smaller than the requested size it is
    zero-padded, so the result always has size `size`.

    Args:
        image (`np.ndarray`):
            The image to crop.
        size (`tuple[int, int]`):
            The target `(height, width)` of the cropped image.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, the format of the input image is used.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, the format is inferred from the input image.

    Returns:
        `np.ndarray`: The cropped image.

    Raises:
        TypeError: If `image` is not a `np.ndarray`.
        ValueError: If `size` is not an iterable of two elements.
    """
    requires_backends(center_crop, ["vision"])

    if not isinstance(image, np.ndarray):
        raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}")

    if not isinstance(size, Iterable) or len(size) != 2:
        raise ValueError("size must have 2 elements representing the height and width of the output image")

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    output_data_format = input_data_format if data_format is None else data_format

    # The crop itself is done in (C, H, W) layout; the result is converted to the output layout at the end.
    image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)

    orig_height, orig_width = get_image_size(image, ChannelDimension.FIRST)
    crop_height, crop_width = (int(dim) for dim in size)

    # Start from the top-left corner and add the crop size: for odd sizes, (orig + crop) // 2 would be off.
    top = (orig_height - crop_height) // 2
    bottom = top + crop_height
    left = (orig_width - crop_width) // 2
    right = left + crop_width

    crop_fits_in_image = top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width
    if crop_fits_in_image:
        cropped = image[..., top:bottom, left:right]
        return to_channel_dimension_format(cropped, output_data_format, ChannelDimension.FIRST)

    # The image is too small along at least one axis: paste it centered onto a zero canvas that is large
    # enough, then take the crop from that canvas.
    padded_height = max(crop_height, orig_height)
    padded_width = max(crop_width, orig_width)
    canvas = np.zeros_like(image, shape=image.shape[:-2] + (padded_height, padded_width))

    offset_y = ceil((padded_height - orig_height) / 2)
    offset_x = ceil((padded_width - orig_width) / 2)
    canvas[..., offset_y : offset_y + orig_height, offset_x : offset_x + orig_width] = image

    # Shift the crop window by the same offsets the image was pasted with.
    top, bottom = top + offset_y, bottom + offset_y
    left, right = left + offset_x, right + offset_x

    cropped = canvas[..., max(0, top) : min(padded_height, bottom), max(0, left) : min(padded_width, right)]
    return to_channel_dimension_format(cropped, output_data_format, ChannelDimension.FIRST)
- def _center_to_corners_format_torch(bboxes_center: "torch.Tensor") -> "torch.Tensor":
- center_x, center_y, width, height = bboxes_center.unbind(-1)
- bbox_corners = torch.stack(
- # top left x, top left y, bottom right x, bottom right y
- [(center_x - 0.5 * width), (center_y - 0.5 * height), (center_x + 0.5 * width), (center_y + 0.5 * height)],
- dim=-1,
- )
- return bbox_corners
- def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray:
- center_x, center_y, width, height = bboxes_center.T
- bboxes_corners = np.stack(
- # top left x, top left y, bottom right x, bottom right y
- [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
- axis=-1,
- )
- return bboxes_corners
- # 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
    """
    Converts bounding boxes from center format to corners format.

    center format: the center of the box followed by its dimensions
        (center_x, center_y, width, height)
    corners format: the top-left corner followed by the bottom-right corner
        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)

    Raises:
        ValueError: If the input is neither a torch tensor nor a numpy array.
    """
    # This runs during the model forward pass, so torch inputs stay in torch instead of going through numpy.
    if is_torch_tensor(bboxes_center):
        return _center_to_corners_format_torch(bboxes_center)
    if isinstance(bboxes_center, np.ndarray):
        return _center_to_corners_format_numpy(bboxes_center)

    raise ValueError(f"Unsupported input type {type(bboxes_center)}")
- def _corners_to_center_format_torch(bboxes_corners: "torch.Tensor") -> "torch.Tensor":
- top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1)
- b = [
- (top_left_x + bottom_right_x) / 2, # center x
- (top_left_y + bottom_right_y) / 2, # center y
- (bottom_right_x - top_left_x), # width
- (bottom_right_y - top_left_y), # height
- ]
- return torch.stack(b, dim=-1)
- def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray:
- top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.T
- bboxes_center = np.stack(
- [
- (top_left_x + bottom_right_x) / 2, # center x
- (top_left_y + bottom_right_y) / 2, # center y
- (bottom_right_x - top_left_x), # width
- (bottom_right_y - top_left_y), # height
- ],
- axis=-1,
- )
- return bboxes_center
def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
    """
    Converts bounding boxes from corners format to center format.

    corners format: the top-left corner followed by the bottom-right corner
        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    center format: the center of the box followed by its dimensions
        (center_x, center_y, width, height)

    Raises:
        ValueError: If the input is neither a torch tensor nor a numpy array.
    """
    # The inverse conversion accepts several input types, so this one dispatches on the input type as well.
    if is_torch_tensor(bboxes_corners):
        return _corners_to_center_format_torch(bboxes_corners)
    if isinstance(bboxes_corners, np.ndarray):
        return _corners_to_center_format_numpy(bboxes_corners)

    raise ValueError(f"Unsupported input type {type(bboxes_corners)}")
- def safe_squeeze(
- tensor: Union[np.ndarray, "torch.Tensor"], axis: int | None = None
- ) -> Union[np.ndarray, "torch.Tensor"]:
- """
- Squeezes a tensor, but only if the axis specified has dim 1.
- """
- if axis is None:
- return tensor.squeeze()
- try:
- return tensor.squeeze(axis=axis)
- except ValueError:
- return tensor
- # 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
- # Copyright (c) 2018, Alexander Kirillov
- # All rights reserved.
def rgb_to_id(color):
    """
    Converts an RGB color to a unique ID, computed as `R + 256 * G + 256 * 256 * B`.

    Args:
        color:
            Either a 3-dimensional `np.ndarray` whose last axis holds the (R, G, B) channels, in which case an
            array of IDs is returned, or a single color indexable as `color[0]`, `color[1]`, `color[2]`, in
            which case a Python `int` is returned.

    Returns:
        `np.ndarray` or `int`: The ID map for a 3-dimensional array input, otherwise the ID of the single color.
    """
    if isinstance(color, np.ndarray):
        # Widen uint8 data for every array input, not only 3-dimensional ones: the IDs do not fit in 8 bits,
        # and multiplying a uint8 value by 256 raises an OverflowError under NumPy 2 promotion rules.
        if color.dtype == np.uint8:
            color = color.astype(np.int32)
        if color.ndim == 3:
            return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
    return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
def id_to_rgb(id_map):
    """
    Converts a unique ID back to an RGB color by splitting it into three base-256 digits, lowest digit first.

    For a `np.ndarray` of IDs, a `uint8` array with an extra trailing axis of size 3 is returned and the input is
    left untouched. For a single ID, a list with the three channel values is returned.
    """
    if not isinstance(id_map, np.ndarray):
        remaining = id_map
        channels = []
        for _ in range(3):
            remaining, channel_value = divmod(remaining, 256)
            channels.append(channel_value)
        return channels

    # Work on a copy so the caller's array is not modified by the in-place division below.
    remaining = id_map.copy()
    rgb_map = np.zeros((*id_map.shape, 3), dtype=np.uint8)
    for channel in range(3):
        rgb_map[..., channel] = remaining % 256
        remaining //= 256
    return rgb_map
class PaddingMode(ExplicitEnum):
    """
    The padding modes that can be selected when padding images with `pad`.
    """

    # Pad with a constant value.
    CONSTANT = "constant"
    # Pad with the reflection of the values, mirrored on the first and last values along each axis.
    REFLECT = "reflect"
    # Pad by repeating the edge value along each axis (`pad` maps this to numpy's "edge" mode).
    REPLICATE = "replicate"
    # Pad with the reflection of the values, mirrored along the edge of the array.
    SYMMETRIC = "symmetric"
def pad(
    image: np.ndarray,
    padding: int | tuple[int, int] | Iterable[tuple[int, int]],
    mode: PaddingMode = PaddingMode.CONSTANT,
    constant_values: float | Iterable[float] = 0.0,
    data_format: str | ChannelDimension | None = None,
    input_data_format: str | ChannelDimension | None = None,
) -> np.ndarray:
    """
    Pads the height and width axes of `image` according to `padding` and `mode`.

    Args:
        image (`np.ndarray`):
            The image to pad.
        padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
            Padding to apply to the edges of the height, width axes. Can be one of three formats:
            - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
            - `((before, after),)` yields same before and after pad for height and width.
            - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
        mode (`PaddingMode`):
            The padding mode to use. Can be one of:
                - `"constant"`: pads with a constant value.
                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                  vector along each axis.
                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
        constant_values (`float` or `Iterable[float]`, *optional*):
            The value to use for the padding if `mode` is `"constant"`.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use same as the input image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.

    Returns:
        `np.ndarray`: The padded image.

    Raises:
        ValueError: If `padding` / `constant_values` has an unsupported structure, or if `mode` is not a valid
            padding mode.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    def _expand_for_data_format(values):
        """
        Expands `values` into the per-axis `((before, after), ...)` structure np.pad expects for this image.
        """
        is_tuple = isinstance(values, tuple)
        if isinstance(values, (int, float)):
            spatial = ((values, values), (values, values))
        elif is_tuple and len(values) == 1:
            spatial = ((values[0], values[0]), (values[0], values[0]))
        elif is_tuple and len(values) == 2 and isinstance(values[0], int):
            spatial = (values, values)
        elif is_tuple and len(values) == 2 and isinstance(values[0], tuple):
            spatial = values
        else:
            raise ValueError(f"Unsupported format: {values}")

        # The channel axis gets a (0, 0) entry, placed according to the input layout.
        if input_data_format == ChannelDimension.FIRST:
            per_axis = ((0, 0), *spatial)
        else:
            per_axis = (*spatial, (0, 0))

        # A 4-dimensional image carries a leading batch axis, which gets a (0, 0) entry as well.
        if image.ndim == 4:
            per_axis = ((0, 0), *per_axis)
        return per_axis

    padding = _expand_for_data_format(padding)

    if mode == PaddingMode.CONSTANT:
        pad_kwargs = {"mode": "constant", "constant_values": _expand_for_data_format(constant_values)}
    elif mode == PaddingMode.REFLECT:
        pad_kwargs = {"mode": "reflect"}
    elif mode == PaddingMode.REPLICATE:
        # numpy calls this mode "edge".
        pad_kwargs = {"mode": "edge"}
    elif mode == PaddingMode.SYMMETRIC:
        pad_kwargs = {"mode": "symmetric"}
    else:
        raise ValueError(f"Invalid padding mode: {mode}")
    image = np.pad(image, padding, **pad_kwargs)

    if data_format is not None:
        image = to_channel_dimension_format(image, data_format, input_data_format)
    return image
- # TODO (Amy): Accept 1/3/4 channel numpy array as input and return np.array as default
def convert_to_rgb(image: ImageInput) -> ImageInput:
    """
    Return an RGB version of `image` when it is a PIL image; any other input is handed back untouched.

    Args:
        image (Image):
            The image to convert.

    Returns:
        The input object itself when it is not a `PIL.Image.Image` or when its mode is already `"RGB"`, otherwise
        the result of `image.convert("RGB")`.
    """
    requires_backends(convert_to_rgb, ["vision"])

    # Only PIL images that are not yet in RGB mode need any work.
    needs_conversion = isinstance(image, PIL.Image.Image) and image.mode != "RGB"
    return image.convert("RGB") if needs_conversion else image
def flip_channel_order(
    image: np.ndarray,
    data_format: ChannelDimension | None = None,
    input_data_format: str | ChannelDimension | None = None,
) -> np.ndarray:
    """
    Reverses the order of the channels of an image, e.g. turning RGB into BGR or BGR into RGB.

    Args:
        image (`np.ndarray`):
            The image whose channels are reversed.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, the output keeps the layout of the input image.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, the format is inferred from the input image.

    Returns:
        `np.ndarray`: The image with its channel axis reversed.

    Raises:
        ValueError: If the (given or inferred) input format is neither channels-first nor channels-last.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    if input_data_format == ChannelDimension.LAST:
        # Channels live on the trailing axis.
        flipped = image[..., ::-1]
    elif input_data_format == ChannelDimension.FIRST:
        # Channels live on the leading axis.
        flipped = image[::-1, ...]
    else:
        raise ValueError(f"Unsupported channel dimension: {input_data_format}")

    if data_format is None:
        return flipped
    return to_channel_dimension_format(flipped, data_format, input_channel_dim=input_data_format)
def split_to_tiles(images: "torch.Tensor", num_tiles_height: int, num_tiles_width: int) -> "torch.Tensor":
    """
    Cuts every image of a batch into a `num_tiles_height` x `num_tiles_width` grid of equally sized tiles.

    Args:
        images (`torch.Tensor`):
            Batch of images of shape `(batch_size, num_channels, height, width)`.
        num_tiles_height (`int`):
            Number of tiles along the height axis.
        num_tiles_width (`int`):
            Number of tiles along the width axis.

    Returns:
        `torch.Tensor`: Tensor of shape `(batch_size, num_tiles_height * num_tiles_width, num_channels,
        height // num_tiles_height, width // num_tiles_width)`. Tiles are ordered row by row over the grid.
    """
    batch_size, num_channels, height, width = images.size()
    tile_height = height // num_tiles_height
    tile_width = width // num_tiles_width

    # Split the height axis into (tile row, row inside tile) and the width axis into (tile column, column inside tile).
    grid = images.view(batch_size, num_channels, num_tiles_height, tile_height, num_tiles_width, tile_width)

    # Move both tile-grid axes in front of the channel axis: (batch, tile row, tile column, channels, h, w).
    grid = grid.permute(0, 2, 4, 1, 3, 5).contiguous()

    # Merge the two tile-grid axes into a single "tile" axis.
    return grid.view(batch_size, num_tiles_width * num_tiles_height, num_channels, tile_height, tile_width)
def divide_to_patches(
    image: Union[np.ndarray, "torch.Tensor"], patch_size: int
) -> list[Union[np.ndarray, "torch.Tensor"]]:
    """
    Slices an image into a grid of patches with side length `patch_size`.

    The image size is read with `channel_dim=ChannelDimension.FIRST`, and patches are cut from the last two axes.
    Slicing past the border simply yields smaller patches at the bottom and right edges.

    Args:
        image (`np.array | "torch.Tensor"`):
            The input image.
        patch_size (`int`):
            The size of each patch.

    Returns:
        list: A list of `np.array | "torch.Tensor"` patches, ordered row by row (top to bottom, left to right).
    """
    height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
    return [
        image[:, top : top + patch_size, left : left + patch_size]
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = False):
    """
    Helper function that buckets images (and any values paired with them) by image shape.

    The shape key of an image is `image.shape[1:]`, i.e. its shape without the leading axis.

    Args:
        nested_images (list):
            A list of images or a single tensor. When `is_nested=True`, a list of such lists.
        paired_inputs (Any, *optional*):
            Zero or more lists that mirror the structure of `nested_images` (flat list, or list of lists when
            `is_nested=True`). Each element is paired 1:1 with the corresponding image and is stored under the same
            shape key. The paired values are only collected into lists, so they do not need to be tensors.
        is_nested (bool, *optional*, defaults to False):
            Whether the images are nested.

    Returns:
        tuple[dict, ...]:
            - A dictionary with shape as key and the list of images with that shape as value
            - One dictionary per entry of `paired_inputs`, with shape as key and the list of paired values as value
            - A dictionary mapping original indices (`j` for flat inputs, `(i, j)` for nested inputs) to
              `(shape, position inside the shape bucket)` tuples. For nested inputs it additionally holds the
              number of outer sublists under the key `"_num_sublists"`.
    """
    # Wrap flat inputs into a single outer row so both layouts share the same double loop.
    if is_nested:
        image_rows = nested_images
        paired_rows = list(paired_inputs)
    else:
        image_rows = [nested_images]
        paired_rows = [[paired_input] for paired_input in paired_inputs]

    images_by_shape = defaultdict(list)
    paired_by_shape = [defaultdict(list) for _ in paired_inputs]
    index_map = {}

    for row_idx, row_group in enumerate(zip(image_rows, *paired_rows)):
        # `row_group` is (image sublist, paired sublist 1, paired sublist 2, ...); walk them in lockstep.
        for col_idx, item_group in enumerate(zip(*row_group)):
            image, *paired_values = item_group
            shape = image.shape[1:]

            bucket = images_by_shape[shape]
            position = len(bucket)
            bucket.append(image)

            for target, paired_value in zip(paired_by_shape, paired_values):
                target[shape].append(paired_value)

            index_map[(row_idx, col_idx) if is_nested else col_idx] = (shape, position)

    # Remember the outer length so empty sublists can be restored during reconstruction.
    if is_nested:
        index_map["_num_sublists"] = len(image_rows)

    return images_by_shape, *paired_by_shape, index_map
def _reconstruct_nested_structure(indices, processed_images):
    """
    Helper function to reconstruct a single level nested structure.

    Args:
        indices (dict):
            Dictionary mapping `(outer index, inner index)` keys to `(shape, index)` tuples. It may additionally
            contain the key `"_num_sublists"` holding the number of outer sublists, which allows empty sublists
            (as in `[[], [image]]`) to be restored. The dictionary is not modified, so the same index can be used
            for several reconstructions.
        processed_images (dict):
            Dictionary mapping each shape to an indexable collection of processed images.

    Returns:
        list[list]: One inner list per outer index. Outer positions that have no entry in `indices` become empty
        lists. Returns `[]` when there are no entries and no `"_num_sublists"` value.
    """
    sentinel_key = "_num_sublists"

    # Read the sentinel without popping it: removing it would make a later reconstruction with the same
    # index dictionary lose trailing empty sublists.
    num_sublists = indices.get(sentinel_key)

    # Group inner indices by outer index, skipping the bookkeeping entry.
    nested_indices = defaultdict(list)
    for key in indices:
        if key == sentinel_key:
            continue
        outer, inner = key
        nested_indices[outer].append(inner)

    # Without an explicit count, fall back to the largest outer index that was seen.
    if num_sublists is None:
        if not nested_indices:
            return []
        num_sublists = max(nested_indices) + 1

    result = []
    for outer in range(num_sublists):
        inner_positions = nested_indices.get(outer)
        if not inner_positions:
            result.append([])
            continue
        inner_list = [None] * (max(inner_positions) + 1)
        for inner in inner_positions:
            shape, idx = indices[(outer, inner)]
            inner_list[inner] = processed_images[shape][idx]
        result.append(inner_list)
    return result
def _iterate_items(items, is_nested: bool):
    """
    Helper generator that walks over `items` and yields `(key, item)` pairs.

    Flat structures yield `(index, item)`.
    Nested structures yield `((row_index, col_index), item)`; empty rows contribute nothing.
    """
    if not is_nested:
        yield from enumerate(items)
        return

    for row_idx, row in enumerate(items):
        for col_idx, entry in enumerate(row):
            yield (row_idx, col_idx), entry
def _get_device_from_images(images, is_nested: bool) -> "torch.device":
    """
    Return the device of the first usable element of a (potentially nested) list of images.

    For nested inputs the rows are scanned in order: a row that is itself a tensor provides its own device, and a
    non-empty list row provides the device of its first element. This copes with inputs such as
    `images = [[], [image]]`, where the first sublist is empty. Flat inputs, and nested inputs in which no row
    qualified, use the device of `images[0]`.
    """
    if is_nested:
        for row in images:
            if isinstance(row, torch.Tensor):
                return row.device
            if isinstance(row, list) and row:
                return row[0].device

    # Flat input, or no row above provided a device.
    return images[0].device
def group_images_by_shape(
    images: Union[list["torch.Tensor"], "torch.Tensor"],
    *paired_inputs,
    disable_grouping: bool | None,
    is_nested: bool = False,
) -> tuple[dict, ...]:
    """
    Groups images by shape.
    Returns a dictionary with the shape as key and the stacked images with that shape as value,
    and a dictionary with the index of the image in the original list as key and the shape and index in the grouped
    batch as value.

    The function supports both flat lists of tensors and nested structures.
    The input must be either all flat or all nested, not a mix of both.

    Args:
        images (Union[list["torch.Tensor"], "torch.Tensor"]):
            A list of images or a single tensor
        paired_inputs (Any, *optional*):
            Zero or more lists that mirror the structure of `images` (flat list, or list of lists when
            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
            same key. These paired values are grouped alongside `images` but are not stacked in the output, so
            they do not need to be tensors.
        disable_grouping (bool):
            Whether to disable grouping. If None, will be set to True if the images are on CPU, and False otherwise.
            This choice is based on empirical observations, as detailed here: https://github.com/huggingface/transformers/pull/38157
        is_nested (bool, *optional*, defaults to False):
            Whether the images are nested.

    Returns:
        tuple[dict, ...]:
            - A dictionary with shape as key and the batch of images with that shape as value. When grouping is
              disabled, the key is the original index of the image and the value is that single image with a
              leading batch axis of size 1.
            - Zero or more dictionaries (one per argument in `*paired_inputs`) grouped consistently with `images`;
              these carry the corresponding per-item values and are not stacked. When grouping is disabled, each
              value is the paired item with a leading axis of size 1 if it has an `unsqueeze` method, and a
              one-element list holding the item otherwise.
            - A dictionary mapping original indices to (key, index) tuples. For nested inputs it also stores the
              number of outer sublists under `"_num_sublists"`.
    """
    # If disable grouping is not explicitly provided, we favor disabling it if the images are on CPU, and enabling it otherwise.
    if disable_grouping is None:
        device = _get_device_from_images(images, is_nested)
        # A `torch.device` object is not a string, so compare its `.type` rather than the object itself.
        disable_grouping = torch.device(device).type == "cpu"

    if disable_grouping:
        # Every image becomes its own "group", keyed by its original index.
        grouped_images_index = {key: (key, 0) for key, _ in _iterate_items(images, is_nested)}
        if is_nested:
            grouped_images_index["_num_sublists"] = len(images)

        single_image_groups = {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)}
        single_paired_groups = [
            {
                # Paired values are not required to be tensors; wrap anything without `unsqueeze` in a list so
                # that indexing the group with 0 still returns the original value.
                key: item.unsqueeze(0) if hasattr(item, "unsqueeze") else [item]
                for key, item in _iterate_items(paired_list, is_nested)
            }
            for paired_list in paired_inputs
        ]
        return single_image_groups, *single_paired_groups, grouped_images_index

    # Handle single level nested structure
    grouped_images, *paired_grouped_values, grouped_images_index = _group_images_by_shape(
        images, *paired_inputs, is_nested=is_nested
    )

    # Stack images with the same shape
    grouped_images = {shape: torch.stack(images_list, dim=0) for shape, images_list in grouped_images.items()}

    return grouped_images, *paired_grouped_values, grouped_images_index
- def reorder_images(
- processed_images: dict[tuple[int, int], "torch.Tensor"],
- grouped_images_index: dict[int | tuple[int, int], tuple[tuple[int, int], int]],
- is_nested: bool = False,
- ) -> Union[list["torch.Tensor"], "torch.Tensor"]:
- """
- Reconstructs images in the original order, preserving the original structure (nested or not).
- The input structure is either all flat or all nested.
- Args:
- processed_images (dict[tuple[int, int], "torch.Tensor"]):
- Dictionary mapping shapes to batched processed images.
- grouped_images_index (dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]]):
- Dictionary mapping original indices to (shape, index) tuples.
- is_nested (bool, *optional*, defaults to False):
- Whether the images are nested. Cannot be inferred from the input, as some processing functions outputs nested images.
- even with non nested images,e.g functions splitting images into patches. We thus can't deduce is_nested from the input.
- Returns:
- Union[list["torch.Tensor"], "torch.Tensor"]:
- Images in the original structure.
- """
- if not is_nested:
- return [
- processed_images[grouped_images_index[i][0]][grouped_images_index[i][1]]
- for i in range(len(grouped_images_index))
- ]
- return _reconstruct_nested_structure(grouped_images_index, processed_images)
|