image_utils.py 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017
  1. # Copyright 2021 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import base64
  15. import os
  16. from collections.abc import Iterable
  17. from dataclasses import dataclass, fields
  18. from io import BytesIO
  19. from typing import Any, Union
  20. import httpx
  21. import numpy as np
  22. from .utils import (
  23. ExplicitEnum,
  24. is_numpy_array,
  25. is_torch_available,
  26. is_torch_tensor,
  27. is_torchvision_available,
  28. is_vision_available,
  29. logging,
  30. requires_backends,
  31. to_numpy,
  32. )
  33. from .utils.constants import ( # noqa: F401
  34. IMAGENET_DEFAULT_MEAN,
  35. IMAGENET_DEFAULT_STD,
  36. IMAGENET_STANDARD_MEAN,
  37. IMAGENET_STANDARD_STD,
  38. OPENAI_CLIP_MEAN,
  39. OPENAI_CLIP_STD,
  40. )
  41. if is_vision_available():
  42. import PIL.Image
  43. import PIL.ImageOps
  44. PILImageResampling = PIL.Image.Resampling
  45. if is_torchvision_available():
  46. from torchvision.transforms import InterpolationMode
  47. pil_torch_interpolation_mapping = {
  48. PILImageResampling.NEAREST: InterpolationMode.NEAREST_EXACT,
  49. PILImageResampling.BOX: InterpolationMode.BOX,
  50. PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
  51. PILImageResampling.HAMMING: InterpolationMode.HAMMING,
  52. PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
  53. PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
  54. }
  55. # Create inverse mapping: InterpolationMode -> PILImageResampling
  56. torch_pil_interpolation_mapping = {v: k for k, v in pil_torch_interpolation_mapping.items()}
  57. else:
  58. pil_torch_interpolation_mapping = {}
  59. torch_pil_interpolation_mapping = {}
  60. if is_torch_available():
  61. import torch
  62. logger = logging.get_logger(__name__)
  63. ImageInput = Union[
  64. "PIL.Image.Image", np.ndarray, "torch.Tensor", list["PIL.Image.Image"], list[np.ndarray], list["torch.Tensor"]
  65. ]
  66. class ChannelDimension(ExplicitEnum):
  67. FIRST = "channels_first"
  68. LAST = "channels_last"
  69. class AnnotationFormat(ExplicitEnum):
  70. COCO_DETECTION = "coco_detection"
  71. COCO_PANOPTIC = "coco_panoptic"
  72. AnnotationType = dict[str, int | str | list[dict]]
  73. def is_pil_image(img):
  74. return is_vision_available() and isinstance(img, PIL.Image.Image)
  75. class ImageType(ExplicitEnum):
  76. PIL = "pillow"
  77. TORCH = "torch"
  78. NUMPY = "numpy"
  79. def get_image_type(image):
  80. if is_pil_image(image):
  81. return ImageType.PIL
  82. if is_torch_tensor(image):
  83. return ImageType.TORCH
  84. if is_numpy_array(image):
  85. return ImageType.NUMPY
  86. raise ValueError(f"Unrecognized image type {type(image)}")
  87. def is_valid_image(img):
  88. return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img)
  89. def is_valid_list_of_images(images: list):
  90. return images and all(is_valid_image(image) for image in images)
  91. def concatenate_list(input_list):
  92. if isinstance(input_list[0], list):
  93. return [item for sublist in input_list for item in sublist]
  94. elif isinstance(input_list[0], np.ndarray):
  95. return np.concatenate(input_list, axis=0)
  96. elif isinstance(input_list[0], torch.Tensor):
  97. return torch.cat(input_list, dim=0)
  98. def valid_images(imgs):
  99. # If we have an list of images, make sure every image is valid
  100. if isinstance(imgs, (list, tuple)):
  101. for img in imgs:
  102. if not valid_images(img):
  103. return False
  104. # If not a list of tuple, we have been given a single image or batched tensor of images
  105. elif not is_valid_image(imgs):
  106. return False
  107. return True
  108. def is_batched(img):
  109. if isinstance(img, (list, tuple)):
  110. return is_valid_image(img[0])
  111. return False
  112. def is_scaled_image(image: np.ndarray) -> bool:
  113. """
  114. Checks to see whether the pixel values have already been rescaled to [0, 1].
  115. """
  116. if image.dtype == np.uint8:
  117. return False
  118. # It's possible the image has pixel values in [0, 255] but is of floating type
  119. return np.min(image) >= 0 and np.max(image) <= 1
  120. def make_list_of_images(images, expected_ndims: int = 3) -> list[ImageInput]:
  121. """
  122. Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1.
  123. If the input is a batch of images, it is converted to a list of images.
  124. Args:
  125. images (`ImageInput`):
  126. Image of images to turn into a list of images.
  127. expected_ndims (`int`, *optional*, defaults to 3):
  128. Expected number of dimensions for a single input image. If the input image has a different number of
  129. dimensions, an error is raised.
  130. """
  131. if is_batched(images):
  132. return images
  133. # Either the input is a single image, in which case we create a list of length 1
  134. if is_pil_image(images):
  135. # PIL images are never batched
  136. return [images]
  137. if is_valid_image(images):
  138. if images.ndim == expected_ndims + 1:
  139. # Batch of images
  140. images = list(images)
  141. elif images.ndim == expected_ndims:
  142. # Single image
  143. images = [images]
  144. else:
  145. raise ValueError(
  146. f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got"
  147. f" {images.ndim} dimensions."
  148. )
  149. return images
  150. raise ValueError(
  151. f"Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, or torch.Tensor, but got {type(images)}."
  152. )
  153. def make_flat_list_of_images(
  154. images: list[ImageInput] | ImageInput,
  155. expected_ndims: int = 3,
  156. ) -> ImageInput:
  157. """
  158. Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
  159. If the input is a nested list of images, it is converted to a flat list of images.
  160. Args:
  161. images (`Union[list[ImageInput], ImageInput]`):
  162. The input image.
  163. expected_ndims (`int`, *optional*, defaults to 3):
  164. The expected number of dimensions for a single input image.
  165. Returns:
  166. list: A list of images or a 4d array of images.
  167. """
  168. # If the input is a nested list of images, we flatten it
  169. if (
  170. isinstance(images, (list, tuple))
  171. and all(isinstance(images_i, (list, tuple)) for images_i in images)
  172. and all(is_valid_list_of_images(images_i) or not images_i for images_i in images)
  173. ):
  174. return [img for img_list in images for img in img_list]
  175. if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
  176. if is_pil_image(images[0]) or images[0].ndim == expected_ndims:
  177. return images
  178. if images[0].ndim == expected_ndims + 1:
  179. return [img for img_list in images for img in img_list]
  180. if is_valid_image(images):
  181. if is_pil_image(images) or images.ndim == expected_ndims:
  182. return [images]
  183. if images.ndim == expected_ndims + 1:
  184. return list(images)
  185. raise ValueError(f"Could not make a flat list of images from {images}")
  186. def make_nested_list_of_images(
  187. images: list[ImageInput] | ImageInput,
  188. expected_ndims: int = 3,
  189. ) -> list[ImageInput]:
  190. """
  191. Ensure that the output is a nested list of images.
  192. Args:
  193. images (`Union[list[ImageInput], ImageInput]`):
  194. The input image.
  195. expected_ndims (`int`, *optional*, defaults to 3):
  196. The expected number of dimensions for a single input image.
  197. Returns:
  198. list: A list of list of images or a list of 4d array of images.
  199. """
  200. # If it's a list of batches, it's already in the right format
  201. if (
  202. isinstance(images, (list, tuple))
  203. and all(isinstance(images_i, (list, tuple)) for images_i in images)
  204. and all(is_valid_list_of_images(images_i) or not images_i for images_i in images)
  205. ):
  206. return images
  207. # If it's a list of images, it's a single batch, so convert it to a list of lists
  208. if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
  209. if is_pil_image(images[0]) or images[0].ndim == expected_ndims:
  210. return [images]
  211. if images[0].ndim == expected_ndims + 1:
  212. return [list(image) for image in images]
  213. # If it's a single image, convert it to a list of lists
  214. if is_valid_image(images):
  215. if is_pil_image(images) or images.ndim == expected_ndims:
  216. return [[images]]
  217. if images.ndim == expected_ndims + 1:
  218. return [list(images)]
  219. raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
  220. def to_numpy_array(img) -> np.ndarray:
  221. if not is_valid_image(img):
  222. raise ValueError(f"Invalid image type: {type(img)}")
  223. if is_vision_available() and isinstance(img, PIL.Image.Image):
  224. return np.array(img)
  225. return to_numpy(img)
  226. def infer_channel_dimension_format(
  227. image: np.ndarray, num_channels: int | tuple[int, ...] | None = None
  228. ) -> ChannelDimension:
  229. """
  230. Infers the channel dimension format of `image`.
  231. Args:
  232. image (`np.ndarray`):
  233. The image to infer the channel dimension of.
  234. num_channels (`int` or `tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
  235. The number of channels of the image.
  236. Returns:
  237. The channel dimension of the image.
  238. """
  239. num_channels = num_channels if num_channels is not None else (1, 3)
  240. num_channels = (num_channels,) if isinstance(num_channels, int) else num_channels
  241. if image.ndim == 3:
  242. first_dim, last_dim = 0, 2
  243. elif image.ndim == 4:
  244. first_dim, last_dim = 1, 3
  245. elif image.ndim == 5:
  246. first_dim, last_dim = 2, 4
  247. else:
  248. raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
  249. if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
  250. logger.warning(
  251. f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension."
  252. )
  253. return ChannelDimension.FIRST
  254. elif image.shape[first_dim] in num_channels:
  255. return ChannelDimension.FIRST
  256. elif image.shape[last_dim] in num_channels:
  257. return ChannelDimension.LAST
  258. raise ValueError("Unable to infer channel dimension format")
  259. def get_channel_dimension_axis(image: np.ndarray, input_data_format: ChannelDimension | str | None = None) -> int:
  260. """
  261. Returns the channel dimension axis of the image.
  262. Args:
  263. image (`np.ndarray`):
  264. The image to get the channel dimension axis of.
  265. input_data_format (`ChannelDimension` or `str`, *optional*):
  266. The channel dimension format of the image. If `None`, will infer the channel dimension from the image.
  267. Returns:
  268. The channel dimension axis of the image.
  269. """
  270. if input_data_format is None:
  271. input_data_format = infer_channel_dimension_format(image)
  272. if input_data_format == ChannelDimension.FIRST:
  273. return image.ndim - 3
  274. elif input_data_format == ChannelDimension.LAST:
  275. return image.ndim - 1
  276. raise ValueError(f"Unsupported data format: {input_data_format}")
  277. def get_image_size(image: np.ndarray, channel_dim: ChannelDimension | None = None) -> tuple[int, int]:
  278. """
  279. Returns the (height, width) dimensions of the image.
  280. Args:
  281. image (`np.ndarray`):
  282. The image to get the dimensions of.
  283. channel_dim (`ChannelDimension`, *optional*):
  284. Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.
  285. Returns:
  286. A tuple of the image's height and width.
  287. """
  288. if channel_dim is None:
  289. channel_dim = infer_channel_dimension_format(image)
  290. if channel_dim == ChannelDimension.FIRST:
  291. return image.shape[-2], image.shape[-1]
  292. elif channel_dim == ChannelDimension.LAST:
  293. return image.shape[-3], image.shape[-2]
  294. else:
  295. raise ValueError(f"Unsupported data format: {channel_dim}")
  296. def get_image_size_for_max_height_width(
  297. image_size: tuple[int, int],
  298. max_height: int,
  299. max_width: int,
  300. ) -> tuple[int, int]:
  301. """
  302. Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
  303. Important, even if image_height < max_height and image_width < max_width, the image will be resized
  304. to at least one of the edges be equal to max_height or max_width.
  305. For example:
  306. - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
  307. - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
  308. Args:
  309. image_size (`tuple[int, int]`):
  310. The image to resize.
  311. max_height (`int`):
  312. The maximum allowed height.
  313. max_width (`int`):
  314. The maximum allowed width.
  315. """
  316. height, width = image_size
  317. height_scale = max_height / height
  318. width_scale = max_width / width
  319. min_scale = min(height_scale, width_scale)
  320. new_height = int(height * min_scale)
  321. new_width = int(width * min_scale)
  322. return new_height, new_width
  323. def max_across_indices(values: Iterable[Any]) -> list[Any]:
  324. """
  325. Return the maximum value across all indices of an iterable of values.
  326. """
  327. return [max(values_i) for values_i in zip(*values)]
  328. def get_max_height_width(
  329. images: list[Union["torch.Tensor", np.ndarray]], input_data_format: str | ChannelDimension = ChannelDimension.FIRST
  330. ) -> list[int]:
  331. """
  332. Get the maximum height and width across all images in a batch.
  333. """
  334. if input_data_format == ChannelDimension.FIRST:
  335. _, max_height, max_width = max_across_indices([img.shape for img in images])
  336. elif input_data_format == ChannelDimension.LAST:
  337. max_height, max_width, _ = max_across_indices([img.shape for img in images])
  338. else:
  339. raise ValueError(f"Invalid channel dimension format: {input_data_format}")
  340. return (max_height, max_width)
  341. def is_valid_annotation_coco_detection(annotation: dict[str, list | tuple]) -> bool:
  342. if (
  343. isinstance(annotation, dict)
  344. and "image_id" in annotation
  345. and "annotations" in annotation
  346. and isinstance(annotation["annotations"], (list, tuple))
  347. and (
  348. # an image can have no annotations
  349. len(annotation["annotations"]) == 0 or isinstance(annotation["annotations"][0], dict)
  350. )
  351. ):
  352. return True
  353. return False
  354. def is_valid_annotation_coco_panoptic(annotation: dict[str, list | tuple]) -> bool:
  355. if (
  356. isinstance(annotation, dict)
  357. and "image_id" in annotation
  358. and "segments_info" in annotation
  359. and "file_name" in annotation
  360. and isinstance(annotation["segments_info"], (list, tuple))
  361. and (
  362. # an image can have no segments
  363. len(annotation["segments_info"]) == 0 or isinstance(annotation["segments_info"][0], dict)
  364. )
  365. ):
  366. return True
  367. return False
  368. def valid_coco_detection_annotations(annotations: Iterable[dict[str, list | tuple]]) -> bool:
  369. return all(is_valid_annotation_coco_detection(ann) for ann in annotations)
  370. def valid_coco_panoptic_annotations(annotations: Iterable[dict[str, list | tuple]]) -> bool:
  371. return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
  372. def load_image(image: Union[str, "PIL.Image.Image"], timeout: float | None = None) -> "PIL.Image.Image":
  373. """
  374. Loads `image` to a PIL Image.
  375. Args:
  376. image (`str` or `PIL.Image.Image`):
  377. The image to convert to the PIL Image format.
  378. timeout (`float`, *optional*):
  379. The timeout value in seconds for the URL request.
  380. Returns:
  381. `PIL.Image.Image`: A PIL Image.
  382. """
  383. requires_backends(load_image, ["vision"])
  384. if isinstance(image, str):
  385. if image.startswith("http://") or image.startswith("https://"):
  386. # We need to actually check for a real protocol, otherwise it's impossible to use a local file
  387. # like http_huggingface_co.png
  388. image = PIL.Image.open(BytesIO(httpx.get(image, timeout=timeout, follow_redirects=True).content))
  389. elif os.path.isfile(image):
  390. image = PIL.Image.open(image)
  391. else:
  392. if image.startswith("data:image/"):
  393. image = image.split(",")[1]
  394. # Try to load as base64
  395. try:
  396. b64 = base64.decodebytes(image.encode())
  397. image = PIL.Image.open(BytesIO(b64))
  398. except Exception as e:
  399. raise ValueError(
  400. f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
  401. )
  402. elif not isinstance(image, PIL.Image.Image):
  403. raise TypeError(
  404. "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
  405. )
  406. image = PIL.ImageOps.exif_transpose(image)
  407. image = image.convert("RGB")
  408. return image
  409. def load_images(
  410. images: Union[list, tuple, str, "PIL.Image.Image"], timeout: float | None = None
  411. ) -> Union["PIL.Image.Image", list["PIL.Image.Image"], list[list["PIL.Image.Image"]]]:
  412. """Loads images, handling different levels of nesting.
  413. Args:
  414. images: A single image, a list of images, or a list of lists of images to load.
  415. timeout: Timeout for loading images.
  416. Returns:
  417. A single image, a list of images, a list of lists of images.
  418. """
  419. if isinstance(images, (list, tuple)):
  420. if len(images) and isinstance(images[0], (list, tuple)):
  421. return [[load_image(image, timeout=timeout) for image in image_group] for image_group in images]
  422. else:
  423. return [load_image(image, timeout=timeout) for image in images]
  424. else:
  425. return load_image(images, timeout=timeout)
  426. def validate_preprocess_arguments(
  427. do_rescale: bool | None = None,
  428. rescale_factor: float | None = None,
  429. do_normalize: bool | None = None,
  430. image_mean: float | list[float] | None = None,
  431. image_std: float | list[float] | None = None,
  432. do_pad: bool | None = None,
  433. pad_size: dict[str, int] | int | None = None,
  434. do_center_crop: bool | None = None,
  435. crop_size: dict[str, int] | None = None,
  436. do_resize: bool | None = None,
  437. size: dict[str, int] | None = None,
  438. resample: Union["PILImageResampling", "InterpolationMode", int] | None = None,
  439. ):
  440. """
  441. Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
  442. Raises `ValueError` if arguments incompatibility is caught.
  443. Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
  444. sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
  445. existing arguments when possible.
  446. """
  447. if do_rescale and rescale_factor is None:
  448. raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.")
  449. if do_pad and pad_size is None:
  450. # Processors pad images using different args depending on the model, so the below check is pointless
  451. # but we keep it for BC for now. TODO: remove in v5
  452. # Usually padding can be called with:
  453. # - "pad_size/size" if we're padding to specific values
  454. # - "size_divisor" if we're padding to any value divisible by X
  455. # - "None" if we're padding to the maximum size image in batch
  456. raise ValueError(
  457. "Depending on the model, `size_divisor` or `pad_size` or `size` must be specified if `do_pad` is `True`."
  458. )
  459. if do_normalize and (image_mean is None or image_std is None):
  460. raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.")
  461. if do_center_crop and crop_size is None:
  462. raise ValueError("`crop_size` must be specified if `do_center_crop` is `True`.")
  463. if do_resize and not (size is not None and resample is not None):
  464. raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.")
  465. class ImageFeatureExtractionMixin:
  466. """
  467. Mixin that contain utilities for preparing image features.
  468. """
  469. def _ensure_format_supported(self, image):
  470. if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_torch_tensor(image):
  471. raise ValueError(
  472. f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.ndarray` and "
  473. "`torch.Tensor` are."
  474. )
  475. def to_pil_image(self, image, rescale=None):
  476. """
  477. Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
  478. needed.
  479. Args:
  480. image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
  481. The image to convert to the PIL Image format.
  482. rescale (`bool`, *optional*):
  483. Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
  484. default to `True` if the image type is a floating type, `False` otherwise.
  485. """
  486. self._ensure_format_supported(image)
  487. if is_torch_tensor(image):
  488. image = image.numpy()
  489. if isinstance(image, np.ndarray):
  490. if rescale is None:
  491. # rescale default to the array being of floating type.
  492. rescale = isinstance(image.flat[0], np.floating)
  493. # If the channel as been moved to first dim, we put it back at the end.
  494. if image.ndim == 3 and image.shape[0] in [1, 3]:
  495. image = image.transpose(1, 2, 0)
  496. if rescale:
  497. image = image * 255
  498. image = image.astype(np.uint8)
  499. return PIL.Image.fromarray(image)
  500. return image
  501. def convert_rgb(self, image):
  502. """
  503. Converts `PIL.Image.Image` to RGB format.
  504. Args:
  505. image (`PIL.Image.Image`):
  506. The image to convert.
  507. """
  508. self._ensure_format_supported(image)
  509. if not isinstance(image, PIL.Image.Image):
  510. return image
  511. return image.convert("RGB")
  512. def rescale(self, image: np.ndarray, scale: float | int) -> np.ndarray:
  513. """
  514. Rescale a numpy image by scale amount
  515. """
  516. self._ensure_format_supported(image)
  517. return image * scale
  518. def to_numpy_array(self, image, rescale=None, channel_first=True):
  519. """
  520. Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
  521. dimension.
  522. Args:
  523. image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
  524. The image to convert to a NumPy array.
  525. rescale (`bool`, *optional*):
  526. Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
  527. default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
  528. channel_first (`bool`, *optional*, defaults to `True`):
  529. Whether or not to permute the dimensions of the image to put the channel dimension first.
  530. """
  531. self._ensure_format_supported(image)
  532. if isinstance(image, PIL.Image.Image):
  533. image = np.array(image)
  534. if is_torch_tensor(image):
  535. image = image.numpy()
  536. rescale = isinstance(image.flat[0], np.integer) if rescale is None else rescale
  537. if rescale:
  538. image = self.rescale(image.astype(np.float32), 1 / 255.0)
  539. if channel_first and image.ndim == 3:
  540. image = image.transpose(2, 0, 1)
  541. return image
  542. def expand_dims(self, image):
  543. """
  544. Expands 2-dimensional `image` to 3 dimensions.
  545. Args:
  546. image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
  547. The image to expand.
  548. """
  549. self._ensure_format_supported(image)
  550. # Do nothing if PIL image
  551. if isinstance(image, PIL.Image.Image):
  552. return image
  553. if is_torch_tensor(image):
  554. image = image.unsqueeze(0)
  555. else:
  556. image = np.expand_dims(image, axis=0)
  557. return image
  558. def normalize(self, image, mean, std, rescale=False):
  559. """
  560. Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
  561. if it's a PIL Image.
  562. Args:
  563. image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
  564. The image to normalize.
  565. mean (`list[float]` or `np.ndarray` or `torch.Tensor`):
  566. The mean (per channel) to use for normalization.
  567. std (`list[float]` or `np.ndarray` or `torch.Tensor`):
  568. The standard deviation (per channel) to use for normalization.
  569. rescale (`bool`, *optional*, defaults to `False`):
  570. Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
  571. happen automatically.
  572. """
  573. self._ensure_format_supported(image)
  574. if isinstance(image, PIL.Image.Image):
  575. image = self.to_numpy_array(image, rescale=True)
  576. # If the input image is a PIL image, it automatically gets rescaled. If it's another
  577. # type it may need rescaling.
  578. elif rescale:
  579. if isinstance(image, np.ndarray):
  580. image = self.rescale(image.astype(np.float32), 1 / 255.0)
  581. elif is_torch_tensor(image):
  582. image = self.rescale(image.float(), 1 / 255.0)
  583. if isinstance(image, np.ndarray):
  584. if not isinstance(mean, np.ndarray):
  585. mean = np.array(mean).astype(image.dtype)
  586. if not isinstance(std, np.ndarray):
  587. std = np.array(std).astype(image.dtype)
  588. elif is_torch_tensor(image):
  589. import torch
  590. if not isinstance(mean, torch.Tensor):
  591. if isinstance(mean, np.ndarray):
  592. mean = torch.from_numpy(mean)
  593. else:
  594. mean = torch.tensor(mean)
  595. if not isinstance(std, torch.Tensor):
  596. if isinstance(std, np.ndarray):
  597. std = torch.from_numpy(std)
  598. else:
  599. std = torch.tensor(std)
  600. if image.ndim == 3 and image.shape[0] in [1, 3]:
  601. return (image - mean[:, None, None]) / std[:, None, None]
  602. else:
  603. return (image - mean) / std
  604. def resize(self, image, size, resample=None, default_to_square=True, max_size=None):
  605. """
  606. Resizes `image`. Enforces conversion of input to PIL.Image.
  607. Args:
  608. image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
  609. The image to resize.
  610. size (`int` or `tuple[int, int]`):
  611. The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
  612. matched to this.
  613. If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
  614. `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
  615. this number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
  616. resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
  617. The filter to user for resampling.
  618. default_to_square (`bool`, *optional*, defaults to `True`):
  619. How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
  620. square (`size`,`size`). If set to `False`, will replicate
  621. [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
  622. with support for resizing only the smallest edge and providing an optional `max_size`.
  623. max_size (`int`, *optional*, defaults to `None`):
  624. The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
  625. greater than `max_size` after being resized according to `size`, then the image is resized again so
  626. that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller
  627. edge may be shorter than `size`. Only used if `default_to_square` is `False`.
  628. Returns:
  629. image: A resized `PIL.Image.Image`.
  630. """
  631. resample = resample if resample is not None else PILImageResampling.BILINEAR
  632. self._ensure_format_supported(image)
  633. if not isinstance(image, PIL.Image.Image):
  634. image = self.to_pil_image(image)
  635. if isinstance(size, list):
  636. size = tuple(size)
  637. if isinstance(size, int) or len(size) == 1:
  638. if default_to_square:
  639. size = (size, size) if isinstance(size, int) else (size[0], size[0])
  640. else:
  641. width, height = image.size
  642. # specified size only for the smallest edge
  643. short, long = (width, height) if width <= height else (height, width)
  644. requested_new_short = size if isinstance(size, int) else size[0]
  645. if short == requested_new_short:
  646. return image
  647. new_short, new_long = requested_new_short, int(requested_new_short * long / short)
  648. if max_size is not None:
  649. if max_size <= requested_new_short:
  650. raise ValueError(
  651. f"max_size = {max_size} must be strictly greater than the requested "
  652. f"size for the smaller edge size = {size}"
  653. )
  654. if new_long > max_size:
  655. new_short, new_long = int(max_size * new_short / new_long), max_size
  656. size = (new_short, new_long) if width <= height else (new_long, new_short)
  657. return image.resize(size, resample=resample)
  658. def center_crop(self, image, size):
  659. """
  660. Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
  661. size given, it will be padded (so the returned result has the size asked).
  662. Args:
  663. image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
  664. The image to resize.
  665. size (`int` or `tuple[int, int]`):
  666. The size to which crop the image.
  667. Returns:
  668. new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
  669. height, width).
  670. """
  671. self._ensure_format_supported(image)
  672. if not isinstance(size, tuple):
  673. size = (size, size)
  674. # PIL Image.size is (width, height) but NumPy array and torch Tensors have (height, width)
  675. if is_torch_tensor(image) or isinstance(image, np.ndarray):
  676. if image.ndim == 2:
  677. image = self.expand_dims(image)
  678. image_shape = image.shape[1:] if image.shape[0] in [1, 3] else image.shape[:2]
  679. else:
  680. image_shape = (image.size[1], image.size[0])
  681. top = (image_shape[0] - size[0]) // 2
  682. bottom = top + size[0] # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
  683. left = (image_shape[1] - size[1]) // 2
  684. right = left + size[1] # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
  685. # For PIL Images we have a method to crop directly.
  686. if isinstance(image, PIL.Image.Image):
  687. return image.crop((left, top, right, bottom))
  688. # Check if image is in (n_channels, height, width) or (height, width, n_channels) format
  689. channel_first = image.shape[0] in [1, 3]
  690. # Transpose (height, width, n_channels) format images
  691. if not channel_first:
  692. if isinstance(image, np.ndarray):
  693. image = image.transpose(2, 0, 1)
  694. if is_torch_tensor(image):
  695. image = image.permute(2, 0, 1)
  696. # Check if cropped area is within image boundaries
  697. if top >= 0 and bottom <= image_shape[0] and left >= 0 and right <= image_shape[1]:
  698. return image[..., top:bottom, left:right]
  699. # Otherwise, we may need to pad if the image is too small. Oh joy...
  700. new_shape = image.shape[:-2] + (max(size[0], image_shape[0]), max(size[1], image_shape[1]))
  701. if isinstance(image, np.ndarray):
  702. new_image = np.zeros_like(image, shape=new_shape)
  703. elif is_torch_tensor(image):
  704. new_image = image.new_zeros(new_shape)
  705. top_pad = (new_shape[-2] - image_shape[0]) // 2
  706. bottom_pad = top_pad + image_shape[0]
  707. left_pad = (new_shape[-1] - image_shape[1]) // 2
  708. right_pad = left_pad + image_shape[1]
  709. new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image
  710. top += top_pad
  711. bottom += top_pad
  712. left += left_pad
  713. right += left_pad
  714. new_image = new_image[
  715. ..., max(0, top) : min(new_image.shape[-2], bottom), max(0, left) : min(new_image.shape[-1], right)
  716. ]
  717. return new_image
  718. def flip_channel_order(self, image):
  719. """
  720. Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
  721. `image` to a NumPy array if it's a PIL Image.
  722. Args:
  723. image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
  724. The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
  725. be first.
  726. """
  727. self._ensure_format_supported(image)
  728. if isinstance(image, PIL.Image.Image):
  729. image = self.to_numpy_array(image)
  730. return image[::-1, :, :]
  731. def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None):
  732. """
  733. Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
  734. counter clockwise around its centre.
  735. Args:
  736. image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
  737. The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
  738. rotating.
  739. Returns:
  740. image: A rotated `PIL.Image.Image`.
  741. """
  742. resample = resample if resample is not None else PIL.Image.NEAREST
  743. self._ensure_format_supported(image)
  744. if not isinstance(image, PIL.Image.Image):
  745. image = self.to_pil_image(image)
  746. return image.rotate(
  747. angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor
  748. )
  749. def validate_annotations(
  750. annotation_format: AnnotationFormat,
  751. supported_annotation_formats: tuple[AnnotationFormat, ...],
  752. annotations: list[dict],
  753. ) -> None:
  754. if annotation_format not in supported_annotation_formats:
  755. raise ValueError(f"Unsupported annotation format: {format} must be one of {supported_annotation_formats}")
  756. if annotation_format is AnnotationFormat.COCO_DETECTION:
  757. if not valid_coco_detection_annotations(annotations):
  758. raise ValueError(
  759. "Invalid COCO detection annotations. Annotations must a dict (single image) or list of dicts "
  760. "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
  761. "being a list of annotations in the COCO format."
  762. )
  763. if annotation_format is AnnotationFormat.COCO_PANOPTIC:
  764. if not valid_coco_panoptic_annotations(annotations):
  765. raise ValueError(
  766. "Invalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts "
  767. "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
  768. "the latter being a list of annotations in the COCO format."
  769. )
  770. def validate_kwargs(valid_processor_keys: list[str], captured_kwargs: list[str]):
  771. unused_keys = set(captured_kwargs).difference(set(valid_processor_keys))
  772. if unused_keys:
  773. unused_key_str = ", ".join(unused_keys)
  774. # TODO raise a warning here instead of simply logging?
  775. logger.warning(f"Unused or unrecognized kwargs: {unused_key_str}.")
  776. @dataclass()
  777. class SizeDict:
  778. """
  779. Hashable dictionary to store image size information.
  780. """
  781. height: int | None = None
  782. width: int | None = None
  783. longest_edge: int | None = None
  784. shortest_edge: int | None = None
  785. max_height: int | None = None
  786. max_width: int | None = None
  787. def __getitem__(self, key):
  788. if hasattr(self, key):
  789. return getattr(self, key)
  790. raise KeyError(f"Key {key} not found in SizeDict.")
  791. def get(self, key, default=None):
  792. if hasattr(self, key) and getattr(self, key) is not None:
  793. return getattr(self, key)
  794. return default
  795. def __iter__(self):
  796. # Yield only non-None (key, value) pairs so dict(self) excludes missing values.
  797. for f in fields(self):
  798. val = getattr(self, f.name)
  799. if val is not None:
  800. yield f.name, val
  801. def __hash__(self):
  802. return hash((self.height, self.width, self.longest_edge, self.shortest_edge, self.max_height, self.max_width))
  803. def __contains__(self, key):
  804. return hasattr(self, key) and getattr(self, key) is not None
  805. def __setitem__(self, key, value):
  806. if not hasattr(self, key):
  807. raise KeyError(f"Key {key} is not a valid field of SizeDict.")
  808. object.__setattr__(self, key, value)
  809. def __eq__(self, other):
  810. if isinstance(other, dict):
  811. return dict(self) == other
  812. if isinstance(other, SizeDict):
  813. return tuple(getattr(self, f.name) for f in fields(self)) == tuple(
  814. getattr(other, f.name) for f in fields(self)
  815. )
  816. return NotImplemented
  817. def __or__(self, other) -> "SizeDict":
  818. if isinstance(other, dict | SizeDict):
  819. merged = dict(self)
  820. merged.update(dict(other))
  821. return SizeDict(**merged)
  822. return NotImplemented
  823. def __ror__(self, other) -> dict:
  824. if isinstance(other, dict):
  825. merged = dict(other)
  826. merged.update(dict(self))
  827. return merged
  828. return NotImplemented