image_processing_backends.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671
  1. # Copyright 2025 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from collections.abc import Iterable
  15. from functools import lru_cache
  16. from typing import Any, Optional, Union
  17. import numpy as np
  18. from .image_processing_base import BatchFeature
  19. from .image_processing_utils import BaseImageProcessor
  20. from .image_transforms import (
  21. center_crop as np_center_crop,
  22. )
  23. from .image_transforms import (
  24. convert_to_rgb,
  25. divide_to_patches, # noqa: F401 - re-exported for backward compat with image_processing_utils_fast
  26. get_resize_output_image_size,
  27. get_size_with_aspect_ratio,
  28. group_images_by_shape,
  29. reorder_images,
  30. )
  31. from .image_transforms import (
  32. normalize as np_normalize,
  33. )
  34. from .image_transforms import (
  35. rescale as np_rescale,
  36. )
  37. from .image_transforms import (
  38. resize as np_resize,
  39. )
  40. from .image_utils import (
  41. ChannelDimension,
  42. ImageInput,
  43. ImageType,
  44. SizeDict,
  45. get_image_size,
  46. get_image_size_for_max_height_width,
  47. get_image_type,
  48. get_max_height_width,
  49. infer_channel_dimension_format,
  50. )
  51. from .processing_utils import ImagesKwargs, Unpack
  52. from .utils import (
  53. TensorType,
  54. is_torch_available,
  55. is_torchvision_available,
  56. is_vision_available,
  57. logging,
  58. )
  59. from .utils.import_utils import is_rocm_platform, is_torchdynamo_compiling, requires
  60. if is_vision_available():
  61. from .image_utils import PILImageResampling
  62. if is_torch_available():
  63. import torch
  64. if is_torchvision_available():
  65. from torchvision.transforms.v2 import functional as tvF
  66. from .image_utils import pil_torch_interpolation_mapping, torch_pil_interpolation_mapping
  67. else:
  68. pil_torch_interpolation_mapping = None
  69. torch_pil_interpolation_mapping = None
  70. logger = logging.get_logger(__name__)
  71. @requires(backends=("torch", "torchvision"))
  72. class TorchvisionBackend(BaseImageProcessor):
  73. """Torchvision backend for GPU-accelerated batched image processing."""
    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
        """Initialize the processor from image-processing keyword arguments.

        The keyword arguments are forwarded unchanged to the parent constructor and
        then to `self._set_attributes` (defined outside this block).
        """
        super().__init__(**kwargs)
        self._set_attributes(**kwargs)
  77. @property
  78. def is_fast(self) -> bool:
  79. """
  80. `bool`: Whether or not this image processor is using the fast (Torchvision) backend.
  81. The `is_fast` property is deprecated and will be removed in v5.3 of Transformers.
  82. Use the `backend` attribute instead (e.g., `processor.backend == "torchvision"`).
  83. """
  84. logger.warning_once(
  85. "The `is_fast` property is deprecated and will be removed in v5.3 of Transformers. "
  86. "Use the `backend` attribute instead (e.g., `processor.backend == 'torchvision'`)."
  87. )
  88. return True
  89. @property
  90. def backend(self) -> str:
  91. """
  92. `str`: The backend used by this image processor.
  93. """
  94. return "torchvision"
  95. def process_image(
  96. self,
  97. image: ImageInput,
  98. do_convert_rgb: bool | None = None,
  99. input_data_format: str | ChannelDimension | None = None,
  100. device: Optional["torch.device"] = None,
  101. **kwargs: Unpack[ImagesKwargs],
  102. ) -> "torch.Tensor":
  103. """Process a single image for torchvision backend."""
  104. image_type = get_image_type(image)
  105. if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
  106. raise ValueError(f"Unsupported input image type {image_type}")
  107. if do_convert_rgb:
  108. image = self.convert_to_rgb(image)
  109. if image_type == ImageType.PIL:
  110. image = tvF.pil_to_tensor(image)
  111. elif image_type == ImageType.NUMPY:
  112. image = torch.from_numpy(image).contiguous()
  113. if image.ndim == 2:
  114. image = image.unsqueeze(0)
  115. if input_data_format is None:
  116. input_data_format = infer_channel_dimension_format(image)
  117. if input_data_format == ChannelDimension.LAST:
  118. image = image.permute(2, 0, 1).contiguous()
  119. if device is not None:
  120. image = image.to(device)
  121. return image
  122. def convert_to_rgb(self, image: ImageInput) -> ImageInput:
  123. """Convert an image to RGB format."""
  124. return convert_to_rgb(image)
  125. def pad(
  126. self,
  127. images: list["torch.Tensor"],
  128. pad_size: SizeDict = None,
  129. fill_value: int | None = 0,
  130. padding_mode: str | None = "constant",
  131. return_mask: bool = False,
  132. disable_grouping: bool | None = False,
  133. is_nested: bool | None = False,
  134. **kwargs,
  135. ) -> Union[tuple["torch.Tensor", "torch.Tensor"], "torch.Tensor"]:
  136. """Pad images using Torchvision with batched operations."""
  137. if pad_size is not None:
  138. if not (pad_size.height and pad_size.width):
  139. raise ValueError(f"Pad size must contain 'height' and 'width' keys only. Got pad_size={pad_size}.")
  140. pad_size = (pad_size.height, pad_size.width)
  141. else:
  142. pad_size = get_max_height_width(images)
  143. grouped_images, grouped_images_index = group_images_by_shape(
  144. images, disable_grouping=disable_grouping, is_nested=is_nested
  145. )
  146. processed_images_grouped = {}
  147. processed_masks_grouped = {}
  148. for shape, stacked_images in grouped_images.items():
  149. image_size = stacked_images.shape[-2:]
  150. padding_height = pad_size[0] - image_size[0]
  151. padding_width = pad_size[1] - image_size[1]
  152. if padding_height < 0 or padding_width < 0:
  153. raise ValueError(
  154. f"Padding dimensions are negative. Please make sure that the `pad_size` is larger than the "
  155. f"image size. Got pad_size={pad_size}, image_size={image_size}."
  156. )
  157. if image_size != pad_size:
  158. padding = (0, 0, padding_width, padding_height)
  159. stacked_images = tvF.pad(stacked_images, padding, fill=fill_value, padding_mode=padding_mode)
  160. processed_images_grouped[shape] = stacked_images
  161. if return_mask:
  162. stacked_masks = torch.zeros_like(stacked_images, dtype=torch.int64)[..., 0, :, :]
  163. stacked_masks[..., : image_size[0], : image_size[1]] = 1
  164. processed_masks_grouped[shape] = stacked_masks
  165. processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=is_nested)
  166. if return_mask:
  167. processed_masks = reorder_images(processed_masks_grouped, grouped_images_index, is_nested=is_nested)
  168. return processed_images, processed_masks
  169. return processed_images
  170. def resize(
  171. self,
  172. image: "torch.Tensor",
  173. size: SizeDict,
  174. resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
  175. antialias: bool = True,
  176. **kwargs,
  177. ) -> "torch.Tensor":
  178. """Resize an image using Torchvision."""
  179. # Convert PIL resample to torchvision interpolation if needed
  180. if resample is not None:
  181. if isinstance(resample, (PILImageResampling, int)):
  182. interpolation = pil_torch_interpolation_mapping[resample]
  183. else:
  184. interpolation = resample
  185. else:
  186. interpolation = tvF.InterpolationMode.BILINEAR
  187. if interpolation == tvF.InterpolationMode.LANCZOS:
  188. logger.warning_once(
  189. "You have used a torchvision backend image processor with LANCZOS resample which not yet supported for torch.Tensor. "
  190. "BICUBIC resample will be used as an alternative. Please fall back to a pil backend image processor if you "
  191. "want full consistency with the original model."
  192. )
  193. interpolation = tvF.InterpolationMode.BICUBIC
  194. if size.shortest_edge and size.longest_edge:
  195. new_size = get_size_with_aspect_ratio(
  196. image.size()[-2:],
  197. size.shortest_edge,
  198. size.longest_edge,
  199. )
  200. elif size.shortest_edge:
  201. new_size = get_resize_output_image_size(
  202. image,
  203. size=size.shortest_edge,
  204. default_to_square=False,
  205. input_data_format=ChannelDimension.FIRST,
  206. )
  207. elif size.max_height and size.max_width:
  208. new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width)
  209. elif size.height and size.width:
  210. new_size = (size.height, size.width)
  211. else:
  212. raise ValueError(
  213. "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got"
  214. f" {size}."
  215. )
  216. # Workaround for torch.compile issue with uint8 on AMD GPUs
  217. if is_torchdynamo_compiling() and is_rocm_platform():
  218. return self._compile_friendly_resize(image, new_size, interpolation, antialias)
  219. return tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
  220. @staticmethod
  221. def _compile_friendly_resize(
  222. image: "torch.Tensor",
  223. new_size: tuple[int, int],
  224. interpolation: Optional["tvF.InterpolationMode"] = None,
  225. antialias: bool = True,
  226. ) -> "torch.Tensor":
  227. """A wrapper around tvF.resize for torch.compile compatibility with uint8 tensors."""
  228. if image.dtype == torch.uint8:
  229. image = image.float() / 256
  230. image = tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
  231. image = image * 256
  232. image = torch.where(image > 255, 255, image)
  233. image = torch.where(image < 0, 0, image)
  234. image = image.round().to(torch.uint8)
  235. else:
  236. image = tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
  237. return image
  238. def rescale(
  239. self,
  240. image: "torch.Tensor",
  241. scale: float,
  242. **kwargs,
  243. ) -> "torch.Tensor":
  244. """Rescale an image by a scale factor using Torchvision."""
  245. return image * scale
  246. def normalize(
  247. self,
  248. image: "torch.Tensor",
  249. mean: float | Iterable[float],
  250. std: float | Iterable[float],
  251. **kwargs,
  252. ) -> "torch.Tensor":
  253. """Normalize an image using Torchvision."""
  254. return tvF.normalize(image, mean, std)
  255. @lru_cache(maxsize=10)
  256. def _fuse_mean_std_and_rescale_factor(
  257. self,
  258. do_normalize: bool | None = None,
  259. image_mean: float | list[float] | None = None,
  260. image_std: float | list[float] | None = None,
  261. do_rescale: bool | None = None,
  262. rescale_factor: float | None = None,
  263. device: Optional["torch.device"] = None,
  264. ) -> tuple:
  265. if do_rescale and do_normalize:
  266. # Fused rescale and normalize
  267. image_mean = torch.tensor(image_mean, device=device) * (1.0 / rescale_factor)
  268. image_std = torch.tensor(image_std, device=device) * (1.0 / rescale_factor)
  269. do_rescale = False
  270. return image_mean, image_std, do_rescale
  271. def rescale_and_normalize(
  272. self,
  273. images: "torch.Tensor",
  274. do_rescale: bool,
  275. rescale_factor: float,
  276. do_normalize: bool,
  277. image_mean: float | list[float],
  278. image_std: float | list[float],
  279. ) -> "torch.Tensor":
  280. """Rescale and normalize images using Torchvision (fused for efficiency)."""
  281. image_mean, image_std, do_rescale = self._fuse_mean_std_and_rescale_factor(
  282. do_normalize=do_normalize,
  283. image_mean=image_mean,
  284. image_std=image_std,
  285. do_rescale=do_rescale,
  286. rescale_factor=rescale_factor,
  287. device=images.device,
  288. )
  289. if do_normalize:
  290. images = self.normalize(images.to(dtype=torch.float32), image_mean, image_std)
  291. elif do_rescale:
  292. images = self.rescale(images, rescale_factor)
  293. return images
  294. def center_crop(
  295. self,
  296. image: "torch.Tensor",
  297. size: SizeDict,
  298. **kwargs,
  299. ) -> "torch.Tensor":
  300. """Center crop an image using Torchvision."""
  301. if size.height is None or size.width is None:
  302. raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
  303. image_height, image_width = image.shape[-2:]
  304. crop_height, crop_width = size.height, size.width
  305. if crop_width > image_width or crop_height > image_height:
  306. padding_ltrb = [
  307. (crop_width - image_width) // 2 if crop_width > image_width else 0,
  308. (crop_height - image_height) // 2 if crop_height > image_height else 0,
  309. (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
  310. (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
  311. ]
  312. image = tvF.pad(image, padding_ltrb, fill=0)
  313. image_height, image_width = image.shape[-2:]
  314. if crop_width == image_width and crop_height == image_height:
  315. return image
  316. crop_top = int((image_height - crop_height) / 2.0)
  317. crop_left = int((image_width - crop_width) / 2.0)
  318. return tvF.crop(image, crop_top, crop_left, crop_height, crop_width)
  319. def _preprocess(
  320. self,
  321. images: list["torch.Tensor"],
  322. do_resize: bool,
  323. size: SizeDict,
  324. resample: "PILImageResampling | tvF.InterpolationMode | int | None",
  325. do_center_crop: bool,
  326. crop_size: SizeDict,
  327. do_rescale: bool,
  328. rescale_factor: float,
  329. do_normalize: bool,
  330. image_mean: float | list[float] | None,
  331. image_std: float | list[float] | None,
  332. do_pad: bool | None,
  333. pad_size: SizeDict | None,
  334. disable_grouping: bool | None,
  335. return_tensors: str | TensorType | None,
  336. **kwargs,
  337. ) -> BatchFeature:
  338. """Preprocess using Torchvision backend (fast, GPU-accelerated)."""
  339. # Group images by size for batched resizing
  340. grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
  341. resized_images_grouped = {}
  342. for shape, stacked_images in grouped_images.items():
  343. if do_resize:
  344. stacked_images = self.resize(image=stacked_images, size=size, resample=resample)
  345. resized_images_grouped[shape] = stacked_images
  346. resized_images = reorder_images(resized_images_grouped, grouped_images_index)
  347. # Group images by size for further processing
  348. grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
  349. processed_images_grouped = {}
  350. for shape, stacked_images in grouped_images.items():
  351. if do_center_crop:
  352. stacked_images = self.center_crop(stacked_images, crop_size)
  353. # Fused rescale and normalize
  354. stacked_images = self.rescale_and_normalize(
  355. stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
  356. )
  357. processed_images_grouped[shape] = stacked_images
  358. processed_images = reorder_images(processed_images_grouped, grouped_images_index)
  359. if do_pad:
  360. processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping)
  361. return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
  362. @requires(backends=("vision",))
  363. class PilBackend(BaseImageProcessor):
  364. """PIL/NumPy backend for portable CPU-only image processing."""
    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
        """Initialize the processor from image-processing keyword arguments.

        The keyword arguments are forwarded unchanged to the parent constructor and
        then to `self._set_attributes` (defined outside this block).
        """
        super().__init__(**kwargs)
        self._set_attributes(**kwargs)
  368. @property
  369. def is_fast(self) -> bool:
  370. """
  371. `bool`: Whether or not this image processor is using the fast (Torchvision) backend.
  372. The `is_fast` property is deprecated and will be removed in v5.3 of Transformers.
  373. Use the `backend` attribute instead (e.g., `processor.backend == "torchvision"`).
  374. """
  375. logger.warning_once(
  376. "The `is_fast` property is deprecated and will be removed in v5.3 of Transformers. "
  377. "Use the `backend` attribute instead (e.g., `processor.backend == 'torchvision'`)."
  378. )
  379. return False
  380. @property
  381. def backend(self) -> str:
  382. """
  383. `str`: The backend used by this image processor.
  384. """
  385. return "pil"
  386. def process_image(
  387. self,
  388. image: ImageInput,
  389. do_convert_rgb: bool | None = None,
  390. input_data_format: str | ChannelDimension | None = None,
  391. **kwargs: Unpack[ImagesKwargs],
  392. ) -> np.ndarray:
  393. """Process a single image for PIL backend."""
  394. image_type = get_image_type(image)
  395. if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
  396. raise ValueError(f"Unsupported input image type {image_type}")
  397. if do_convert_rgb:
  398. image = self.convert_to_rgb(image)
  399. if image_type == ImageType.PIL:
  400. image = np.array(image)
  401. # Set LAST only for multi-channel PIL images (H, W, C); for grayscale (H, W), leave as is to avoid shape errors after expand_dims.
  402. if image.ndim >= 3:
  403. input_data_format = ChannelDimension.LAST if input_data_format is None else input_data_format
  404. elif image_type == ImageType.TORCH:
  405. image = image.numpy()
  406. if image.ndim == 2:
  407. image = np.expand_dims(image, axis=0)
  408. if input_data_format is None:
  409. input_data_format = infer_channel_dimension_format(image)
  410. if input_data_format == ChannelDimension.LAST:
  411. # Convert from channels-last to channels-first
  412. if isinstance(image, np.ndarray):
  413. image = np.transpose(image, (2, 0, 1))
  414. return image
  415. def convert_to_rgb(self, image: ImageInput) -> ImageInput:
  416. """Convert an image to RGB format."""
  417. return convert_to_rgb(image)
  418. def pad(
  419. self,
  420. images: list[np.ndarray],
  421. pad_size: SizeDict = None,
  422. fill_value: int | None = 0,
  423. padding_mode: str | None = "constant",
  424. return_mask: bool = False,
  425. **kwargs,
  426. ) -> tuple[list[np.ndarray], list[np.ndarray]] | list[np.ndarray]:
  427. """Pad images to specified size using NumPy."""
  428. if pad_size is not None:
  429. if not (pad_size.height and pad_size.width):
  430. raise ValueError(f"Pad size must contain 'height' and 'width' keys only. Got pad_size={pad_size}.")
  431. target_height, target_width = pad_size.height, pad_size.width
  432. else:
  433. target_height, target_width = get_max_height_width(images)
  434. processed_images = []
  435. processed_masks = []
  436. for image in images:
  437. height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
  438. padding_height = target_height - height
  439. padding_width = target_width - width
  440. if padding_height < 0 or padding_width < 0:
  441. raise ValueError(
  442. f"Padding dimensions are negative. Please make sure that the `pad_size` is larger than the "
  443. f"image size. Got pad_size=({target_height}, {target_width}), image_size=({height}, {width})."
  444. )
  445. if height != target_height or width != target_width:
  446. # Pad format: ((before_1, after_1), (before_2, after_2), ...)
  447. # For CHW format: ((0, 0), (0, padding_height), (0, padding_width))
  448. pad_width = ((0, 0), (0, padding_height), (0, padding_width))
  449. if padding_mode == "constant":
  450. image = np.pad(image, pad_width, mode="constant", constant_values=fill_value)
  451. else:
  452. image = np.pad(image, pad_width, mode=padding_mode)
  453. processed_images.append(image)
  454. if return_mask:
  455. mask = np.zeros((target_height, target_width), dtype=np.int64)
  456. mask[:height, :width] = 1
  457. processed_masks.append(mask)
  458. if return_mask:
  459. return processed_images, processed_masks
  460. return processed_images
  461. def resize(
  462. self,
  463. image: np.ndarray,
  464. size: SizeDict,
  465. resample: "PILImageResampling | None" = None,
  466. reducing_gap: int | None = None,
  467. **kwargs,
  468. ) -> np.ndarray:
  469. """Resize an image using PIL/NumPy."""
  470. # PIL backend only supports PILImageResampling
  471. if resample is not None and not isinstance(resample, (PILImageResampling, int)):
  472. if torch_pil_interpolation_mapping is not None and resample in torch_pil_interpolation_mapping:
  473. resample = torch_pil_interpolation_mapping[resample]
  474. else:
  475. resample = PILImageResampling.BILINEAR
  476. resample = resample if resample is not None else PILImageResampling.BILINEAR
  477. if size.shortest_edge and size.longest_edge:
  478. height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
  479. new_size = get_size_with_aspect_ratio(
  480. (height, width),
  481. size.shortest_edge,
  482. size.longest_edge,
  483. )
  484. elif size.shortest_edge:
  485. new_size = get_resize_output_image_size(
  486. image,
  487. size=size.shortest_edge,
  488. default_to_square=False,
  489. input_data_format=ChannelDimension.FIRST,
  490. )
  491. elif size.max_height and size.max_width:
  492. height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
  493. new_size = get_image_size_for_max_height_width((height, width), size.max_height, size.max_width)
  494. elif size.height and size.width:
  495. new_size = (size.height, size.width)
  496. else:
  497. raise ValueError(
  498. "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got"
  499. f" {size}."
  500. )
  501. return np_resize(
  502. image,
  503. size=new_size,
  504. resample=resample,
  505. reducing_gap=reducing_gap,
  506. data_format=ChannelDimension.FIRST,
  507. input_data_format=ChannelDimension.FIRST,
  508. )
  509. def rescale(
  510. self,
  511. image: np.ndarray,
  512. scale: float,
  513. **kwargs,
  514. ) -> np.ndarray:
  515. """Rescale an image by a scale factor using NumPy."""
  516. return np_rescale(
  517. image,
  518. scale=scale,
  519. data_format=ChannelDimension.FIRST,
  520. input_data_format=ChannelDimension.FIRST,
  521. )
  522. def normalize(
  523. self,
  524. image: np.ndarray,
  525. mean: float | Iterable[float],
  526. std: float | Iterable[float],
  527. **kwargs,
  528. ) -> np.ndarray:
  529. """Normalize an image using NumPy."""
  530. return np_normalize(
  531. image,
  532. mean=mean,
  533. std=std,
  534. data_format=ChannelDimension.FIRST,
  535. input_data_format=ChannelDimension.FIRST,
  536. )
  537. def center_crop(
  538. self,
  539. image: np.ndarray,
  540. size: SizeDict,
  541. **kwargs,
  542. ) -> np.ndarray:
  543. """Center crop an image using NumPy."""
  544. if size.height is None or size.width is None:
  545. raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
  546. return np_center_crop(
  547. image,
  548. size=(size.height, size.width),
  549. data_format=ChannelDimension.FIRST,
  550. input_data_format=ChannelDimension.FIRST,
  551. )
  552. def _preprocess(
  553. self,
  554. images: list[np.ndarray],
  555. do_resize: bool,
  556. size: SizeDict,
  557. resample: "PILImageResampling | None",
  558. do_center_crop: bool,
  559. crop_size: SizeDict,
  560. do_rescale: bool,
  561. rescale_factor: float,
  562. do_normalize: bool,
  563. image_mean: float | list[float] | None,
  564. image_std: float | list[float] | None,
  565. do_pad: bool | None,
  566. pad_size: SizeDict | None,
  567. return_tensors: str | TensorType | None,
  568. **kwargs,
  569. ) -> BatchFeature:
  570. """Preprocess using PIL backend (portable, CPU-only)."""
  571. processed_images = []
  572. for image in images:
  573. if do_resize:
  574. image = self.resize(image=image, size=size, resample=resample)
  575. if do_center_crop:
  576. image = self.center_crop(image, crop_size)
  577. if do_rescale:
  578. image = self.rescale(image, rescale_factor)
  579. if do_normalize:
  580. image = self.normalize(image, image_mean, image_std)
  581. processed_images.append(image)
  582. if do_pad:
  583. processed_images = self.pad(processed_images, pad_size=pad_size)
  584. return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
  585. def to_dict(self) -> dict[str, Any]:
  586. processor_dict = super().to_dict()
  587. # Remove the "Pil" suffix from the image processor type
  588. if processor_dict.get("image_processor_type", "").endswith("Pil"):
  589. processor_dict["image_processor_type"] = processor_dict["image_processor_type"][:-3]
  590. return processor_dict
# Backward-compatible alias: `BaseImageProcessorFast` is bound to the very same class object as
# `TorchvisionBackend`, so the older name can still be used to refer to it.
BaseImageProcessorFast = TorchvisionBackend