video_processing_utils.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845
  1. # Copyright 2025 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import json
  15. import os
  16. import warnings
  17. from collections.abc import Callable
  18. from functools import partial
  19. from typing import Any
  20. import numpy as np
  21. from huggingface_hub import create_repo, is_offline_mode
  22. from huggingface_hub.dataclasses import validate_typed_dict
  23. from .dynamic_module_utils import custom_object_save
  24. from .image_processing_backends import TorchvisionBackend
  25. from .image_processing_utils import BatchFeature
  26. from .image_utils import (
  27. ChannelDimension,
  28. SizeDict,
  29. is_vision_available,
  30. validate_kwargs,
  31. )
  32. from .processing_utils import Unpack, VideosKwargs
  33. from .utils import (
  34. IMAGE_PROCESSOR_NAME,
  35. PROCESSOR_NAME,
  36. VIDEO_PROCESSOR_NAME,
  37. TensorType,
  38. add_start_docstrings,
  39. copy_func,
  40. is_torch_available,
  41. is_torchcodec_available,
  42. is_torchvision_v2_available,
  43. logging,
  44. safe_load_json_file,
  45. )
  46. from .utils.hub import cached_file
  47. from .utils.import_utils import requires
  48. from .video_utils import (
  49. VideoInput,
  50. VideoMetadata,
  51. group_videos_by_shape,
  52. infer_channel_dimension_format,
  53. is_valid_video,
  54. load_video,
  55. make_batched_metadata,
  56. make_batched_videos,
  57. reorder_videos,
  58. )
  59. if is_torch_available():
  60. import torch
  61. if is_torchvision_v2_available():
  62. import torchvision.transforms.v2.functional as tvF
  63. if is_vision_available():
  64. from .image_utils import PILImageResampling
  65. logger = logging.get_logger(__name__)
  # Shared argument documentation. It is injected through `add_start_docstrings` into the
  # `BaseVideoProcessor` class docstring and into `BaseVideoProcessor.preprocess`.
  BASE_VIDEO_PROCESSOR_DOCSTRING = r"""
      Args:
          do_resize (`bool`, *optional*, defaults to `self.do_resize`):
              Whether to resize the video's (height, width) dimensions to the specified `size`. Can be overridden by the
              `do_resize` parameter in the `preprocess` method.
          size (`dict`, *optional*, defaults to `self.size`):
              Size of the output video after resizing. Can be overridden by the `size` parameter in the `preprocess`
              method.
          size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
              The size by which to make sure both the height and width can be divided.
          default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
              Whether to default to a square video when resizing, if size is an int.
          resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
              Resampling filter to use if resizing the video. Only has an effect if `do_resize` is set to `True`. Can be
              overridden by the `resample` parameter in the `preprocess` method.
          do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
              Whether to center crop the video to the specified `crop_size`. Can be overridden by `do_center_crop` in the
              `preprocess` method.
          crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
              Size of the output video after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
              method.
          do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
              Whether to rescale the video by the specified scale `rescale_factor`. Can be overridden by the
              `do_rescale` parameter in the `preprocess` method.
          rescale_factor (`int` or `float`, *optional*, defaults to `self.rescale_factor`):
              Scale factor to use if rescaling the video. Only has an effect if `do_rescale` is set to `True`. Can be
              overridden by the `rescale_factor` parameter in the `preprocess` method.
          do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
              Whether to normalize the video.
              Can be overridden by the `do_normalize` parameter in the `preprocess` method.
          image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
              Mean to use if normalizing the video. This is a float or list of floats the length of the number of
              channels in the video.
              Can be overridden by the `image_mean` parameter in the `preprocess` method.
          image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
              Standard deviation to use if normalizing the video. This is a float or list of floats the length of the
              number of channels in the video.
              Can be overridden by the `image_std` parameter in the `preprocess` method.
          do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
              Whether to convert the video to RGB.
          video_metadata (`VideoMetadata`, *optional*):
              Metadata of the video containing information about total duration, fps and total number of frames.
          do_sample_frames (`bool`, *optional*, defaults to `self.do_sample_frames`):
              Whether to sample frames from the video before processing or to process the whole video.
          num_frames (`int`, *optional*, defaults to `self.num_frames`):
              Maximum number of frames to sample when `do_sample_frames=True`.
          fps (`int` or `float`, *optional*, defaults to `self.fps`):
              Target frames to sample per second when `do_sample_frames=True`.
          return_tensors (`str` or `TensorType`, *optional*):
              Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
          data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
              The channel dimension format for the output video. Can be one of:
              - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_channels, height, width) format.
              - `"channels_last"` or `ChannelDimension.LAST`: video in (height, width, num_channels) format.
              - Unset: Use the channel dimension format of the input video.
          input_data_format (`ChannelDimension` or `str`, *optional*):
              The channel dimension format for the input video. If unset, the channel dimension format is inferred
              from the input video. Can be one of:
              - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_channels, height, width) format.
              - `"channels_last"` or `ChannelDimension.LAST`: video in (height, width, num_channels) format.
              - `"none"` or `ChannelDimension.NONE`: video in (height, width) format.
          device (`torch.device`, *optional*):
              The device to process the videos on. If unset, the device is inferred from the input videos.
          return_metadata (`bool`, *optional*):
              Whether to return video metadata or not.
  """
  132. @add_start_docstrings(
  133. "Constructs a base VideoProcessor.",
  134. BASE_VIDEO_PROCESSOR_DOCSTRING,
  135. )
  136. @requires(backends=("vision", "torchvision"))
  137. class BaseVideoProcessor(TorchvisionBackend):
  138. _auto_class = None
  139. resample = None
  140. image_mean = None
  141. image_std = None
  142. size = None
  143. size_divisor = None
  144. default_to_square = True
  145. crop_size = None
  146. do_resize = None
  147. do_center_crop = None
  148. do_rescale = None
  149. rescale_factor = 1 / 255
  150. do_normalize = None
  151. do_convert_rgb = None
  152. do_sample_frames = None
  153. fps = None
  154. num_frames = None
  155. video_metadata = None
  156. return_metadata = False
  157. valid_kwargs = VideosKwargs
  158. model_input_names = ["pixel_values_videos"]
  def __init__(self, **kwargs: Unpack[VideosKwargs]) -> None:
      """Create the processor by handing every keyword argument to the parent initializer."""
      super().__init__(**kwargs)
  def __call__(self, videos, **kwargs) -> BatchFeature:
      """Make the instance callable: delegates to `self.preprocess(videos, **kwargs)`."""
      batch = self.preprocess(videos, **kwargs)
      return batch
  def convert_to_rgb(
      self,
      video: "torch.Tensor",
  ) -> VideoInput:
      """
      Converts a video to RGB format.

      The video is first passed through `tvF.grayscale_to_rgb`. If the result already has three channels
      (dimension `-3`) it is returned as is. Otherwise channel index 3 is treated as an alpha channel:

      - if no alpha value is below 255, the alpha channel is simply dropped;
      - otherwise the first three channels are blended with a white (255) background, weighted by
        `alpha / 255`. The blended result is a floating point tensor.

      Args:
          video (`"torch.Tensor"`):
              The video to convert, with channels in dimension `-3`.
              NOTE(review): the 255 threshold assumes pixel values on a 0-255 scale -- confirm for float inputs.

      Returns:
          `torch.Tensor`: The converted video with three channels.
      """
      video = tvF.grayscale_to_rgb(video)
      if video.shape[-3] == 3:
          return video

      rgb = video[..., :3, :, :]
      alpha_channel = video[..., 3, :, :]
      if not (alpha_channel < 255).any():
          # Fully opaque: nothing to blend, but the alpha channel must still be removed so that
          # the output really is RGB (previously the 4-channel tensor was returned unchanged).
          return rgb

      # There is a transparency layer, blend it with a white background.
      # `alpha` is the per-pixel proportion of the original colour kept in the blend.
      alpha = (alpha_channel / 255.0)[..., None, :, :]
      return (1 - alpha) * 255 + alpha * rgb
  def sample_frames(
      self,
      metadata: VideoMetadata,
      num_frames: int | None = None,
      fps: int | float | None = None,
      **kwargs,
  ):
      """
      Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
      If `fps` is passed along with metadata, `fps` frames per second are sampled uniformly. Arguments `num_frames`
      and `fps` are mutually exclusive. When neither is available (after falling back to `self.num_frames` and
      `self.fps`), the indices of all frames are returned.

      Args:
          metadata (`VideoMetadata`):
              Metadata of the video containing information about total duration, fps and total number of frames.
          num_frames (`int`, *optional*):
              Maximum number of frames to sample. Defaults to `self.num_frames`.
          fps (`int` or `float`, *optional*):
              Target frames to sample per second. Defaults to `self.fps`.
          kwargs:
              Accepted and ignored, so the method can be bound with the full set of preprocessing kwargs.

      Returns:
          `torch.Tensor`:
              Integer indices of the video frames to sample.

      Raises:
          ValueError: If both `num_frames` and `fps` are passed, if sampling by `fps` is requested without
              `metadata` (or without `metadata.fps`), or if more frames are requested than the video contains.
      """
      if fps is not None and num_frames is not None:
          raise ValueError(
              "`num_frames`, `fps`, and `sample_indices_fn` are mutually exclusive arguments, please use only one!"
          )

      num_frames = num_frames if num_frames is not None else self.num_frames
      fps = fps if fps is not None else self.fps
      sample_by_fps = num_frames is None and fps is not None

      # Check the metadata *before* touching it, otherwise a missing metadata object surfaces as an
      # AttributeError instead of the explanatory error below.
      if sample_by_fps and (metadata is None or metadata.fps is None):
          raise ValueError(
              "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
              "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
          )

      total_num_frames = metadata.total_num_frames

      # If num_frames is not given but fps is, calculate num_frames from fps
      if sample_by_fps:
          num_frames = int(total_num_frames / metadata.fps * fps)

      # Nothing constrains the sampling: keep every frame.
      if num_frames is None:
          return torch.arange(0, total_num_frames).int()

      if num_frames > total_num_frames:
          raise ValueError(
              f"Video can't be sampled. The `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
          )

      return torch.arange(0, total_num_frames, total_num_frames / num_frames).int()
  def _decode_and_sample_videos(
      self,
      videos: VideoInput,
      video_metadata: VideoMetadata | dict,
      do_sample_frames: bool | None = None,
      sample_indices_fn: Callable | None = None,
  ) -> tuple[list["torch.Tensor"], list]:
      """
      Decode input videos and sample frames if needed.

      Three kinds of input are handled, decided from the first element of the batch:

      - already decoded videos (`is_valid_video`): frames are sampled here when `do_sample_frames` is set,
        and the chosen indices are stored on each metadata object as `frames_indices`;
      - lists of images: each list is fetched with `self.fetch_images` and stacked into one tensor;
        frame sampling is not supported for this input;
      - anything else: decoding and sampling are delegated to `self.fetch_videos`.

      Args:
          videos (`VideoInput`):
              The raw input, batched with `make_batched_videos`.
          video_metadata (`VideoMetadata` or `dict`):
              Metadata for the input, batched with `make_batched_metadata`.
          do_sample_frames (`bool`, *optional*):
              Whether frames should be sampled.
          sample_indices_fn (`Callable`, *optional*):
              Function called as `sample_indices_fn(metadata=...)` returning the frame indices to keep.

      Returns:
          `tuple`: The list of videos and the matching batched metadata.

      Raises:
          ValueError: If `do_sample_frames` is set for an input made of lists of images.
      """
      videos = make_batched_videos(videos)
      video_metadata = make_batched_metadata(videos, video_metadata=video_metadata)

      # Only sample frames if an array video is passed, otherwise first decode -> then sample
      if is_valid_video(videos[0]):
          if do_sample_frames:
              sampled_videos = []
              sampled_metadata = []
              for video, metadata in zip(videos, video_metadata):
                  indices = sample_indices_fn(metadata=metadata)
                  metadata.frames_indices = indices
                  sampled_videos.append(video[indices])
                  sampled_metadata.append(metadata)
              videos = sampled_videos
              video_metadata = sampled_metadata
          return videos, video_metadata

      if isinstance(videos[0], list):
          # Fail before fetching anything: the request can never succeed.
          if do_sample_frames:
              raise ValueError(
                  "Sampling frames from a list of images is not supported! Set `do_sample_frames=False`."
              )
          # Videos sometimes are passed as a list of image URLs, especially through templates
          videos = [
              torch.stack([tvF.pil_to_tensor(image) for image in images], dim=0)
              for images in self.fetch_images(videos)
          ]
      else:
          videos, video_metadata = self.fetch_videos(videos, sample_indices_fn=sample_indices_fn)

      return videos, video_metadata
  def _prepare_input_videos(
      self,
      videos: VideoInput,
      input_data_format: str | ChannelDimension | None = None,
      device: str | None = None,
  ) -> list["torch.Tensor"]:
      """
      Prepare the input videos for processing.

      Each video is converted to a torch tensor if it is a numpy array, permuted to channels-first when its
      format is `ChannelDimension.LAST`, and moved to `device` when one is given.

      Args:
          videos (`VideoInput`):
              Iterable of videos (numpy arrays or torch tensors).
          input_data_format (`str` or `ChannelDimension`, *optional*):
              Channel layout applied to every video. If unset, the layout is inferred separately for each
              video with `infer_channel_dimension_format`.
          device (`str`, *optional*):
              Device the videos are moved to.

      Returns:
          `list[torch.Tensor]`: The prepared videos, in input order.
      """
      processed_videos = []
      for video in videos:
          # `make_batched_videos` always returns a 4D array per video
          if isinstance(video, np.ndarray):
              # not using tvF.to_tensor as it doesn't handle (C, H, W) numpy arrays
              video = torch.from_numpy(video).contiguous()

          # Infer the channel dimension format per video if not provided. The inferred value is kept in
          # a local so that the layout of the first video is not silently applied to the rest of the batch.
          video_format = input_data_format
          if video_format is None:
              video_format = infer_channel_dimension_format(video)

          if video_format == ChannelDimension.LAST:
              video = video.permute(0, 3, 1, 2).contiguous()

          if device is not None:
              video = video.to(device)

          processed_videos.append(video)
      return processed_videos
  @add_start_docstrings(
      BASE_VIDEO_PROCESSOR_DOCSTRING,
  )
  def preprocess(
      self,
      videos: VideoInput,
      **kwargs: Unpack[VideosKwargs],
  ) -> BatchFeature:
      # Reject unknown keyword arguments, then type-check the ones that were passed.
      accepted_keys = [*self.valid_kwargs.__annotations__, "return_tensors"]
      validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=accepted_keys)
      validate_typed_dict(self.valid_kwargs, kwargs)

      # Fill in everything the caller left out: the instance attribute of the same name is used
      # as the default, or `None` when the instance has no such attribute.
      for name in self.valid_kwargs.__annotations__:
          kwargs.setdefault(name, getattr(self, name, None))

      # These four are consumed by decoding / input preparation and must not reach the later steps.
      input_data_format, do_sample_frames, device, video_metadata = (
          kwargs.pop(key) for key in ("input_data_format", "do_sample_frames", "device", "video_metadata")
      )

      sample_indices_fn = None
      if do_sample_frames:
          sample_indices_fn = partial(self.sample_frames, **kwargs)

      videos, video_metadata = self._decode_and_sample_videos(
          videos,
          video_metadata=video_metadata,
          do_sample_frames=do_sample_frames,
          sample_indices_fn=sample_indices_fn,
      )
      videos = self._prepare_input_videos(videos=videos, input_data_format=input_data_format, device=device)

      kwargs = self._standardize_kwargs(**kwargs)
      self._validate_preprocess_kwargs(**kwargs)

      # Pop kwargs that are not needed in _preprocess
      kwargs.pop("data_format")
      return_metadata = kwargs.pop("return_metadata")

      batch = self._preprocess(videos=videos, **kwargs)
      if return_metadata:
          batch["video_metadata"] = video_metadata
      return batch
  def _preprocess(
      self,
      videos: list["torch.Tensor"],
      do_convert_rgb: bool,
      do_resize: bool,
      size: SizeDict,
      resample: "PILImageResampling | tvF.InterpolationMode | int | None",
      do_center_crop: bool,
      crop_size: SizeDict,
      do_rescale: bool,
      rescale_factor: float,
      do_normalize: bool,
      image_mean: float | list[float] | None,
      image_std: float | list[float] | None,
      return_tensors: str | TensorType | None = None,
      **kwargs,
  ) -> BatchFeature:
      """
      Run the transformation pipeline on already prepared videos.

      Videos are grouped by shape so each group is transformed as one stacked batch. The first pass applies
      the optional RGB conversion and resize; the second pass applies the optional center crop followed by
      `self.rescale_and_normalize`. After each pass the videos are put back in their original order.

      Returns:
          `BatchFeature`: Holds the processed videos under `"pixel_values_videos"`, converted according to
          `return_tensors`.
      """
      # Pass 1: colour conversion and resizing, batched per input shape.
      by_shape, shape_index = group_videos_by_shape(videos)
      after_resize = {}
      for shape, batch in by_shape.items():
          if do_convert_rgb:
              batch = self.convert_to_rgb(batch)
          if do_resize:
              batch = self.resize(batch, size=size, resample=resample)
          after_resize[shape] = batch
      resized_videos = reorder_videos(after_resize, shape_index)

      # Pass 2: regroup, since shapes may have changed (or may still differ when do_resize is False).
      by_shape, shape_index = group_videos_by_shape(resized_videos)
      finished = {}
      for shape, batch in by_shape.items():
          cropped = self.center_crop(batch, crop_size) if do_center_crop else batch
          # Fused rescale and normalize
          finished[shape] = self.rescale_and_normalize(
              cropped, do_rescale, rescale_factor, do_normalize, image_mean, image_std
          )
      processed_videos = reorder_videos(finished, shape_index)

      return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)
  @classmethod
  def from_pretrained(
      cls,
      pretrained_model_name_or_path: str | os.PathLike,
      cache_dir: str | os.PathLike | None = None,
      force_download: bool = False,
      local_files_only: bool = False,
      token: str | bool | None = None,
      revision: str = "main",
      **kwargs,
  ):
      r"""
      Instantiate a video processor of this class from a saved video processor configuration.

      Args:
          pretrained_model_name_or_path (`str` or `os.PathLike`):
              This can be either:
              - a string, the *model id* of a pretrained video processor hosted inside a model repo on
                huggingface.co.
              - a path to a *directory* containing a video processor file saved using the `save_pretrained`
                method, e.g., `./my_model_directory/`.
              - a path to a saved video processor JSON *file*, e.g.,
                `./my_model_directory/video_preprocessor_config.json`.
          cache_dir (`str` or `os.PathLike`, *optional*):
              Path to a directory in which a downloaded pretrained model video processor should be cached if the
              standard cache should not be used.
          force_download (`bool`, *optional*, defaults to `False`):
              Whether or not to force to (re-)download the video processor files and override the cached versions
              if they exist.
          local_files_only (`bool`, *optional*, defaults to `False`):
              Forwarded to the file resolution step together with the other download options.
          token (`str` or `bool`, *optional*):
              The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
              the token generated when running `hf auth login` (stored in `~/.huggingface`). Only forwarded when
              it is not `None`.
          revision (`str`, *optional*, defaults to `"main"`):
              The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
              git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
              identifier allowed by git.

              <Tip>

              To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

              </Tip>

          proxies (`dict[str, str]`, *optional*, passed through `kwargs`):
              A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
              'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
          return_unused_kwargs (`bool`, *optional*, defaults to `False`, passed through `kwargs`):
              If `False`, then this function returns just the final video processor object. If `True`, then this
              functions returns a `Tuple(video_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
              consisting of the key/value pairs whose keys are not video processor attributes: i.e., the part of
              `kwargs` which has not been used to update `video_processor` and is otherwise ignored.
          subfolder (`str`, *optional*, defaults to `""`, passed through `kwargs`):
              In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
              specify the folder name here.
          kwargs (`dict[str, Any]`, *optional*):
              The values in kwargs of any keys which are video processor attributes will be used to override the
              loaded values. Behavior concerning key/value pairs whose keys are *not* video processor attributes is
              controlled by the `return_unused_kwargs` keyword parameter.

      Returns:
          Whatever `cls.from_dict` returns for the resolved configuration: a video processor of this class
          (see `return_unused_kwargs` above for the tuple form).

      Examples:

      ```python
      # The base class is not meant to be instantiated directly, so the examples use a
      # derived class: *LlavaOnevisionVideoProcessor*
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained(
          "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
      )  # Download video_processing_config from huggingface.co and cache.
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained(
          "./test/saved_model/"
      )  # E.g. video processor (or model) was saved using *save_pretrained('./test/saved_model/')*
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained("./test/saved_model/video_preprocessor_config.json")
      video_processor = LlavaOnevisionVideoProcessor.from_pretrained(
          "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", do_normalize=False, foo=False
      )
      assert video_processor.do_normalize is False
      video_processor, unused_kwargs = LlavaOnevisionVideoProcessor.from_pretrained(
          "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", do_normalize=False, foo=False, return_unused_kwargs=True
      )
      assert video_processor.do_normalize is False
      assert unused_kwargs == {"foo": False}
      ```"""
      # Fold the explicit download options into kwargs so the resolver receives a single mapping.
      kwargs.update(
          cache_dir=cache_dir,
          force_download=force_download,
          local_files_only=local_files_only,
          revision=revision,
      )
      if token is not None:
          kwargs["token"] = token

      config_dict, remaining_kwargs = cls.get_video_processor_dict(pretrained_model_name_or_path, **kwargs)
      return cls.from_dict(config_dict, **remaining_kwargs)
  def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
      """
      Save a video processor object to the directory `save_directory`, so that it can be re-loaded using the
      `from_pretrained` class method.

      Args:
          save_directory (`str` or `os.PathLike`):
              Directory where the video processor JSON file will be saved (will be created if it does not exist).
          push_to_hub (`bool`, *optional*, defaults to `False`):
              Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
              repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
              namespace).
          kwargs (`dict[str, Any]`, *optional*):
              Only used when `push_to_hub=True`: `commit_message` and `repo_id` are read from it, the remaining
              entries are passed to `create_repo`, and `token` is also used for the upload.

      Returns:
          `list[str]`: A one-element list with the path of the written video processor file.

      Raises:
          AssertionError: If `save_directory` points to an existing file.
      """
      if os.path.isfile(save_directory):
          raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

      os.makedirs(save_directory, exist_ok=True)

      if push_to_hub:
          commit_message = kwargs.pop("commit_message", None)
          repo_id = kwargs.pop("repo_id", None)
          if repo_id is None:
              # Default to the last path component. `os.fspath` makes this work for `os.PathLike`
              # inputs and `normpath` drops a trailing separator that would otherwise yield "".
              repo_id = os.path.basename(os.path.normpath(os.fspath(save_directory)))
          repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
          # Snapshot taken before writing so that only files modified below are uploaded.
          files_timestamps = self._get_files_timestamps(save_directory)

      # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
      # loaded from the Hub.
      if self._auto_class is not None:
          custom_object_save(self, save_directory, config=self)

      # If we save using the predefined names, we can load using `from_pretrained`
      output_video_processor_file = os.path.join(save_directory, VIDEO_PROCESSOR_NAME)

      self.to_json_file(output_video_processor_file)
      logger.info(f"Video processor saved in {output_video_processor_file}")

      if push_to_hub:
          self._upload_modified_files(
              save_directory,
              repo_id,
              files_timestamps,
              commit_message=commit_message,
              token=kwargs.get("token"),
          )

      return [output_video_processor_file]
  @classmethod
  def get_video_processor_dict(
      cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
  ) -> tuple[dict[str, Any], dict[str, Any]]:
      """
      From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
      video processor with `from_dict`.

      Resolution order: the `"video_processor"` entry of the processor config (`PROCESSOR_NAME`) if present, then
      the dedicated video processor config (`VIDEO_PROCESSOR_NAME`), then the image processor config
      (`IMAGE_PROCESSOR_NAME`). If `pretrained_model_name_or_path` is an existing file, that file is loaded
      directly.

      Parameters:
          pretrained_model_name_or_path (`str` or `os.PathLike`):
              The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
          subfolder (`str`, *optional*, defaults to `""`):
              In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
              specify the folder name here.
          kwargs:
              `cache_dir`, `force_download`, `proxies`, `token`, `local_files_only`, `revision`, `subfolder`,
              `_from_pipeline` and `_from_auto` are consumed here; everything else is returned untouched.

      Returns:
          `tuple[Dict, Dict]`: The dictionary that will be used to instantiate the video processor object, and the
          keyword arguments that were not consumed.

      Raises:
          OSError: If no usable configuration can be resolved or loaded.
      """
      cache_dir = kwargs.pop("cache_dir", None)
      force_download = kwargs.pop("force_download", False)
      proxies = kwargs.pop("proxies", None)
      token = kwargs.pop("token", None)
      local_files_only = kwargs.pop("local_files_only", False)
      revision = kwargs.pop("revision", None)
      subfolder = kwargs.pop("subfolder", "")

      from_pipeline = kwargs.pop("_from_pipeline", None)
      from_auto_class = kwargs.pop("_from_auto", False)

      user_agent = {"file_type": "video processor", "from_auto_class": from_auto_class}
      if from_pipeline is not None:
          user_agent["using_pipeline"] = from_pipeline

      if is_offline_mode() and not local_files_only:
          logger.info("Offline mode: forcing local_files_only=True")
          local_files_only = True

      pretrained_model_name_or_path = str(pretrained_model_name_or_path)
      is_local = os.path.isdir(pretrained_model_name_or_path)
      video_processor_file = VIDEO_PROCESSOR_NAME
      # Single definition of the generic failure message, used by both error paths below.
      load_error_message = (
          f"Can't load video processor for '{pretrained_model_name_or_path}'. If you were trying to load"
          " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
          f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
          f" directory containing a {video_processor_file} file"
      )

      if os.path.isfile(pretrained_model_name_or_path):
          resolved_video_processor_file = pretrained_model_name_or_path
          resolved_processor_file = None
          is_local = True
      else:
          # Options shared by every `cached_file` lookup; missing files resolve to `None` instead of raising.
          lookup_kwargs = {
              "cache_dir": cache_dir,
              "force_download": force_download,
              "proxies": proxies,
              "local_files_only": local_files_only,
              "token": token,
              "user_agent": user_agent,
              "revision": revision,
              "subfolder": subfolder,
              "_raise_exceptions_for_missing_entries": False,
          }
          try:
              # Try to load with a new config name first and if not successful try with the old file name
              # NOTE: we save all processor configs as nested dict in PROCESSOR_NAME from v5, which is the standard
              resolved_processor_file = cached_file(
                  pretrained_model_name_or_path, filename=PROCESSOR_NAME, **lookup_kwargs
              )
              # Both candidate files are looked up; the first one that resolves wins.
              resolved_video_processor_files = [
                  resolved_file
                  for filename in [video_processor_file, IMAGE_PROCESSOR_NAME]
                  if (
                      resolved_file := cached_file(pretrained_model_name_or_path, filename=filename, **lookup_kwargs)
                  )
                  is not None
              ]
              resolved_video_processor_file = (
                  resolved_video_processor_files[0] if resolved_video_processor_files else None
              )
          except OSError:
              # Raise any OS error raise by `cached_file`. It will have a helpful error message adapted to
              # the original exception.
              raise
          except Exception as err:
              # For any other exception, we throw a generic error but keep the original cause attached.
              raise OSError(load_error_message) from err

      # Load video_processor dict. Priority goes as (nested config if found -> video processor config -> image processor config)
      # We are downloading both configs because almost all models have a `processor_config.json` but
      # not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
      video_processor_dict = None
      loaded_file = None  # the file the returned dict actually came from, for logging
      if resolved_processor_file is not None:
          processor_dict = safe_load_json_file(resolved_processor_file)
          if "video_processor" in processor_dict:
              video_processor_dict = processor_dict["video_processor"]
              loaded_file = resolved_processor_file

      if resolved_video_processor_file is not None and video_processor_dict is None:
          video_processor_dict = safe_load_json_file(resolved_video_processor_file)
          loaded_file = resolved_video_processor_file

      if video_processor_dict is None:
          raise OSError(load_error_message)

      if is_local:
          logger.info(f"loading configuration file {loaded_file}")
      else:
          logger.info(f"loading configuration file {os.path.basename(loaded_file)} from cache at {loaded_file}")

      return video_processor_dict, kwargs
  @classmethod
  def from_dict(cls, video_processor_dict: dict[str, Any], **kwargs):
      """
      Instantiates a type of [`~video_processing_utils.VideoProcessorBase`] from a Python dictionary of parameters.

      Args:
          video_processor_dict (`dict[str, Any]`):
              Dictionary that will be used to instantiate the video processor object. Such a dictionary can be
              retrieved from a pretrained checkpoint by leveraging the
              [`~video_processing_utils.VideoProcessorBase.to_dict`] method. The dictionary is copied before
              use, so the caller's object is not modified.
          kwargs (`dict[str, Any]`):
              Additional parameters from which to initialize the video processor object. Keys listed in
              `cls.valid_kwargs.__annotations__` override the matching entries of `video_processor_dict`.
              The special key `return_unused_kwargs` (defaults to `False`) is consumed here and controls the
              return value.

      Returns:
          [`~video_processing_utils.VideoProcessorBase`]: The video processor object instantiated from those
          parameters. If `return_unused_kwargs` is truthy, a tuple `(video_processor, kwargs)` is returned
          instead, where `kwargs` holds whatever was not popped by this method (keys forwarded through
          `valid_kwargs` are read, not popped, so they remain in it).
      """
      video_processor_dict = video_processor_dict.copy()
      return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
      video_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
      video_processor = cls(**video_processor_dict)

      # Apply extra kwargs to instance (BC for remote code, e.g. phi4_multimodal).
      # Iterate over a snapshot of the keys because matching entries are popped from `kwargs` in the loop.
      extra_keys = []
      for key in reversed(list(kwargs.keys())):
          if hasattr(video_processor, key) and key not in cls.valid_kwargs.__annotations__:
              setattr(video_processor, key, kwargs.pop(key, None))
              extra_keys.append(key)
      if extra_keys:
          # The message names the video processor and its kwargs TypedDict; the previous wording was copied
          # from the image processor and pointed users at the wrong base class.
          logger.warning_once(
              f"Video processor {cls.__name__}: kwargs {extra_keys} were applied for backward compatibility. "
              f"To avoid this warning, add them to valid_kwargs: create a custom TypedDict extending "
              f"VideosKwargs with these keys and set it as the `valid_kwargs` class attribute."
          )

      logger.info(f"Video processor {video_processor}")
      if return_unused_kwargs:
          return video_processor, kwargs
      else:
          return video_processor
  def to_dict(self) -> dict[str, Any]:
      """
      Serializes this instance to a Python dictionary.

      Returns:
          `dict[str, Any]`: The parent class serialization, with any `image_processor_type` entry removed and
          `video_processor_type` set to the name of this instance's class.
      """
      serialized = super().to_dict()
      # Replace the image-processor tag (if the parent emitted one) with the video-processor tag.
      if "image_processor_type" in serialized:
          del serialized["image_processor_type"]
      serialized["video_processor_type"] = self.__class__.__name__
      return serialized
  def to_json_string(self) -> str:
      """
      Serializes this instance to a JSON string.

      Top-level `np.ndarray` values of the dictionary returned by `to_dict()` are converted to lists so that
      `json` can encode them.

      Returns:
          `str`: String containing all the attributes that make up this video processor instance in JSON
          format (2-space indent, sorted keys, trailing newline).
      """
      payload = self.to_dict()
      # Collect the keys first, then rewrite them in place on the same dict object.
      array_keys = [name for name, item in payload.items() if isinstance(item, np.ndarray)]
      for name in array_keys:
          payload[name] = payload[name].tolist()
      serialized = json.dumps(payload, indent=2, sort_keys=True)
      return serialized + "\n"
  665. def to_json_file(self, json_file_path: str | os.PathLike):
  666. """
  667. Save this instance to a JSON file.
  668. Args:
  669. json_file_path (`str` or `os.PathLike`):
  670. Path to the JSON file in which this image_processor instance's parameters will be saved.
  671. """
  672. with open(json_file_path, "w", encoding="utf-8") as writer:
  673. writer.write(self.to_json_string())
  def __repr__(self):
      """Return the class name followed by a space and the JSON string of this instance."""
      return "{} {}".format(self.__class__.__name__, self.to_json_string())
  @classmethod
  def from_json_file(cls, json_file: str | os.PathLike):
      """
      Instantiates a video processor of type [`~video_processing_utils.VideoProcessorBase`] from the path to a JSON
      file of parameters.

      Args:
          json_file (`str` or `os.PathLike`):
              Path to the JSON file containing the parameters.

      Returns:
          A video processor of type [`~video_processing_utils.VideoProcessorBase`]: The video_processor object
          instantiated from that JSON file. The decoded JSON object is passed to the constructor as keyword
          arguments.
      """
      with open(json_file, encoding="utf-8") as handle:
          init_kwargs = json.load(handle)
      return cls(**init_kwargs)
  @classmethod
  def register_for_auto_class(cls, auto_class="AutoVideoProcessor"):
      """
      Register this class with a given auto class. This should only be used for custom video processors as the ones
      in the library are already mapped with `AutoVideoProcessor`.

      <Tip warning={true}>

      This API is experimental and may have some slight breaking changes in the next releases.

      </Tip>

      Args:
          auto_class (`str` or `type`, *optional*, defaults to `"AutoVideoProcessor"`):
              The auto class to register this new video processor with. A class is reduced to its `__name__`.

      Raises:
          ValueError: If `transformers.models.auto` has no attribute with that name.
      """
      # Normalize to the class name first; only the name is stored on `cls`.
      auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

      import transformers.models.auto as auto_module

      if hasattr(auto_module, auto_class_name):
          cls._auto_class = auto_class_name
          return
      raise ValueError(f"{auto_class_name} is not a valid auto class.")
  def fetch_videos(self, video_url_or_urls: str | list[str] | list[list[str]], sample_indices_fn=None):
      """
      Load one video, or a (possibly nested) list of videos, through `load_video`.

      A non-list input is forwarded to `load_video` and its result is returned unchanged. A list input is
      processed element by element (recursively) and the per-element results are transposed with `zip`, so the
      return value is a list of tuples grouping the i-th component of every element's result.
      NOTE(review): this presumably yields `(videos, metadata)` groups - confirm against `load_video`.

      Args:
          video_url_or_urls (`str`, `list[str]` or `list[list[str]]`):
              A single url/path or a list (of lists) of them.
          sample_indices_fn (*optional*):
              Passed through unchanged to `load_video`.
      """
      # Prefer torchcodec; fall back to torchvision (with a warning) when it is not installed.
      if is_torchcodec_available():
          backend = "torchcodec"
      else:
          warnings.warn(
              "`torchcodec` is not installed and cannot be used to decode the video by default. "
              "Falling back to `torchvision`. Note that `torchvision` decoding is deprecated and will be removed in future versions. "
          )
          backend = "torchvision"

      if not isinstance(video_url_or_urls, list):
          return load_video(video_url_or_urls, backend=backend, sample_indices_fn=sample_indices_fn)

      per_item_results = [
          self.fetch_videos(item, sample_indices_fn=sample_indices_fn) for item in video_url_or_urls
      ]
      return list(zip(*per_item_results))
  # Rebind `push_to_hub` to the result of `copy_func` before touching its docstring.
  # NOTE(review): presumably this gives the class its own function object so the docstring edit below does
  # not leak into the class it was inherited from - confirm against `copy_func`.
  BaseVideoProcessor.push_to_hub = copy_func(BaseVideoProcessor.push_to_hub)
  # The docstring can be None, in which case there is nothing to format.
  if BaseVideoProcessor.push_to_hub.__doc__ is not None:
      BaseVideoProcessor.push_to_hub.__doc__ = BaseVideoProcessor.push_to_hub.__doc__.format(
          object="video processor",
          object_class="AutoVideoProcessor",
          object_files="video processor file",
      )