processing_utils.py 98 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017
  1. # Copyright 2022 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. Processing saving/loading class for common processors.
  16. """
  17. import bisect
  18. import copy
  19. import inspect
  20. import json
  21. import os
  22. import sys
  23. import typing
  24. from dataclasses import dataclass
  25. from pathlib import Path
  26. from typing import Annotated, Any, Literal, TypedDict, TypeVar, Union
  27. import numpy as np
  28. import typing_extensions
  29. from huggingface_hub import create_repo, is_offline_mode
  30. from huggingface_hub.dataclasses import validate_typed_dict
  31. from huggingface_hub.errors import EntryNotFoundError
  32. from .audio_utils import AudioInput, load_audio
  33. from .dynamic_module_utils import custom_object_save
  34. from .feature_extraction_utils import BatchFeature
  35. from .image_utils import ChannelDimension, ImageInput, is_vision_available
  36. from .tokenization_utils_base import (
  37. PaddingStrategy,
  38. PreTokenizedInput,
  39. PreTrainedTokenizerBase,
  40. TextInput,
  41. TruncationStrategy,
  42. )
  43. from .utils import (
  44. AUDIO_TOKENIZER_NAME,
  45. CHAT_TEMPLATE_DIR,
  46. CHAT_TEMPLATE_FILE,
  47. LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE,
  48. PROCESSOR_NAME,
  49. PushToHubMixin,
  50. TensorType,
  51. cached_file,
  52. copy_func,
  53. direct_transformers_import,
  54. is_torch_available,
  55. list_repo_templates,
  56. logging,
  57. )
  58. from .utils.chat_template_utils import _get_template_variables, render_jinja_template
  59. from .utils.type_validators import (
  60. device_validator,
  61. image_size_validator,
  62. padding_validator,
  63. positive_any_number,
  64. positive_int,
  65. resampling_validator,
  66. tensor_type_validator,
  67. truncation_validator,
  68. video_metadata_validator,
  69. )
  70. from .video_utils import VideoInput, VideoMetadataType
  71. if is_torch_available():
  72. import torch
  73. from .modeling_utils import PreTrainedAudioTokenizerBase
  74. if is_vision_available():
  75. from .image_utils import PILImageResampling
  76. logger = logging.get_logger(__name__)
  77. # type hinting: specifying the type of processor class that inherits from ProcessorMixin
  78. SpecificProcessorType = TypeVar("SpecificProcessorType", bound="ProcessorMixin")
  79. # Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
  80. transformers_module = direct_transformers_import(Path(__file__).parent)
  81. class _LazyAutoProcessorMapping(dict):
  82. """
  83. Lazy dictionary to avoid circular imports.
  84. The mapping names are only imported when accessed.
  85. """
  86. _MAPPING_NAMES = {
  87. "image_processor": ("transformers.models.auto.image_processing_auto", "AutoImageProcessor"),
  88. "video_processor": ("transformers.models.auto.video_processing_auto", "AutoVideoProcessor"),
  89. "feature_extractor": ("transformers.models.auto.feature_extraction_auto", "AutoFeatureExtractor"),
  90. "audio_processor": ("transformers.models.auto.feature_extraction_auto", "AutoFeatureExtractor"),
  91. "tokenizer": ("transformers.models.auto.tokenization_auto", "AutoTokenizer"),
  92. }
  93. def __getitem__(self, key):
  94. if key not in self._MAPPING_NAMES:
  95. raise KeyError(key)
  96. module_name, attr_name = self._MAPPING_NAMES[key]
  97. module = __import__(module_name, fromlist=[attr_name])
  98. return getattr(module, attr_name)
  99. def __contains__(self, key):
  100. return key in self._MAPPING_NAMES
  101. def keys(self):
  102. return self._MAPPING_NAMES.keys()
  103. MODALITY_TO_AUTOPROCESSOR_MAPPING = _LazyAutoProcessorMapping()
  104. MODALITY_TO_BASE_CLASS_MAPPING = {
  105. "audio_tokenizer": (
  106. "HiggsAudioV2TokenizerModel",
  107. "DacModel",
  108. ), # TODO: @eustlb, to be replaced with PreTrainedAudioTokenizerBase
  109. "audio_processor": "FeatureExtractionMixin",
  110. "tokenizer": ("PreTrainedTokenizerBase", "MistralCommonBackend"),
  111. "feature_extractor": "FeatureExtractionMixin",
  112. "image_processor": "ImageProcessingMixin",
  113. "video_processor": "BaseVideoProcessor",
  114. }
  115. def _get_modality_for_attribute(attribute_name: str) -> str:
  116. """
  117. Get the canonical modality type for a given attribute name.
  118. For example:
  119. - "image_processor" -> "image_processor"
  120. - "encoder_image_processor" -> "image_processor"
  121. - "text_tokenizer" -> "tokenizer"
  122. - "my_feature_extractor" -> "feature_extractor"
  123. """
  124. for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys():
  125. if modality in attribute_name:
  126. return modality
  127. raise ValueError(
  128. f"Cannot determine modality for attribute '{attribute_name}'. "
  129. f"Attribute name must contain one of: {list(MODALITY_TO_AUTOPROCESSOR_MAPPING.keys())}"
  130. )
  131. if sys.version_info >= (3, 11):
  132. Unpack = typing.Unpack
  133. else:
  134. Unpack = typing_extensions.Unpack
  135. class TextKwargs(TypedDict, total=False):
  136. """
  137. Keyword arguments for text processing. For extended documentation, check out tokenization_utils_base methods and
  138. docstrings associated.
  139. Attributes:
  140. add_special_tokens (`bool`, *optional*)
  141. Whether or not to add special tokens when encoding the sequences.
  142. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*)
  143. Activates and controls padding.
  144. truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*):
  145. Activates and controls truncation.
  146. max_length (`int`, *optional*):
  147. Controls the maximum length to use by one of the truncation/padding parameters.
  148. stride (`int`, *optional*):
  149. If set, the overflowing tokens will contain some tokens from the end of the truncated sequence.
  150. is_split_into_words (`bool`, *optional*):
  151. Whether or not the input is already pre-tokenized.
  152. pad_to_multiple_of (`int`, *optional*):
  153. If set, will pad the sequence to a multiple of the provided value.
  154. return_token_type_ids (`bool`, *optional*):
  155. Whether to return token type IDs.
  156. return_attention_mask (`bool`, *optional*):
  157. Whether to return the attention mask.
  158. return_overflowing_tokens (`bool`, *optional*):
  159. Whether or not to return overflowing token sequences.
  160. return_special_tokens_mask (`bool`, *optional*):
  161. Whether or not to return special tokens mask information.
  162. return_offsets_mapping (`bool`, *optional*):
  163. Whether or not to return `(char_start, char_end)` for each token.
  164. return_length (`bool`, *optional*):
  165. Whether or not to return the lengths of the encoded inputs.
  166. verbose (`bool`, *optional*):
  167. Whether or not to print more information and warnings.
  168. padding_side (`str`, *optional*):
  169. The side on which padding will be applied.
  170. return_mm_token_type_ids (`bool`, *optional*):
  171. Whether to return multimodal token type ids indicating mm placeholder token positions.
  172. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  173. If set, will return tensors of a particular framework. Acceptable values are:
  174. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  175. - `'np'`: Return NumPy `np.ndarray` objects.
  176. """
  177. text_pair: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None
  178. text_target: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None
  179. text_pair_target: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None
  180. add_special_tokens: bool | None
  181. padding: Annotated[bool | str | PaddingStrategy | None, padding_validator()]
  182. truncation: Annotated[bool | str | TruncationStrategy | None, truncation_validator()]
  183. max_length: Annotated[int | None, positive_int()]
  184. stride: Annotated[int | None, positive_int()]
  185. is_split_into_words: bool | None
  186. pad_to_multiple_of: Annotated[int | None, positive_int()]
  187. return_token_type_ids: bool | None
  188. return_attention_mask: bool | None
  189. return_overflowing_tokens: bool | None
  190. return_special_tokens_mask: bool | None
  191. return_offsets_mapping: bool | None
  192. return_length: bool | None
  193. verbose: bool | None
  194. padding_side: Literal["left", "right"] | None
  195. return_mm_token_type_ids: bool | None
  196. return_tensors: Annotated[str | TensorType | None, tensor_type_validator()]
  197. class ImagesKwargs(TypedDict, total=False):
  198. """
  199. Keyword arguments for image processing. For extended documentation, check the appropriate ImageProcessor
  200. class methods and docstrings.
  201. Attributes:
  202. do_convert_rgb (`bool`):
  203. Whether to convert the image to RGB format.
  204. do_resize (`bool`, *optional*):
  205. Whether to resize the image.
  206. size (`dict[str, int]`, *optional*):
  207. Resize the shorter side of the input to `size["shortest_edge"]`.
  208. default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
  209. Whether to default to a square when resizing, if size is an int.
  210. crop_size (`dict[str, int]`, *optional*):
  211. Desired output size when applying center-cropping.
  212. resample (`PILImageResampling`, *optional*):
  213. Resampling filter to use if resizing the image.
  214. do_rescale (`bool`, *optional*):
  215. Whether to rescale the image by the specified scale `rescale_factor`.
  216. rescale_factor (`int` or `float`, *optional*):
  217. Scale factor to use if rescaling the image.
  218. do_normalize (`bool`, *optional*):
  219. Whether to normalize the image.
  220. image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*):
  221. Mean to use if normalizing the image.
  222. image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*):
  223. Standard deviation to use if normalizing the image.
  224. do_pad (`bool`, *optional*):
  225. Whether to pad the images in the batch.
  226. pad_size (`dict[str, int]`, *optional*):
  227. The size `{"height": int, "width" int}` to pad the images to.
  228. do_center_crop (`bool`, *optional*):
  229. Whether to center crop the image.
  230. data_format (`ChannelDimension` or `str`, *optional*):
  231. The channel dimension format for the output image.
  232. input_data_format (`ChannelDimension` or `str`, *optional*):
  233. The channel dimension format for the input image.
  234. device (`Union[str, torch.Tensor]`, *optional*):
  235. The device to use for processing (e.g. "cpu", "cuda"), only relevant for torchvision backend.
  236. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  237. If set, will return tensors of a particular framework. Acceptable values are:
  238. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  239. - `'np'`: Return NumPy `np.ndarray` objects.
  240. disable_grouping (`bool`, *optional*):
  241. Whether to group images by shapes when processing or not, only relevant for torchvision backend.
  242. image_seq_length (`int`, *optional*):
  243. The number of image tokens to be used for each image in the input.
  244. Added for backward compatibility but this should be set as a processor attribute in future models.
  245. """
  246. do_convert_rgb: bool | None
  247. do_resize: bool | None
  248. size: Annotated[int | list[int] | tuple[int, ...] | dict[str, int] | None, image_size_validator()]
  249. default_to_square: bool | None
  250. crop_size: Annotated[int | list[int] | tuple[int, ...] | dict[str, int] | None, image_size_validator()]
  251. resample: Annotated[Union["PILImageResampling", int] | None, resampling_validator()]
  252. do_rescale: bool | None
  253. rescale_factor: float | None
  254. do_normalize: bool | None
  255. image_mean: float | list[float] | tuple[float, ...] | None
  256. image_std: float | list[float] | tuple[float, ...] | None
  257. do_pad: bool | None
  258. pad_size: Annotated[int | list[int] | tuple[int, ...] | dict[str, int] | None, image_size_validator()]
  259. do_center_crop: bool | None
  260. data_format: str | ChannelDimension | None
  261. input_data_format: str | ChannelDimension | None
  262. device: Annotated[Union[str, "torch.device"] | None, device_validator()]
  263. return_tensors: Annotated[str | TensorType | None, tensor_type_validator()]
  264. disable_grouping: bool | None
  265. image_seq_length: int | None
  266. class VideosKwargs(TypedDict, total=False):
  267. """
  268. Keyword arguments for video processing.
  269. Attributes:
  270. do_convert_rgb (`bool`):
  271. Whether to convert the video to RGB format.
  272. do_resize (`bool`):
  273. Whether to resize the video.
  274. size (`dict[str, int]`, *optional*):
  275. Resize the shorter side of the input to `size["shortest_edge"]`.
  276. default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
  277. Whether to default to a square when resizing, if size is an int.
  278. resample (`PILImageResampling`, *optional*):
  279. Resampling filter to use if resizing the video.
  280. do_rescale (`bool`, *optional*):
  281. Whether to rescale the video by the specified scale `rescale_factor`.
  282. rescale_factor (`int` or `float`, *optional*):
  283. Scale factor to use if rescaling the video.
  284. do_normalize (`bool`, *optional*):
  285. Whether to normalize the video.
  286. image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*):
  287. Mean to use if normalizing the video.
  288. image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*):
  289. Standard deviation to use if normalizing the video.
  290. do_center_crop (`bool`, *optional*):
  291. Whether to center crop the video.
  292. do_pad (`bool`, *optional*):
  293. Whether to pad the images in the batch.
  294. do_sample_frames (`bool`, *optional*):
  295. Whether to sample frames from the video before processing or to process the whole video.
  296. video_metadata (`Union[VideoMetadata, dict]`, *optional*):
  297. Metadata of the video containing information about total duration, fps and total number of frames.
  298. num_frames (`int`, *optional*):
  299. Maximum number of frames to sample when `do_sample_frames=True`.
  300. fps (`int` or `float`, *optional*):
  301. Target frames to sample per second when `do_sample_frames=True`.
  302. crop_size (`dict[str, int]`, *optional*):
  303. Desired output size when applying center-cropping.
  304. data_format (`ChannelDimension` or `str`, *optional*):
  305. The channel dimension format for the output video.
  306. input_data_format (`ChannelDimension` or `str`, *optional*):
  307. The channel dimension format for the input video.
  308. device (`Union[str, torch.Tensor]`, *optional*):
  309. The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
  310. return_metadata (`bool`, *optional*):
  311. Whether to return video metadata or not.
  312. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  313. If set, will return tensors of a particular framework. Acceptable values are:
  314. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  315. - `'np'`: Return NumPy `np.ndarray` objects.
  316. """
  317. do_convert_rgb: bool | None
  318. do_resize: bool | None
  319. size: Annotated[int | list[int] | tuple[int, ...] | dict[str, int] | None, image_size_validator()]
  320. default_to_square: bool | None
  321. resample: Annotated[Union["PILImageResampling", int] | None, resampling_validator()]
  322. do_rescale: bool | None
  323. rescale_factor: float | None
  324. do_normalize: bool | None
  325. image_mean: float | list[float] | tuple[float, ...] | None
  326. image_std: float | list[float] | tuple[float, ...] | None
  327. do_center_crop: bool | None
  328. do_pad: bool | None
  329. crop_size: Annotated[int | list[int] | tuple[int, ...] | dict[str, int] | None, image_size_validator()]
  330. data_format: str | ChannelDimension | None
  331. input_data_format: str | ChannelDimension | None
  332. device: Annotated[Union[str, "torch.device"] | None, device_validator()]
  333. do_sample_frames: bool | None
  334. video_metadata: Annotated[VideoMetadataType | None, video_metadata_validator()]
  335. fps: Annotated[int | float | None, positive_any_number()]
  336. num_frames: Annotated[int | None, positive_int()]
  337. return_metadata: bool | None
  338. return_tensors: Annotated[str | TensorType | None, tensor_type_validator()]
  339. class AudioKwargs(TypedDict, total=False):
  340. """
  341. Keyword arguments for audio processing.
  342. Attributes:
  343. sampling_rate (`int`, *optional*):
  344. The sampling rate at which the `raw_speech` input was sampled.
  345. raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
  346. The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
  347. values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
  348. stereo, i.e. single float per timestep.
  349. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*):
  350. Select a strategy to pad the returned sequences (according to the model's padding side and padding
  351. index) among:
  352. - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  353. sequence if provided).
  354. - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  355. acceptable input length for the model if that argument is not provided.
  356. - `False` or `'do_not_pad'`
  357. max_length (`int`, *optional*):
  358. Maximum length of the returned list and optionally padding length (see above).
  359. truncation (`bool`, *optional*):
  360. Activates truncation to cut input sequences longer than *max_length* to *max_length*.
  361. pad_to_multiple_of (`int`, *optional*):
  362. If set, will pad the sequence to a multiple of the provided value.
  363. return_attention_mask (`bool`, *optional*):
  364. Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
  365. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  366. If set, will return tensors of a particular framework. Acceptable values are:
  367. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  368. - `'np'`: Return NumPy `np.ndarray` objects.
  369. """
  370. sampling_rate: Annotated[int | None, positive_int()]
  371. raw_speech: Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]] | None
  372. padding: Annotated[bool | str | PaddingStrategy | None, padding_validator()]
  373. max_length: Annotated[int | None, positive_int()]
  374. truncation: Annotated[bool | str | TruncationStrategy | None, truncation_validator()]
  375. pad_to_multiple_of: Annotated[int | None, positive_int()]
  376. return_attention_mask: bool | None
  377. return_tensors: Annotated[str | TensorType | None, tensor_type_validator()]
  378. class ProcessingKwargs(TypedDict, total=False):
  379. """
  380. Base class for kwargs passing to processors.
  381. In case a model has specific kwargs that are not present in the base class or default values for existing keys,
  382. it should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
  383. 1) Additional typed keys and that this model requires to process inputs.
  384. 2) Default values for existing keys under a `_defaults` attribute.
  385. New keys have to be defined as follows to ensure type hinting is done correctly.
  386. ```python
  387. # adding a new image kwarg for this model
  388. class ModelImagesKwargs(ImagesKwargs, total=False):
  389. new_image_kwarg: Optional[bool]
  390. class ModelProcessorKwargs(ProcessingKwargs, total=False):
  391. images_kwargs: ModelImagesKwargs
  392. _defaults = {
  393. "images_kwargs: {
  394. "new_image_kwarg": False,
  395. }
  396. "text_kwargs": {
  397. "padding": "max_length",
  398. },
  399. }
  400. ```
  401. For Python 3.8 compatibility, when inheriting from this class and overriding one of the kwargs,
  402. you need to manually update the __annotations__ dictionary. This can be done as follows:
  403. ```python
  404. class CustomProcessorKwargs(ProcessingKwargs, total=False):
  405. images_kwargs: CustomImagesKwargs
  406. CustomProcessorKwargs.__annotations__["images_kwargs"] = CustomImagesKwargs # python 3.8 compatibility
  407. ```
  408. """
  409. _defaults = {}
  410. text_kwargs: TextKwargs = {
  411. **TextKwargs.__annotations__,
  412. }
  413. images_kwargs: ImagesKwargs = {
  414. **ImagesKwargs.__annotations__,
  415. }
  416. videos_kwargs: VideosKwargs = {
  417. **VideosKwargs.__annotations__,
  418. }
  419. audio_kwargs: AudioKwargs = {
  420. **AudioKwargs.__annotations__,
  421. }
  422. class TokenizerChatTemplateKwargs(TypedDict, total=False):
  423. """
  424. NOTE: `TokenizerChatTemplateKwargs` is deprecated and will be removed in future versions
  425. Keyword arguments for tokenizer's `apply_chat_template`, when it is called from within a processor.
  426. tools (`list[Dict]`, *optional*):
  427. A list of tools (callable functions) that will be accessible to the model. If the template does not
  428. support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
  429. giving the name, description and argument types for the tool. See our
  430. [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
  431. for more information.
  432. documents (`list[dict[str, str]]`, *optional*):
  433. A list of dicts representing documents that will be accessible to the model if it is performing RAG
  434. (retrieval-augmented generation). If the template does not support RAG, this argument will have no
  435. effect. We recommend that each document should be a dict containing "title" and "text" keys. Please
  436. see the RAG section of the [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#arguments-for-RAG)
  437. for examples of passing documents with chat templates.
  438. add_generation_prompt (bool, *optional*):
  439. If this is set, a prompt with the token(s) that indicate
  440. the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model.
  441. Note that this argument will be passed to the chat template, and so it must be supported in the
  442. template for this argument to have any effect.
  443. continue_final_message (bool, *optional*):
  444. If this is set, the chat will be formatted so that the final
  445. message in the chat is open-ended, without any EOS tokens. The model will continue this message
  446. rather than starting a new one. This allows you to "prefill" part of
  447. the model's response for it. Cannot be used at the same time as `add_generation_prompt`.
  448. return_assistant_tokens_mask (`bool`, defaults to `False`):
  449. Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant,
  450. the mask will contain 1. For user and system tokens, the mask will contain 0.
  451. This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
  452. reasoning_effort (`str`, *optional*):
  453. The reasoning effort level to use for the model's response. Supported values depend on the model
  454. (e.g. `"none"`, "low"`, `"medium"`, `"high"`). If the template does not support reasoning effort,
  455. this argument will have no effect.
  456. """
  457. tools: list[dict] | None = None
  458. documents: list[dict[str, str]] | None = None
  459. add_generation_prompt: bool | None = False
  460. continue_final_message: bool | None = False
  461. return_assistant_tokens_mask: bool | None = False
  462. reasoning_effort: str | None = None
class ProcessorChatTemplateKwargs(TokenizerChatTemplateKwargs, total=False):
    """
    NOTE: `ProcessorChatTemplateKwargs` is deprecated and will be removed in future versions

    Keyword arguments for processor's `apply_chat_template`. Extends `TokenizerChatTemplateKwargs` with the
    processor-specific options listed below.

    tokenize (`bool`, *optional*, defaults to `False`):
        Whether to tokenize the output or not.
    return_dict (`bool`, defaults to `False`):
        Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
    load_audio_from_video (`bool`, *optional*, defaults to `False`):
        Whether to use the audio track of input video. If `True` the audio track will be loaded and passed to the
        processor. This flag has no effect if the model doesn't support audio modality.
    """

    # Whether the rendered chat should be tokenized.
    tokenize: bool | None = False
    # Whether to return a dictionary with named outputs (only meaningful together with `tokenize`).
    return_dict: bool | None = False
    # Whether the audio track of an input video should be loaded and passed to the processor.
    load_audio_from_video: bool | None = False
class AllKwargsForChatTemplate(TypedDict, total=False):
    "NOTE: `AllKwargsForChatTemplate` is deprecated and will be removed in future versions"

    # Kwargs typed as `ProcessingKwargs` (the per-modality processing options).
    processor_kwargs: ProcessingKwargs
    # Kwargs typed as `ProcessorChatTemplateKwargs` (the chat-template rendering options).
    template_kwargs: ProcessorChatTemplateKwargs
  482. @dataclass
  483. class MultiModalData:
  484. """
  485. Dataclass that holds extra useful data for processing
  486. multimodal data. Processors currently cannot return keys,
  487. unless it is used in model's forward. Thus we have helper
  488. methods that calculate and return useful data from processing
  489. input multimodals (images/videos).
  490. Note that this dataclass is aimed to be used only in vLLM
  491. and we might change its API in the future.
  492. """
  493. num_image_tokens: list[int] | None = None
  494. num_video_tokens: list[int] | None = None
  495. num_audio_tokens: list[int] | None = None
  496. num_image_patches: list[int] | None = None
  497. def __contains__(self, key):
  498. return hasattr(self, key) and getattr(self, key) is not None
  499. def __getitem__(self, key):
  500. if hasattr(self, key):
  501. return getattr(self, key)
  502. raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
  503. class ProcessorMixin(PushToHubMixin):
  504. """
  505. This is a mixin used to provide saving/loading functionality for all processor classes.
  506. """
  507. # Names need to be attr_class for attr in attributes
  508. _auto_class = None
  509. valid_processor_kwargs = ProcessingKwargs
  510. # args have to match the attributes class attribute
  511. def __init__(self, *args, **kwargs):
  512. # First, extract chat template from kwargs. It can never be a positional arg
  513. setattr(self, "chat_template", kwargs.pop("chat_template", None))
  514. self.image_ids = [getattr(self, "image_token_id", None)]
  515. self.video_ids = [getattr(self, "video_token_id", None)]
  516. self.audio_ids = [getattr(self, "audio_token_id", None)]
  517. # Check audio tokenizer for its class but do not treat it as attr to avoid saving weights
  518. if (audio_tokenizer := kwargs.pop("audio_tokenizer", None)) is not None:
  519. proper_class = self.check_argument_for_proper_class("audio_tokenizer", audio_tokenizer)
  520. if not (is_torch_available() and isinstance(audio_tokenizer, PreTrainedAudioTokenizerBase)):
  521. raise ValueError(
  522. f"Tried to use `{proper_class}` for audio tokenization. However, this class is not"
  523. " registered for audio tokenization."
  524. )
  525. setattr(self, "audio_tokenizer", audio_tokenizer)
  526. # Sanitize args and kwargs
  527. for key in kwargs:
  528. if key not in self.get_attributes():
  529. raise TypeError(f"Unexpected keyword argument {key}.")
  530. for arg, attribute_name in zip(args, self.get_attributes()):
  531. if attribute_name in kwargs:
  532. raise TypeError(f"Got multiple values for argument {attribute_name}.")
  533. else:
  534. kwargs[attribute_name] = arg
  535. if len(kwargs) != len(self.get_attributes()):
  536. raise ValueError(
  537. f"This processor requires {len(self.get_attributes())} arguments: {', '.join(self.get_attributes())}. Got "
  538. f"{len(args)} arguments instead."
  539. )
  540. # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
  541. for attribute_name, arg in kwargs.items():
  542. self.check_argument_for_proper_class(attribute_name, arg)
  543. setattr(self, attribute_name, arg)
  544. def __call__(
  545. self,
  546. images: ImageInput | None = None,
  547. text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
  548. videos: VideoInput | None = None,
  549. audio: AudioInput | None = None,
  550. **kwargs: Unpack[ProcessingKwargs],
  551. ):
  552. """
  553. Main method to prepare for model inputs. This method forwards the each modality argument to its own processor
  554. along with `kwargs`. Please refer to the docstring of the each processor attributes for more information.
  555. Args:
  556. images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
  557. The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
  558. tensor. Both channels-first and channels-last formats are supported.
  559. text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
  560. The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
  561. (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
  562. `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
  563. videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
  564. The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
  565. tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
  566. audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
  567. The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
  568. tensor.
  569. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  570. If set, will return tensors of a particular framework. Acceptable values are:
  571. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  572. - `'np'`: Return NumPy `np.ndarray` objects.
  573. Returns:
  574. [`BatchFeature`]: A [`BatchFeature`] object with processed inputs in a dict format.
  575. """
  576. if "audios" in kwargs and audio is None:
  577. raise ValueError("You passed keyword argument `audios` which is deprecated. Please use `audio` instead.")
  578. if images is None and text is None and videos is None and audio is None:
  579. raise ValueError(f"You need to provide at least one input to call {self.__class__.__name__}")
  580. kwargs = self._merge_kwargs(
  581. self.valid_processor_kwargs,
  582. tokenizer_init_kwargs=self.tokenizer.init_kwargs if hasattr(self, "tokenizer") else {},
  583. **kwargs,
  584. )
  585. attribute_to_kwargs = {
  586. "tokenizer": (text, "text_kwargs"),
  587. "image_processor": (images, "images_kwargs"),
  588. "video_processor": (videos, "videos_kwargs"),
  589. "feature_extractor": (audio, "audio_kwargs"),
  590. }
  591. outputs = {}
  592. for attribute_name in self.get_attributes():
  593. attribute = getattr(self, attribute_name, None)
  594. input_data, input_kwargs = attribute_to_kwargs[attribute_name]
  595. if input_data is not None and attribute is not None:
  596. attribute_output = attribute(input_data, **kwargs[input_kwargs])
  597. outputs.update(attribute_output)
  598. return BatchFeature(outputs)
  599. def check_argument_for_proper_class(self, argument_name, argument):
  600. """
  601. Checks the passed argument's class against the expected transformers class. In case of an unexpected
  602. mismatch between expected and actual class, an error is raise. Otherwise, the proper retrieved class
  603. is returned.
  604. """
  605. # If the exact attribute name is not in the mapping, use its canonical modality
  606. # (e.g., "encoder_tokenizer" -> "tokenizer")
  607. if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING:
  608. argument_name = _get_modality_for_attribute(argument_name)
  609. class_name = MODALITY_TO_BASE_CLASS_MAPPING.get(argument_name)
  610. if isinstance(class_name, tuple):
  611. proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None)
  612. else:
  613. proper_class = self.get_possibly_dynamic_module(class_name)
  614. if not isinstance(argument, proper_class):
  615. raise TypeError(
  616. f"Received a {type(argument).__name__} for argument {argument_name}, but a {class_name} was expected."
  617. )
  618. return proper_class
  619. def to_dict(self) -> dict[str, Any]:
  620. """
  621. Serializes this instance to a Python dictionary.
  622. Returns:
  623. `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
  624. """
  625. # Exclude tokenizer attributes before deepcopying to avoid copying large vocab/token structures.
  626. tokenizer_attributes = set()
  627. for attribute in self.__class__.get_attributes():
  628. if attribute in self.__dict__:
  629. modality = _get_modality_for_attribute(attribute)
  630. if modality == "tokenizer":
  631. tokenizer_attributes.add(attribute)
  632. dict_to_copy = {k: v for k, v in self.__dict__.items() if k not in tokenizer_attributes}
  633. output = copy.deepcopy(dict_to_copy)
  634. # Get the kwargs in `__init__`.
  635. sig = inspect.signature(self.__init__)
  636. # Only save the attributes that are presented in the kwargs of `__init__`.
  637. # or in the attributes
  638. attrs_to_save = list(sig.parameters) + self.__class__.get_attributes()
  639. # extra attributes to be kept
  640. attrs_to_save += ["auto_map"]
  641. if "chat_template" in output:
  642. del output["chat_template"]
  643. def cast_array_to_list(dictionary):
  644. """
  645. Numpy arrays are not serialiazable but can be in pre-processing dicts.
  646. This function casts arrays to list, recusring through the nested configs as well.
  647. """
  648. for key, value in dictionary.items():
  649. if isinstance(value, np.ndarray):
  650. dictionary[key] = value.tolist()
  651. elif isinstance(value, dict):
  652. dictionary[key] = cast_array_to_list(value)
  653. return dictionary
  654. # Special case, add `audio_tokenizer` dict which points to model weights and path
  655. if "audio_tokenizer" in output:
  656. audio_tokenizer_dict = {
  657. "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
  658. "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
  659. }
  660. output["audio_tokenizer"] = audio_tokenizer_dict
  661. # Serialize attributes as a dict
  662. output = {
  663. k: v.to_dict() if isinstance(v, PushToHubMixin) else v
  664. for k, v in output.items()
  665. if (
  666. k in attrs_to_save # keep all attributes that have to be serialized
  667. and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes with that are objects
  668. )
  669. }
  670. output = cast_array_to_list(output)
  671. output["processor_class"] = self.__class__.__name__
  672. return output
  673. def to_json_string(self) -> str:
  674. """
  675. Serializes this instance to a JSON string.
  676. Returns:
  677. `str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
  678. """
  679. dictionary = self.to_dict()
  680. return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
  681. def to_json_file(self, json_file_path: str | os.PathLike):
  682. """
  683. Save this instance to a JSON file.
  684. Args:
  685. json_file_path (`str` or `os.PathLike`):
  686. Path to the JSON file in which this processor instance's parameters will be saved.
  687. """
  688. with open(json_file_path, "w", encoding="utf-8") as writer:
  689. writer.write(self.to_json_string())
  690. def __repr__(self):
  691. attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.get_attributes()]
  692. attributes_repr = "\n".join(attributes_repr)
  693. return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"
  694. def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
  695. """
  696. Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
  697. can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
  698. <Tip>
  699. This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
  700. [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the
  701. methods above for more information.
  702. </Tip>
  703. Args:
  704. save_directory (`str` or `os.PathLike`):
  705. Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
  706. be created if it does not exist).
  707. push_to_hub (`bool`, *optional*, defaults to `False`):
  708. Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
  709. repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
  710. namespace).
  711. kwargs (`dict[str, Any]`, *optional*):
  712. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
  713. """
  714. os.makedirs(save_directory, exist_ok=True)
  715. if push_to_hub:
  716. commit_message = kwargs.pop("commit_message", None)
  717. repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
  718. repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
  719. files_timestamps = self._get_files_timestamps(save_directory)
  720. # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
  721. # loaded from the Hub.
  722. if self._auto_class is not None:
  723. attrs = [getattr(self, attribute_name) for attribute_name in self.get_attributes()]
  724. configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
  725. configs.append(self)
  726. custom_object_save(self, save_directory, config=configs)
  727. for attribute_name in self.get_attributes():
  728. attribute = getattr(self, attribute_name)
  729. modality = _get_modality_for_attribute(attribute_name)
  730. is_primary = attribute_name == modality
  731. if modality == "tokenizer":
  732. attribute._set_processor_class(self.__class__.__name__)
  733. # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
  734. if is_primary:
  735. attribute.save_pretrained(save_directory)
  736. else:
  737. # if a model has multiple tokenizers, save the additional tokenizers in their own folders.
  738. attribute.save_pretrained(os.path.join(save_directory, attribute_name))
  739. elif attribute._auto_class is not None:
  740. custom_object_save(attribute, save_directory, config=attribute)
  741. if self._auto_class is not None:
  742. # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
  743. for attribute_name in self.get_attributes():
  744. attribute = getattr(self, attribute_name)
  745. if isinstance(attribute, PreTrainedTokenizerBase):
  746. del attribute.init_kwargs["auto_map"]
  747. # If we save using the predefined names, we can load using `from_pretrained`
  748. # plus we save chat_template in its own file
  749. output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
  750. output_chat_template_file_jinja = os.path.join(save_directory, CHAT_TEMPLATE_FILE)
  751. chat_template_dir = os.path.join(save_directory, CHAT_TEMPLATE_DIR)
  752. # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
  753. # to avoid serializing chat template in json config file. So let's get it from `self` directly
  754. if isinstance(self.chat_template, str):
  755. # New format for single templates is to save them as chat_template.jinja
  756. with open(output_chat_template_file_jinja, "w", encoding="utf-8") as f:
  757. f.write(self.chat_template)
  758. logger.info(f"chat template saved in {output_chat_template_file_jinja}")
  759. elif isinstance(self.chat_template, dict):
  760. # New format for multiple templates is to save the default as chat_template.jinja
  761. # and the other templates in the chat_templates/ directory
  762. for template_name, template in self.chat_template.items():
  763. if template_name == "default":
  764. with open(output_chat_template_file_jinja, "w", encoding="utf-8") as f:
  765. f.write(self.chat_template["default"])
  766. logger.info(f"chat template saved in {output_chat_template_file_jinja}")
  767. else:
  768. os.makedirs(chat_template_dir, exist_ok=True)
  769. template_filepath = os.path.join(chat_template_dir, f"{template_name}.jinja")
  770. with open(template_filepath, "w", encoding="utf-8") as f:
  771. f.write(template)
  772. logger.info(f"chat template saved in {template_filepath}")
  773. # Create a unified `preprocessor_config.json` and save all attributes as a composite config, except for tokenizers
  774. self.to_json_file(output_processor_file)
  775. logger.info(f"processor saved in {output_processor_file}")
  776. return_files = [output_processor_file]
  777. if push_to_hub:
  778. self._upload_modified_files(
  779. save_directory,
  780. repo_id,
  781. files_timestamps,
  782. commit_message=commit_message,
  783. token=kwargs.get("token"),
  784. )
  785. return return_files
  786. @classmethod
  787. def get_processor_dict(
  788. cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
  789. ) -> tuple[dict[str, Any], dict[str, Any]]:
  790. """
  791. From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
  792. processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`.
  793. Parameters:
  794. pretrained_model_name_or_path (`str` or `os.PathLike`):
  795. The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
  796. subfolder (`str`, *optional*, defaults to `""`):
  797. In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
  798. specify the folder name here.
  799. Returns:
  800. `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object.
  801. """
  802. # holding a copy for optionally loading the audio tokenizer (if available)
  803. audio_tokenizer_kwargs = copy.deepcopy(kwargs)
  804. cache_dir = kwargs.pop("cache_dir", None)
  805. force_download = kwargs.pop("force_download", False)
  806. proxies = kwargs.pop("proxies", None)
  807. token = kwargs.pop("token", None)
  808. local_files_only = kwargs.pop("local_files_only", False)
  809. revision = kwargs.pop("revision", None)
  810. subfolder = kwargs.pop("subfolder", "")
  811. from_pipeline = kwargs.pop("_from_pipeline", None)
  812. from_auto_class = kwargs.pop("_from_auto", False)
  813. user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
  814. if from_pipeline is not None:
  815. user_agent["using_pipeline"] = from_pipeline
  816. if is_offline_mode() and not local_files_only:
  817. logger.info("Offline mode: forcing local_files_only=True")
  818. local_files_only = True
  819. pretrained_model_name_or_path = str(pretrained_model_name_or_path)
  820. is_local = os.path.isdir(pretrained_model_name_or_path)
  821. if os.path.isdir(pretrained_model_name_or_path):
  822. processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME)
  823. additional_chat_template_files = {}
  824. resolved_additional_chat_template_files = {}
  825. if os.path.isfile(pretrained_model_name_or_path):
  826. resolved_processor_file = pretrained_model_name_or_path
  827. # can't load chat-template and audio tokenizer when given a file as pretrained_model_name_or_path
  828. resolved_chat_template_file = None
  829. resolved_raw_chat_template_file = None
  830. resolved_audio_tokenizer_file = None
  831. is_local = True
  832. else:
  833. if is_local:
  834. template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR)
  835. if template_dir.is_dir():
  836. for template_file in template_dir.glob("*.jinja"):
  837. template_name = template_file.stem
  838. additional_chat_template_files[template_name] = f"{CHAT_TEMPLATE_DIR}/{template_file.name}"
  839. else:
  840. try:
  841. for template in list_repo_templates(
  842. pretrained_model_name_or_path,
  843. local_files_only=local_files_only,
  844. revision=revision,
  845. cache_dir=cache_dir,
  846. token=token,
  847. ):
  848. template = template.removesuffix(".jinja")
  849. additional_chat_template_files[template] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"
  850. except EntryNotFoundError:
  851. pass # No template dir means no template files
  852. processor_file = PROCESSOR_NAME
  853. try:
  854. # Load from local folder or from cache or download from model Hub and cache
  855. resolved_processor_file = cached_file(
  856. pretrained_model_name_or_path,
  857. processor_file,
  858. cache_dir=cache_dir,
  859. force_download=force_download,
  860. proxies=proxies,
  861. local_files_only=local_files_only,
  862. token=token,
  863. user_agent=user_agent,
  864. revision=revision,
  865. subfolder=subfolder,
  866. _raise_exceptions_for_missing_entries=False,
  867. )
  868. # chat_template.json is a legacy file used by the processor class
  869. # a raw chat_template.jinja is preferred in future
  870. resolved_chat_template_file = cached_file(
  871. pretrained_model_name_or_path,
  872. LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE,
  873. cache_dir=cache_dir,
  874. force_download=force_download,
  875. proxies=proxies,
  876. local_files_only=local_files_only,
  877. token=token,
  878. user_agent=user_agent,
  879. revision=revision,
  880. subfolder=subfolder,
  881. _raise_exceptions_for_missing_entries=False,
  882. )
  883. resolved_raw_chat_template_file = cached_file(
  884. pretrained_model_name_or_path,
  885. CHAT_TEMPLATE_FILE,
  886. cache_dir=cache_dir,
  887. force_download=force_download,
  888. proxies=proxies,
  889. local_files_only=local_files_only,
  890. token=token,
  891. user_agent=user_agent,
  892. revision=revision,
  893. subfolder=subfolder,
  894. _raise_exceptions_for_missing_entries=False,
  895. )
  896. resolved_additional_chat_template_files = {
  897. template_name: cached_file(
  898. pretrained_model_name_or_path,
  899. template_file,
  900. cache_dir=cache_dir,
  901. force_download=force_download,
  902. proxies=proxies,
  903. local_files_only=local_files_only,
  904. token=token,
  905. user_agent=user_agent,
  906. revision=revision,
  907. subfolder=subfolder,
  908. _raise_exceptions_for_missing_entries=False,
  909. )
  910. for template_name, template_file in additional_chat_template_files.items()
  911. }
  912. resolved_audio_tokenizer_file = cached_file(
  913. pretrained_model_name_or_path,
  914. AUDIO_TOKENIZER_NAME,
  915. cache_dir=cache_dir,
  916. force_download=force_download,
  917. proxies=proxies,
  918. local_files_only=local_files_only,
  919. token=token,
  920. user_agent=user_agent,
  921. revision=revision,
  922. subfolder=subfolder,
  923. _raise_exceptions_for_missing_entries=False,
  924. )
  925. except OSError:
  926. # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
  927. # the original exception.
  928. raise
  929. except Exception:
  930. # For any other exception, we throw a generic error.
  931. raise OSError(
  932. f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
  933. " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
  934. f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
  935. f" directory containing a {PROCESSOR_NAME} file"
  936. )
  937. # Add chat template as kwarg before returning because most models don't have processor config
  938. if resolved_chat_template_file is not None:
  939. # This is the legacy path
  940. with open(resolved_chat_template_file, encoding="utf-8") as reader:
  941. chat_template_json = json.loads(reader.read())
  942. chat_templates = {"default": chat_template_json["chat_template"]}
  943. if resolved_additional_chat_template_files:
  944. raise ValueError(
  945. "Cannot load chat template due to conflicting files - this checkpoint combines "
  946. "a legacy chat_template.json file with separate template files, which is not "
  947. "supported. To resolve this error, replace the legacy chat_template.json file "
  948. "with a modern chat_template.jinja file."
  949. )
  950. else:
  951. chat_templates = {
  952. template_name: open(template_file, "r", encoding="utf-8").read()
  953. for template_name, template_file in resolved_additional_chat_template_files.items()
  954. }
  955. if resolved_raw_chat_template_file is not None:
  956. with open(resolved_raw_chat_template_file, "r", encoding="utf-8") as reader:
  957. chat_templates["default"] = reader.read()
  958. if isinstance(chat_templates, dict) and "default" in chat_templates and len(chat_templates) == 1:
  959. chat_templates = chat_templates["default"] # Flatten when we just have a single template/file
  960. # Existing processors on the Hub created before #27761 being merged don't have `processor_config.json` (if not
  961. # updated afterward), and we need to keep `from_pretrained` work. So here it fallbacks to the empty dict.
  962. # (`cached_file` called using `_raise_exceptions_for_missing_entries=False` to avoid exception)
  963. # However, for models added in the future, we won't get the expected error if this file is missing.
  964. if resolved_processor_file is None:
  965. # In any case we need to pass `chat_template` if it is available
  966. processor_dict = {}
  967. else:
  968. try:
  969. # Load processor dict
  970. with open(resolved_processor_file, encoding="utf-8") as reader:
  971. text = reader.read()
  972. processor_dict = json.loads(text)
  973. except json.JSONDecodeError:
  974. raise OSError(
  975. f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
  976. )
  977. if is_local:
  978. logger.info(f"loading configuration file {resolved_processor_file}")
  979. else:
  980. logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}")
  981. if processor_dict.get("chat_template") is not None:
  982. logger.warning_once(
  983. "Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' "
  984. "in the processor's config. Make sure to move your template to its own file."
  985. )
  986. elif chat_templates:
  987. processor_dict["chat_template"] = chat_templates
  988. # Audio tokenizer needs to load the model checkpoint first, because the saved
  989. # json file contains only references to the model path and repo id
  990. if resolved_audio_tokenizer_file is not None or "audio_tokenizer" in processor_dict:
  991. if resolved_audio_tokenizer_file is not None:
  992. reader = open(resolved_audio_tokenizer_file, "r", encoding="utf-8")
  993. audio_tokenizer_dict = reader.read()
  994. audio_tokenizer_dict = json.loads(audio_tokenizer_dict)
  995. else:
  996. audio_tokenizer_dict = processor_dict["audio_tokenizer"]
  997. audio_tokenizer_class = cls.get_possibly_dynamic_module(audio_tokenizer_dict["audio_tokenizer_class"])
  998. audio_tokenizer_path = audio_tokenizer_dict["audio_tokenizer_name_or_path"]
  999. processor_dict["audio_tokenizer"] = audio_tokenizer_class.from_pretrained(
  1000. audio_tokenizer_path, **audio_tokenizer_kwargs
  1001. )
  1002. return processor_dict, kwargs
  1003. @classmethod
  1004. def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs):
  1005. """
  1006. Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.
  1007. Args:
  1008. processor_dict (`dict[str, Any]`):
  1009. Dictionary that will be used to instantiate the processor object. Such a dictionary can be
  1010. retrieved from a pretrained checkpoint by leveraging the
  1011. [`~processing_utils.ProcessingMixin.to_dict`] method.
  1012. kwargs (`dict[str, Any]`):
  1013. Additional parameters from which to initialize the processor object.
  1014. Returns:
  1015. [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
  1016. parameters.
  1017. """
  1018. processor_dict = processor_dict.copy()
  1019. return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
  1020. # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
  1021. # If we don't pop, some specific kwargs will raise a warning or error
  1022. for unused_kwarg in cls.get_attributes() + ["auto_map", "processor_class"]:
  1023. processor_dict.pop(unused_kwarg, None)
  1024. # override processor_dict with given kwargs
  1025. processor_dict.update(kwargs)
  1026. # check if there is an overlap between args and processor_dict
  1027. accepted_args_and_kwargs = cls.__init__.__code__.co_varnames[: cls.__init__.__code__.co_argcount][1:]
  1028. # validate both processor_dict and given kwargs
  1029. unused_kwargs, valid_kwargs = cls.validate_init_kwargs(
  1030. processor_config=processor_dict, valid_kwargs=accepted_args_and_kwargs
  1031. )
  1032. # update args that are already in processor_dict to avoid duplicate arguments
  1033. args_to_update = {
  1034. i: valid_kwargs.pop(arg)
  1035. for i, arg in enumerate(accepted_args_and_kwargs)
  1036. if (arg in valid_kwargs and i < len(args))
  1037. }
  1038. args = [args_to_update.get(i, arg) for i, arg in enumerate(args)]
  1039. # instantiate processor with used (and valid) kwargs only
  1040. processor = cls(*args, **valid_kwargs)
  1041. logger.info(f"Processor {processor}")
  1042. if return_unused_kwargs:
  1043. return processor, unused_kwargs
  1044. else:
  1045. return processor
  def _merge_kwargs(
      self,
      ModelProcessorKwargs: ProcessingKwargs,
      tokenizer_init_kwargs: dict | None = None,
      **kwargs,
  ) -> dict[str, dict]:
      """
      Merge the kwargs of a processor call into one dictionary per modality.

      The order of operations is as follows:

      1) kwargs passed as before (flat) have highest priority to preserve BC.
          ```python
          high_priority_kwargs = {"crop_size": {"height": 222, "width": 222}, "padding": "max_length"}
          processor(..., **high_priority_kwargs)
          ```
      2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API.
          ```python
          processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}})
          ```
      3) kwargs passed during instantiation of a modality processor have third priority.
          ```python
          tokenizer = tokenizer_class(..., {"padding": "max_length"})
          image_processor = image_processor_class(...)
          processor(tokenizer, image_processor) # will pass max_length unless overridden by kwargs at call
          ```
      4) defaults kwargs specified at processor level have lowest priority.
          ```python
          class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False):
              _defaults = {
                  "text_kwargs": {
                      "padding": "max_length",
                      "max_length": 64,
                  },
              }
          ```

      Note that a key cannot be passed both flat and inside a modality dictionary, and that flat kwargs are
      not consulted for a modality whose dictionary (e.g. `text_kwargs`) was passed explicitly.

      Args:
          ModelProcessorKwargs (`ProcessingKwargs`):
              Typed dictionary of kwargs specifically required by the model passed.
          tokenizer_init_kwargs (`Dict`, *optional*):
              Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults.
          **kwargs:
              The kwargs received by the processor call, either flat or grouped under `text_kwargs`,
              `images_kwargs`, `audio_kwargs`, `videos_kwargs` and `common_kwargs`. They are deep-copied
              before being consumed, so the caller's objects are left untouched.

      Returns:
          output_kwargs (`Dict`):
              Dictionary of per-modality kwargs to be passed to each modality-specific processor.

      Raises:
          ValueError: If the same key is passed both inside a modality dictionary and as a flat kwarg.
      """
      # Private sentinel marking "not provided". It is compared by identity, which stays safe for values
      # (arrays, tensors, ...) that overload `!=` and would make a comparison against a string ambiguous.
      missing = object()

      # holding a copy to avoid mutating user-provided arguments
      # Use deepcopy to also copy nested dicts (like videos_kwargs) that will be modified via pop()
      kwargs = copy.deepcopy(kwargs)

      map_preprocessor_kwargs = {
          "text_kwargs": "tokenizer",
          "images_kwargs": "image_processor",
          "audio_kwargs": "feature_extractor",
          "videos_kwargs": "video_processor",
      }
      possible_modality_keywords = {"text", "audio", "videos", "images"}
      used_keys = set()

      def valid_keys_for(modality):
          # Some preprocessors define a set of accepted "valid_kwargs" (currently only vision).
          # In those cases, we don't declare a `ModalityKwargs` attribute in the TypedDict.
          # Instead, we dynamically obtain the kwargs from the preprocessor and merge them
          # with the general kwargs set. This ensures consistency between preprocessor and
          # processor classes, and helps prevent accidental mismatches.
          valid_keys = set(ModelProcessorKwargs.__annotations__[modality].__annotations__)
          preprocessor = getattr(self, map_preprocessor_kwargs[modality], None)
          preprocessor_valid_kwargs = (
              getattr(preprocessor, "valid_kwargs", None) if preprocessor is not None else None
          )
          if preprocessor_valid_kwargs is not None:
              valid_keys.update(preprocessor_valid_kwargs.__annotations__)
          return valid_keys

      valid_keys_by_modality = {modality: valid_keys_for(modality) for modality in map_preprocessor_kwargs}

      # get defaults from set model processor kwargs if they exist, then let the values the tokenizer
      # was instantiated with take precedence over them
      output_kwargs = {}
      for modality in map_preprocessor_kwargs:
          modality_defaults = ModelProcessorKwargs._defaults.get(modality, {}).copy()
          if tokenizer_init_kwargs is not None:
              for modality_key in valid_keys_by_modality[modality]:
                  if modality_key not in tokenizer_init_kwargs:
                      continue
                  # prefer the live tokenizer attribute over the value recorded at init time
                  modality_defaults[modality_key] = (
                      getattr(self.tokenizer, modality_key)
                      if hasattr(self.tokenizer, modality_key)
                      else tokenizer_init_kwargs[modality_key]
                  )
          output_kwargs[modality] = modality_defaults

      # For `common_kwargs` just update all modality-specific kwargs with same key/values.
      # Build a fresh dict: updating the one stored in `_defaults` in place would leak the values of
      # this call into the class-level defaults and therefore into every later call.
      common_kwargs = {
          **ModelProcessorKwargs._defaults.get("common_kwargs", {}),
          **kwargs.get("common_kwargs", {}),
      }
      if common_kwargs:
          for modality_kwargs in output_kwargs.values():
              modality_kwargs.update(common_kwargs)

      # update modality kwargs with passed kwargs
      non_modality_kwargs = set(kwargs) - set(output_kwargs)
      for modality, output_kwarg in output_kwargs.items():
          for modality_key in valid_keys_by_modality[modality]:
              # check if we received a structured kwarg dict or not to handle it correctly
              if modality in kwargs:
                  kwarg_value = kwargs[modality].pop(modality_key, missing)
                  # check if this key was passed as a flat kwarg.
                  if kwarg_value is not missing and modality_key in non_modality_kwargs:
                      raise ValueError(
                          f"Keyword argument {modality_key} was passed two times:\n"
                          f"in a dictionary for {modality} and as a **kwarg."
                      )
              elif modality_key in kwargs:
                  # we get a modality_key instead of popping it because modality-specific processors
                  # can have overlapping kwargs
                  kwarg_value = kwargs[modality_key]
              else:
                  kwarg_value = missing
              if kwarg_value is not missing:
                  output_kwarg[modality_key] = kwarg_value
                  used_keys.add(modality_key)

      # Determine if kwargs is a flat dictionary or contains nested dictionaries
      if any(key in output_kwargs for key in kwargs):
          # kwargs is dictionary-based, and some keys match modality names: forward whatever is left
          # in the modality dictionaries (keys that are not declared as valid for that modality)
          for modality, subdict in kwargs.items():
              if modality in output_kwargs:
                  for subkey, subvalue in subdict.items():
                      if subkey not in used_keys:
                          output_kwargs[modality][subkey] = subvalue
                          used_keys.add(subkey)
      else:
          # kwargs is a flat dictionary
          for key in kwargs:
              if key not in used_keys and key not in possible_modality_keywords:
                  logger.warning_once(
                      f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
                  )

      # Validate the merged kwargs of every modality whose preprocessor declares `valid_kwargs`
      for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items():
          if key not in map_preprocessor_kwargs:
              continue
          preprocessor = getattr(self, map_preprocessor_kwargs[key], None)
          if preprocessor is None or getattr(preprocessor, "valid_kwargs", None) is None:
              continue
          preprocessor_typed_dict_obj = getattr(preprocessor, "valid_kwargs")
          typed_dict_obj = TypedDict(
              "merged_typed_dict",
              {**preprocessor_typed_dict_obj.__annotations__, **typed_dict_obj.__annotations__},
              total=False,
          )
          validate_typed_dict(typed_dict_obj, output_kwargs[key])
      return output_kwargs
  @classmethod
  def from_pretrained(
      cls: type[SpecificProcessorType],
      pretrained_model_name_or_path: str | os.PathLike,
      cache_dir: str | os.PathLike | None = None,
      force_download: bool = False,
      local_files_only: bool = False,
      token: str | bool | None = None,
      revision: str = "main",
      **kwargs,
  ) -> SpecificProcessorType:
      r"""
      Instantiate a processor associated with a pretrained model.

      <Tip>

      The processor configuration is read with `get_processor_dict`, the sub-processors are loaded with
      `_get_arguments_from_pretrained` and both are combined by `from_args_and_dict`. Please refer to the
      docstrings of [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`],
      [`~image_processing_utils.ImageProcessingMixin`] and
      [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] for more information.

      </Tip>

      Args:
          pretrained_model_name_or_path (`str` or `os.PathLike`):
              This can be either:

              - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
                huggingface.co.
              - a path to a *directory* containing a feature extractor file saved using the
                [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
              - a path to a saved feature extractor JSON *file*, e.g.,
                `./my_model_directory/preprocessor_config.json`.
          cache_dir, force_download, local_files_only, revision:
              Always forwarded, under the same names, together with `**kwargs`.
          token (`str` or `bool`, *optional*):
              Forwarded as `token` only when it is not `None`.
          **kwargs
              Additional keyword arguments passed along to both
              [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
              [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
      """
      kwargs.update(
          cache_dir=cache_dir,
          force_download=force_download,
          local_files_only=local_files_only,
          revision=revision,
      )
      if token is not None:
          kwargs["token"] = token

      # The processor dict is fetched first: it is needed to instantiate non-tokenizer sub-processors
      processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
      sub_processors = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs)
      return cls.from_args_and_dict(sub_processors, processor_dict, **instantiation_kwargs)
  @classmethod
  def get_attributes(cls):
      """
      Return the names of the sub-processor attributes of this processor class.

      The names are the `__init__` parameters that contain one of the keys of
      `MODALITY_TO_AUTOPROCESSOR_MAPPING`, `audio_tokenizer` excluded. When none is found, they are inferred
      from the non-`None` `<attribute>_class` entries declared directly on the class.
      """

      def names_a_modality(candidate):
          return any(modality in candidate for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys())

      attributes = [
          parameter_name
          for parameter_name in inspect.signature(cls.__init__).parameters.keys()
          # don't treat audio_tokenizer as an attribute
          if parameter_name != "audio_tokenizer" and names_a_modality(parameter_name)
      ]
      if attributes:
          return attributes

      # Legacy processors may not override `__init__` and instead expose modality
      # attributes via `<attribute>_class`. In that case, the signature only exposes
      # `*args`/`**kwargs`, so we need to infer the attributes from those class-level
      # hints to keep backward compatibility (e.g. dynamic processors stored on the Hub).
      for attribute_name, value in cls.__dict__.items():
          if value is None or not attribute_name.endswith("_class"):
              continue
          inferred_attribute = attribute_name.removesuffix("_class")
          if inferred_attribute == "audio_tokenizer":
              continue
          if names_a_modality(inferred_attribute):
              attributes.append(inferred_attribute)
      return attributes
  @classmethod
  def register_for_auto_class(cls, auto_class="AutoProcessor"):
      """
      Register this class with a given auto class. This should only be used for custom processors as the ones
      in the library are already mapped with `AutoProcessor`.

      Args:
          auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`):
              The auto class to register this new processor with, given either by name or as the class itself.

      Raises:
          ValueError: If `transformers.models.auto` exposes no attribute with that name.
      """
      auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

      import transformers.models.auto as auto_module

      if hasattr(auto_module, auto_class_name):
          cls._auto_class = auto_class_name
          return
      raise ValueError(f"{auto_class_name} is not a valid auto class.")
  @classmethod
  def _load_tokenizer_from_pretrained(
      cls, sub_processor_type, pretrained_model_name_or_path, subfolder="", **kwargs
  ):
      """
      Load a tokenizer sub-processor through the class registered for `"tokenizer"` in
      `MODALITY_TO_AUTOPROCESSOR_MAPPING`.

      The primary tokenizer (`sub_processor_type == "tokenizer"`) is loaded from `subfolder` as given. Any
      additional tokenizer (e.g. `"decoder_tokenizer"`) is loaded from a subfolder named after its attribute,
      nested under `subfolder` when one is provided.
      """
      tokenizer_loader = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
      if sub_processor_type != "tokenizer":
          # Additional tokenizer: it lives in its own subfolder
          subfolder = os.path.join(subfolder, sub_processor_type) if subfolder else sub_processor_type
      return tokenizer_loader.from_pretrained(pretrained_model_name_or_path, subfolder=subfolder, **kwargs)
  @classmethod
  def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
      """
      Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers,
      and feature extractors. The sub-processor names come from `get_attributes()`.

      For tokenizers: Uses the appropriate Auto class (AutoTokenizer) to load via `.from_pretrained()`.
      Additional tokenizers (e.g., "decoder_tokenizer") are loaded from subfolders.

      For other sub-processors (image_processor, feature_extractor, etc.): Primary ones are loaded via
      Auto class, or via the class named by a non-`None` legacy `<attribute>_class` class attribute.
      Additional ones are instantiated from the config stored in processor_config.json
      (passed as processor_dict).

      Args:
          pretrained_model_name_or_path: Path or model id to load from.
          processor_dict: Optional dict containing processor config (from processor_config.json).
              Required when loading additional non-tokenizer sub-processors.
          **kwargs: Forwarded to every `from_pretrained` call. `subfolder` is extracted and passed explicitly.

      Returns:
          `list`: The instantiated sub-processors, in the order given by `get_attributes()`.

      Raises:
          ValueError: If an additional non-tokenizer sub-processor has no entry in `processor_dict`, if that
              entry is not a dict, or if it lacks the `<modality>_type` key naming the class to instantiate.
      """
      args = []
      processor_dict = processor_dict if processor_dict is not None else {}
      # Remove subfolder from kwargs to avoid duplicate keyword arguments
      subfolder = kwargs.pop("subfolder", "")

      for sub_processor_type in cls.get_attributes():
          modality = _get_modality_for_attribute(sub_processor_type)
          is_primary = sub_processor_type == modality

          if "tokenizer" in sub_processor_type:
              # This is only necessary for the checkpoint in test_processing_mistral3.py which has no config.json and
              # the tokenizer_config.json references LlamaTokenizerFast. TODO: update the config on the hub.
              if "PixtralProcessor" in cls.__name__:
                  from .tokenization_utils_tokenizers import TokenizersBackend

                  tokenizer = TokenizersBackend.from_pretrained(
                      pretrained_model_name_or_path, subfolder=subfolder, **kwargs
                  )
              else:
                  tokenizer = cls._load_tokenizer_from_pretrained(
                      sub_processor_type, pretrained_model_name_or_path, subfolder=subfolder, **kwargs
                  )
              args.append(tokenizer)
              continue

          if is_primary:
              # Primary non-tokenizer sub-processor: load via Auto class
              auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type]
              # For backward compatibility, check if sub-processor class name is hardcoded as an attribute of
              # the processor class. An attribute explicitly set to `None` is treated as not defined (as in
              # `get_attributes`) instead of being handed to the class lookup, which only accepts names.
              sub_processor_class_name = getattr(cls, f"{sub_processor_type}_class", None)
              if sub_processor_class_name is not None:
                  logger.warning_once(
                      f"`{cls.__name__}` defines `{sub_processor_type}_class = '{sub_processor_class_name}'`, "
                      f"which is deprecated. Register the correct mapping in `{auto_processor_class.__name__}` instead.",
                  )
                  auto_processor_class = cls.get_possibly_dynamic_module(sub_processor_class_name)
              args.append(
                  auto_processor_class.from_pretrained(pretrained_model_name_or_path, subfolder=subfolder, **kwargs)
              )
              continue

          # Additional non-tokenizer sub-processor: instantiate from config in processor_dict
          if sub_processor_type not in processor_dict:
              raise ValueError(
                  f"Cannot find config for {sub_processor_type} in processor_config.json. "
                  f"Available keys: {list(processor_dict.keys())}"
              )
          sub_processor_config = processor_dict[sub_processor_type]
          if not isinstance(sub_processor_config, dict):
              raise ValueError(
                  f"Expected dict for {sub_processor_type} in processor_config.json, "
                  f"got {type(sub_processor_config)}"
              )
          # Determine the class to instantiate
          # Image processors have 'image_processor_type', feature extractors have 'feature_extractor_type'
          type_key = f"{modality}_type"
          class_name = sub_processor_config.get(type_key)
          if class_name is None:
              raise ValueError(
                  f"Cannot instantiate {sub_processor_type}: missing '{type_key}' in config. "
                  f"Config keys: {list(sub_processor_config.keys())}"
              )
          processor_class = cls.get_possibly_dynamic_module(class_name)
          args.append(processor_class(**sub_processor_config))
      return args
  @staticmethod
  def get_possibly_dynamic_module(module_name):
      """
      Resolve a class from its name.

      The name is first looked up as an attribute of `transformers_module`. Otherwise the classes stored in
      the `_extra_content` of the image processor, video processor, tokenizer, feature extractor and audio
      tokenization mappings are searched for one whose `__name__` matches.

      Raises:
          ValueError: If no class with that name is found.
      """
      try:
          return getattr(transformers_module, module_name)
      except AttributeError:
          pass

      registries = (
          transformers_module.IMAGE_PROCESSOR_MAPPING,
          transformers_module.VIDEO_PROCESSOR_MAPPING,
          transformers_module.TOKENIZER_MAPPING,
          transformers_module.FEATURE_EXTRACTOR_MAPPING,
          transformers_module.MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
      )
      for registry in registries:
          for registered in registry._extra_content.values():
              # An entry is either a single class or a tuple of classes, any of which may be None
              candidates = registered if isinstance(registered, tuple) else (registered,)
              for candidate in candidates:
                  if candidate is not None and candidate.__name__ == module_name:
                      return candidate

      raise ValueError(
          f"Could not find module {module_name} in `transformers`. If this is a custom class, "
          f"it should be registered using the relevant `AutoClass.register()` function so that "
          f"other functions can find it!"
      )
  def batch_decode(self, *args, **kwargs):
      """
      Forward all arguments to the `batch_decode` method of this processor's tokenizer
      ([`~PreTrainedTokenizer.batch_decode`]); refer to the docstring of that method for more information.

      Raises:
          ValueError: If the processor has no `tokenizer` attribute.
      """
      if hasattr(self, "tokenizer"):
          return self.tokenizer.batch_decode(*args, **kwargs)
      raise ValueError(f"Cannot batch decode text: {self.__class__.__name__} has no tokenizer.")
  def decode(self, *args, **kwargs):
      """
      Forward all arguments to the `decode` method of this processor's tokenizer
      ([`~PreTrainedTokenizer.decode`]); refer to the docstring of that method for more information.

      Raises:
          ValueError: If the processor has no `tokenizer` attribute.
      """
      if hasattr(self, "tokenizer"):
          return self.tokenizer.decode(*args, **kwargs)
      raise ValueError(f"Cannot decode text: {self.__class__.__name__} has no tokenizer.")
  @property
  def model_input_names(self):
      """
      Concatenation of the `model_input_names` of every sub-processor listed by `get_attributes()`, in that
      order. Sub-processors that are missing or set to `None` (optional components) are skipped.
      """
      model_input_names = []
      for attribute_name in self.get_attributes():
          attribute = getattr(self, attribute_name, None)
          if attribute is None:
              # Optional sub-processor that was not provided: it contributes no input names. Dereferencing
              # it would raise an `AttributeError` from inside the property.
              continue
          model_input_names.extend(attribute.model_input_names)
      return model_input_names
  1440. @staticmethod
  1441. def validate_init_kwargs(processor_config, valid_kwargs):
  1442. kwargs_from_config = set(processor_config.keys())
  1443. valid_kwargs_set = set(valid_kwargs)
  1444. unused_keys = kwargs_from_config - valid_kwargs_set
  1445. valid_keys = kwargs_from_config & valid_kwargs_set
  1446. unused_kwargs = {k: processor_config[k] for k in unused_keys} if unused_keys else {}
  1447. valid_kwargs = {k: processor_config[k] for k in valid_keys} if valid_keys else {}
  1448. return unused_kwargs, valid_kwargs
  def create_mm_token_type_ids(self, input_ids: list) -> list[list[int]]:
      """
      Build, for every sequence of `input_ids`, a list of the same length holding `1` where the token is in
      `self.image_ids`, `2` where it is in `self.video_ids`, `3` where it is in `self.audio_ids` and `0`
      elsewhere. A token present in several of these collections gets the value of the last match.
      """
      # Sequences are handled one by one because the inputs might be non-padded lists, which cannot be
      # stacked into a single array. Each of them is cast to numpy for faster indexing.
      all_token_types = []
      for sequence in input_ids:
          sequence_array = np.array(sequence)
          token_types = np.zeros_like(sequence_array)
          for type_id, modality_ids in ((1, self.image_ids), (2, self.video_ids), (3, self.audio_ids)):
              token_types[np.isin(sequence_array, modality_ids)] = type_id
          all_token_types.append(token_types.tolist())
      return all_token_types
  def apply_chat_template(
      self,
      conversation: list[dict[str, str]] | list[list[dict[str, str]]],
      chat_template: str | None = None,
      tools: list[dict] | None = None,
      documents: list[dict[str, str]] | None = None,
      add_generation_prompt: bool = False,
      continue_final_message: bool = False,
      return_assistant_tokens_mask: bool = False,
      tokenize: bool = False,
      return_tensors: str | TensorType | None = None,
      return_dict: bool = False,
      load_audio_from_video: bool = False,
      processor_kwargs: dict | None = None,
      **kwargs,
  ) -> str:
      """
      Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
      conversations to turn them into a single tokenizable string.

      The input is expected to be in the following format, where each message content is a list consisting of text and
      optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
      `pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.

      conversation = [
          {
              "role": "user",
              "content": [
                  {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
                  {"type": "text", "text": "Please describe this image in detail."},
              ],
          },
      ]

      OpenAI-style `{"type": "image_url", "image_url": {"url": ...}}` content blocks are rewritten to
      `{"type": "image", "url": ...}`. Note that this replaces the `content` of the passed messages in place.

      Args:
          conversation (`Union[list[Dict, [str, str]], list[list[dict[str, str]]]]`):
              The conversation to format. It is treated as a batch of conversations when its first element is
              a list/tuple or has a `content` attribute.
          chat_template (`Optional[str]`, *optional*):
              The Jinja template to use for formatting the conversation, or the name of one of the processor's
              named templates. If not provided, the processor's chat template is used (its `"default"` entry
              when several templates are stored).
          tools (`list[dict]`, *optional*):
              Forwarded to the template rendering.
          documents (`list[dict[str, str]]`, *optional*):
              Forwarded to the template rendering.
          add_generation_prompt (`bool`, *optional*, defaults to `False`):
              Forwarded to the template rendering. Not compatible with `continue_final_message`.
          continue_final_message (`bool`, *optional*, defaults to `False`):
              Forwarded to the template rendering. Not compatible with `add_generation_prompt` nor with
              `return_assistant_tokens_mask`.
          return_assistant_tokens_mask (`bool`, *optional*, defaults to `False`):
              Whether to add an `assistant_masks` entry to the output. Requires a fast tokenizer and only has
              an effect together with `tokenize=True` and `return_dict=True`.
          tokenize (`bool`, *optional*, defaults to `False`):
              Whether to call the processor on the rendered prompt and the media found in the conversation.
          return_tensors (`str` or `TensorType`, *optional*):
              Passed to the processor as `return_tensors` when set.
          return_dict (`bool`, *optional*, defaults to `False`):
              With `tokenize=True`, whether to return the whole processor output instead of its `input_ids`.
          load_audio_from_video (`bool`, *optional*, defaults to `False`):
              Whether to load the audio inputs from the video entries of the conversation instead of from its
              audio entries.
          processor_kwargs (`dict`, *optional*):
              Keyword arguments for the processor call. The dictionary is copied, never modified.
          **kwargs:
              Variables for the template. Keys that the template does not use are treated as processor kwargs
              (with a warning) and take precedence over `processor_kwargs` on conflicts.

      Returns:
          The rendered prompt (a list of prompts for batched input) when `tokenize=False`; otherwise the
          processor output when `return_dict=True`, or its `input_ids` when `return_dict=False`.

      Raises:
          ValueError: If no chat template can be resolved, if incompatible options are combined, or if
              `return_assistant_tokens_mask` is requested with a slow tokenizer.
      """
      # Work on a copy: options are injected below and the caller's dictionary must not be modified
      processor_kwargs = dict(processor_kwargs) if processor_kwargs else {}

      if chat_template is None:
          if isinstance(self.chat_template, dict) and "default" in self.chat_template:
              chat_template = self.chat_template["default"]
          elif isinstance(self.chat_template, dict):
              raise ValueError(
                  'The processor has multiple chat templates but none of them are named "default". You need to specify'
                  " which one to use by passing the `chat_template` argument. Available templates are: "
                  f"{', '.join(self.chat_template.keys())}"
              )
          elif self.chat_template is not None:
              chat_template = self.chat_template
          else:
              raise ValueError(
                  "Cannot use apply_chat_template because this processor does not have a chat template."
              )
      elif isinstance(self.chat_template, dict) and chat_template in self.chat_template:
          # It's the name of a template, not a full template string
          chat_template = self.chat_template[chat_template]
      # Otherwise it's a template string, rendered directly

      # Users might still be passing processing kwargs in `**kwargs` so we need to filter
      # out additional kwargs that the template expects via Jinja2 template introspection
      template_kwargs = _get_template_variables(chat_template)
      processor_kwargs_from_kwargs = {k: v for k, v in kwargs.items() if k not in template_kwargs}
      if processor_kwargs_from_kwargs:
          logger.warning(
              "Kwargs passed to `processor.__call__` have to be in `processor_kwargs` dict, not in `**kwargs`"
          )
          # Merge rather than replace, so that an explicit `processor_kwargs` is not silently dropped.
          # Values from `**kwargs` still win on conflicts, which keeps the previous result for those keys.
          processor_kwargs = {**processor_kwargs, **processor_kwargs_from_kwargs}

      # Check if tokenizer is fast - use backend attribute if available, otherwise fall back to class name
      is_tokenizers_fast = False
      if hasattr(self, "tokenizer"):
          if hasattr(self.tokenizer, "backend"):
              is_tokenizers_fast = self.tokenizer.backend == "tokenizers"
          else:
              # Fallback to class name check
              is_tokenizers_fast = self.tokenizer.__class__.__name__.endswith("Fast")

      if continue_final_message:
          if add_generation_prompt:
              raise ValueError(
                  "continue_final_message and add_generation_prompt are not compatible. Use continue_final_message when you want the model to continue the final message, and add_generation_prompt when you want to add a header that will prompt it to start a new assistant message instead."
              )
          if return_assistant_tokens_mask:
              raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.")

      if return_assistant_tokens_mask:
          if not is_tokenizers_fast:
              raise ValueError(
                  "`return_assistant_tokens_mask` is not possible with slow tokenizers. Make sure you have `tokenizers` installed. "
                  "If the error persists, open an issue to support a Fast tokenizer for your model."
              )
          # force offset mapping so we can infer token boundaries
          processor_kwargs["return_offsets_mapping"] = True

      # Set the sampling rate to load the audio files if user hasn't already passed with `kwargs`
      sampling_rate = kwargs.get("sampling_rate", processor_kwargs.get("sampling_rate"))
      if sampling_rate is None:
          if hasattr(self, "feature_extractor") and hasattr(self.feature_extractor, "sampling_rate"):
              sampling_rate = self.feature_extractor.sampling_rate
          else:
              sampling_rate = 16_000

      if isinstance(conversation, (list, tuple)) and (
          isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
      ):
          is_batched = True
          conversations = conversation
      else:
          is_batched = False
          conversations = [conversation]

      # Normalize OpenAI-style "image_url" content blocks to HuggingFace-style "image" blocks
      # OpenAI format: {"type": "image_url", "image_url": {"url": "..."}}
      # HuggingFace format: {"type": "image", "url": "..."}
      for single_conversation in conversations:
          for message in single_conversation:
              if not isinstance(message.get("content"), list):
                  continue
              new_content = []
              for content in message["content"]:
                  if isinstance(content, dict) and content.get("type") == "image_url" and "image_url" in content:
                      image_url_info = content["image_url"]
                      url = image_url_info.get("url", "") if isinstance(image_url_info, dict) else image_url_info
                      new_content.append({"type": "image", "url": url})
                  else:
                      new_content.append(content)
              message["content"] = new_content

      if tokenize:
          batch_images, batch_videos = [], []
          batch_audios = []
          for single_conversation in conversations:
              images, videos = [], []
              for message in single_conversation:
                  visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
                  audio_fnames = [
                      content[key]
                      for content in message["content"]
                      for key in ["audio", "url", "path"]
                      if key in content and content["type"] == "audio"
                  ]
                  image_fnames = [
                      vision_info[key]
                      for vision_info in visuals
                      for key in ["image", "url", "path", "base64"]
                      if key in vision_info and vision_info["type"] == "image"
                  ]
                  images.extend(image_fnames)
                  video_fnames = [
                      vision_info[key]
                      for vision_info in visuals
                      for key in ["video", "url", "path"]
                      if key in vision_info and vision_info["type"] == "video"
                  ]
                  videos.extend(video_fnames)

                  # Audio models do not accept nested list of audios (yet!) so we construct a flat input audio list
                  if not load_audio_from_video:
                      for fname in audio_fnames:
                          batch_audios.append(load_audio(fname, sampling_rate=sampling_rate))
                  else:
                      for fname in video_fnames:
                          # This updates the template in-place and adds audio entry
                          # to ensure `audio` token is added by jinja
                          message["content"].append({"type": "audio"})
                          batch_audios.append(load_audio(fname, sampling_rate=sampling_rate))

              # Currently all processors can accept nested list of batches, but not flat list of visuals
              # So we'll make a batched list of images and let the processor handle it
              batch_images.append(images)
              batch_videos.append(videos)

      # `kwargs` overwrite special tokens if both are present
      template_kwargs = {**self.tokenizer.special_tokens_map, **kwargs}
      prompt, generation_indices = render_jinja_template(
          conversations=conversations,
          tools=tools,
          documents=documents,
          chat_template=chat_template,
          return_assistant_tokens_mask=return_assistant_tokens_mask,
          continue_final_message=continue_final_message,
          add_generation_prompt=add_generation_prompt,
          **template_kwargs,
      )

      if not is_batched:
          prompt = prompt[0]

      if tokenize:
          # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
          # But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
          # and pass it to the processor. Users thus never worried about special tokens relying on processor handling
          # everything internally. The below line is to keep BC for that and be able to work with model that have
          # special tokens in the template (consistent with tokenizers). We dont want to raise warning, it will flood command line
          # without actionable solution for users
          single_prompt = prompt[0] if is_batched else prompt
          if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
              processor_kwargs["add_special_tokens"] = False

          # Frames are sampled by default as soon as `fps`/`num_frames` is passed, unless `do_sample_frames`
          # was set explicitly by users. If users do not pass `num_frames`/`fps`, nothing is set here for BC.
          if "do_sample_frames" not in processor_kwargs and (
              processor_kwargs.get("fps") is not None or processor_kwargs.get("num_frames") is not None
          ):
              processor_kwargs["do_sample_frames"] = True

          # Set only if user passes a non-None value. Otherwise we want to use each processor's own defaults
          if return_tensors:
              processor_kwargs["return_tensors"] = return_tensors

          images_exist = any((im is not None) for im_list in batch_images for im in im_list)
          videos_exist = any((vid is not None) for vid_list in batch_videos for vid in vid_list)
          out = self(
              text=prompt,
              images=batch_images if images_exist else None,
              videos=batch_videos if videos_exist else None,
              audio=batch_audios if batch_audios else None,
              **processor_kwargs,
          )

          if return_dict:
              if return_assistant_tokens_mask:
                  assistant_masks = []
                  offset_mapping = out.pop("offset_mapping")
                  input_ids = out["input_ids"]
                  for i in range(len(input_ids)):
                      current_mask = [0] * len(input_ids[i])
                      offsets = offset_mapping[i]
                      offset_starts = [start for start, end in offsets]
                      # Map every assistant character span onto the tokens whose offsets it covers
                      for assistant_start_char, assistant_end_char in generation_indices[i]:
                          start_pos = bisect.bisect_left(offset_starts, assistant_start_char)
                          end_pos = bisect.bisect_left(offset_starts, assistant_end_char)

                          if not (
                              start_pos >= 0
                              and start_pos < len(offsets)
                              and offsets[start_pos][0] <= assistant_start_char < offsets[start_pos][1]
                          ):
                              # start_token is out of bounds maybe due to truncation.
                              continue
                          # Ensure end_pos is also within bounds
                          if end_pos > len(input_ids[i]):
                              end_pos = len(input_ids[i])
                          for token_id in range(start_pos, end_pos if end_pos else len(input_ids[i])):
                              current_mask[token_id] = 1

                      assistant_masks.append(current_mask)
                  out["assistant_masks"] = assistant_masks
                  out.convert_to_tensors(tensor_type=return_tensors)
              return out
          else:
              return out["input_ids"]
      return prompt
  1702. def parse_response(
  1703. self,
  1704. response: "str | list[str | int | list[int]] | np.ndarray | torch.Tensor",
  1705. schema: list | dict | None = None,
  1706. ):
  1707. """
  1708. Converts an output string created by generating text from a model into a parsed message dictionary.
  1709. This method is intended for use with chat models, and will read the tokenizer's `response_schema` attribute to
  1710. control parsing, although this can be overridden by passing a `response_schema` argument directly.
  1711. Args:
  1712. response (`str`):
  1713. The output string generated by the model. This can be either a decoded string or list of strings,
  1714. or token IDs as a list/array.
  1715. schema (`Union[list, dict]`, *optional*):
  1716. A response schema that indicates the expected output format and how parsing should be performed.
  1717. If not provided, the tokenizer's `response_schema` attribute will be used.
  1718. """
  1719. if not hasattr(self, "tokenizer"):
  1720. raise ValueError("Can't use parse_response on a processor class without a tokenizer!")
  1721. return self.tokenizer.parse_response(response, schema)
  def post_process_multimodal_output(
      self, generated_outputs, skip_special_tokens=True, generation_mode=None, **kwargs
  ):
      """
      Post-process the output of a multimodal model and return the requested modality.

      This implementation only handles text: any `generation_mode` other than `None` or `"text"` raises an
      error, and everything else is forwarded to `post_process_image_text_to_text`.

      Args:
          generated_outputs (`torch.Tensor` or `np.ndarray`):
              The output of the model `generate` function. The output is expected to be a tensor of shape
              `(batch_size, sequence_length)` or `(sequence_length,)`.
          skip_special_tokens (`bool`, *optional*, defaults to `True`):
              Whether or not to remove special tokens in the output. Forwarded to
              `post_process_image_text_to_text`.
          generation_mode (`str`, *optional*):
              The modality to output. Only `"text"` (or `None`) is accepted here.
          **kwargs:
              Additional arguments forwarded unchanged to `post_process_image_text_to_text`.

      Returns:
          The value returned by `post_process_image_text_to_text`.

      Raises:
          ValueError: If `generation_mode` is neither `None` nor `"text"`.
      """
      unsupported_mode = generation_mode is not None and generation_mode != "text"
      if unsupported_mode:
          raise ValueError(
              f"{self.__class__.__name__} got an unexpected generation_mode={generation_mode}. Supported options are only [`text`]"
          )

      # `skip_special_tokens` is a named parameter, so it can never also appear inside `kwargs`.
      forwarded_kwargs = {"skip_special_tokens": skip_special_tokens, **kwargs}
      return self.post_process_image_text_to_text(generated_outputs, **forwarded_kwargs)
  def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs):
      """
      Decode the generated output of a vision-language model into text.

      Args:
          generated_outputs (`torch.Tensor` or `np.ndarray`):
              The output of the model `generate` function. The output is expected to be a tensor of shape
              `(batch_size, sequence_length)` or `(sequence_length,)`.
          skip_special_tokens (`bool`, *optional*, defaults to `True`):
              Whether or not to remove special tokens in the output. Argument passed to the tokenizer's
              `decode` method.
          **kwargs:
              Additional arguments to be passed to the tokenizer's `decode` method.

      Returns:
          The decoded text, exactly as returned by the tokenizer's `decode` method.
      """
      tokenizer = self.tokenizer
      decoded = tokenizer.decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
      return decoded
  1763. def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
  1764. """
  1765. Checks that number of special tokens in text and processed text is same. The count can be different
  1766. if tokenized text was truncated, leading to issues in model code.
  1767. """
  1768. for modality in modalities:
  1769. token_str = getattr(self, f"{modality}_token")
  1770. token_id = getattr(self, f"{modality}_token_id")
  1771. ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
  1772. text_count = [sample.count(token_str) for sample in text]
  1773. if ids_count != text_count:
  1774. raise ValueError(
  1775. f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
  1776. "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
  1777. )
  # Rebind `ProcessorMixin.push_to_hub` to the object returned by `copy_func`, then fill in the
  # `object`, `object_class` and `object_files` format fields of that object's docstring.
  # NOTE(review): presumably the copy exists so the docstring of the original shared function is
  # left untouched - confirm against `copy_func`.
  ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub)
  # Formatting is skipped when there is no docstring to format.
  if ProcessorMixin.push_to_hub.__doc__ is not None:
      ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format(
          object="processor", object_class="AutoProcessor", object_files="processor files"
      )