| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514 |
- # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import enum
- import re
- from typing import Any, Union, overload
- import numpy as np
- from ..audio_utils import AudioInput
- from ..generation import GenerationConfig
- from ..image_utils import ImageInput
- from ..processing_utils import ProcessingKwargs, Unpack
- from ..utils import (
- add_end_docstrings,
- is_torch_available,
- is_vision_available,
- logging,
- requires_backends,
- )
- from ..video_utils import VideoInput
- from .base import Pipeline, build_pipeline_init_args
- if is_torch_available():
- import torch
- from ..models.auto.modeling_auto import MODEL_FOR_MULTIMODAL_LM_MAPPING_NAMES
- from .pt_utils import KeyDataset
- if is_vision_available():
- from PIL import Image
- logger = logging.get_logger(__name__)
- class ReturnType(enum.Enum):
- TENSORS = 0
- NEW_TEXT = 1
- FULL_TEXT = 2
- class Chat:
- """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
- to this format because the rest of the pipeline code tends to assume that lists of messages are
- actually a batch of samples rather than messages in the same conversation."""
- def __init__(self, messages: list[dict]):
- for message in messages:
- if not ("role" in message and "content" in message):
- raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
- self.messages = messages
- @add_end_docstrings(build_pipeline_init_args(has_processor=True))
- class AnyToAnyPipeline(Pipeline):
- """
- Multimodal Generation pipeline using an `AutoModelForMultimodalLM`. This pipeline generates text given any
- combination of multimodal data and text.When the underlying model is a conversational model, it can also
- accept one or more chats, in which case the pipeline will operate in chat mode and will continue the
- chat(s) by adding its response(s). Each chat takes the form of a list of dicts, where each dict contains
- "role" and "content" keys.
- Unless the model you're using explicitly sets these generation parameters in its configuration files
- (`generation_config.json`), the following default values will be used:
- - max_new_tokens: 256
- Example:
- ```python
- >>> from transformers import pipeline
- >>> pipe = pipeline(task="any-to-any", model="google/gemma-3n-E4B-it")
- >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
- [{'generated_text': 'a photo of two birds'}]
- ```
- ```python
- >>> from transformers import pipeline
- >>> pipe = pipeline("any-to-any", model="google/gemma-3n-E4B-it")
- >>> messages = [
- >>> {
- >>> "role": "user",
- >>> "content": [
- >>> {
- >>> "type": "image",
- >>> "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
- >>> },
- >>> {"type": "text", "text": "Describe this image."},
- >>> ],
- >>> },
- >>> {
- >>> "role": "assistant",
- >>> "content": [
- >>> {"type": "text", "text": "There is a dog and"},
- >>> ],
- >>> },
- >>> ]
- >>> pipe(text=messages, max_new_tokens=20, return_full_text=False)
- [{'input_text': [{'role': 'user',
- 'content': [{'type': 'image',
- 'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
- {'type': 'text', 'text': 'Describe this image.'}]},
- {'role': 'assistant',
- 'content': [{'type': 'text', 'text': 'There is a dog and'}]}],
- 'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}]
- ```
- Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
- This multimodal pipeline can currently be loaded from pipeline() using the following task identifier:
- "any-to-any".
- See the list of available models on
- [huggingface.co/models](https://huggingface.co/models?pipeline_tag=any-to-any).
- """
- _load_processor = True
- _load_image_processor = False
- _load_feature_extractor = False
- _load_tokenizer = False
- _pipeline_calls_generate = True
- # Make sure the docstring is updated when the default generation config is changed
- _default_generation_config = GenerationConfig(
- max_new_tokens=256,
- )
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- if "image" in self.model.input_modalities or "video" in self.model.input_modalities:
- requires_backends(self, "vision")
- requires_backends(self, "torchvision")
- if "audio" in self.model.input_modalities:
- requires_backends(self, "librosa")
- self.check_model_type(MODEL_FOR_MULTIMODAL_LM_MAPPING_NAMES)
- def _sanitize_parameters(
- self,
- max_new_tokens=None,
- generate_kwargs=None,
- timeout=None,
- return_full_text=None,
- return_tensors=None,
- return_type=None,
- clean_up_tokenization_spaces=None,
- stop_sequence=None,
- continue_final_message=None,
- skip_special_tokens=None,
- generation_mode=None,
- processor_kwargs=None,
- **kwargs: Unpack[ProcessingKwargs],
- ):
- forward_kwargs = {}
- preprocess_params = {}
- postprocess_params = {}
- # Preprocess params
- preprocess_params.update(kwargs)
- if timeout is not None:
- preprocess_params["timeout"] = timeout
- if continue_final_message is not None:
- preprocess_params["continue_final_message"] = continue_final_message
- if processor_kwargs is not None:
- preprocess_params["processor_kwargs"] = processor_kwargs
- # Forward kwargs
- forward_kwargs["generate_kwargs"] = generate_kwargs or {}
- if generation_mode is not None and generation_mode != "text":
- forward_kwargs["generate_kwargs"]["generation_mode"] = generation_mode
- # Qwen-Omni models need to know the origin of audio, to align mm position ids
- if kwargs.get("load_audio_from_video") and re.search(r"qwen\domni", self.model.__class__.__name__.lower()):
- forward_kwargs["generate_kwargs"]["use_audio_in_video"] = True
- if stop_sequence is not None:
- if isinstance(stop_sequence, str):
- stop_sequence = [stop_sequence]
- forward_kwargs["generate_kwargs"]["stop_strings"] = stop_sequence
- forward_kwargs["generate_kwargs"]["tokenizer"] = self.processor.tokenizer
- if max_new_tokens is not None:
- if generate_kwargs is not None and "max_new_tokens" in generate_kwargs:
- raise ValueError(
- "'max_new_tokens' is defined twice, once in 'generate_kwargs' and "
- "once as a direct argument. Please use only one."
- )
- forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens
- if return_full_text is not None and return_type is None:
- if return_tensors is not None:
- raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
- return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
- elif return_tensors is not None and return_type is None:
- return_type = ReturnType.TENSORS
- # We don't want to set the global default to FULLTEXT at init time. That is why
- # `_postprocess_params` is checked before setting the default value
- elif return_type is None and generation_mode in [None, "text"] and hasattr(self, "_postprocess_params"):
- return_type = ReturnType.FULL_TEXT
- # Postprocess params
- if generation_mode not in [None, "text"] and return_type is not None:
- raise ValueError(
- f"`return_type` cannot be set to {return_type} when generation_mode={generation_mode}. "
- "Set `return_type=None` or generation_mode='text'"
- )
- if generation_mode not in [None, "text", "image", "audio"]:
- raise ValueError(
- f"`generation_mode` can be only one of the `text`, `audio`, `image` but got generation_mode[={generation_mode}]"
- )
- elif generation_mode is not None and generation_mode not in self.model.output_modalities:
- raise ValueError(
- f"`generation_mode={generation_mode}` is not supported for {self.model.__class__.__name__}. "
- f"The model can only output the following modalities: {self.model.output_modalities}"
- )
- if return_type is not None:
- postprocess_params["return_type"] = return_type
- if continue_final_message is not None:
- postprocess_params["continue_final_message"] = continue_final_message
- if clean_up_tokenization_spaces is not None:
- postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
- if skip_special_tokens is not None:
- postprocess_params["skip_special_tokens"] = skip_special_tokens
- postprocess_params["generation_mode"] = generation_mode
- return preprocess_params, forward_kwargs, postprocess_params
- @overload
- def __call__(
- self,
- text: str | None = None,
- images: Union[str, "Image.Image"] | None = None,
- videos: Union[str, "np.ndarray", "torch.Tensor"] | None = None,
- audio: Union[str, "np.ndarray"] | None = None,
- **kwargs: Any,
- ) -> list[dict[str, Any]]: ...
- @overload
- def __call__(
- self,
- text: list[str] | None = None,
- images: list[str] | list["Image.Image"] | None = None,
- videos: list[str] | list["np.ndarray"] | list["torch.Tensor"] | None = None,
- audio: list[str] | list["np.ndarray"] | None = None,
- **kwargs: Any,
- ) -> list[list[dict[str, Any]]]: ...
- def __call__(
- self,
- text: str | list[str] | list[dict],
- images: str | list[str] | list[list[str]] | ImageInput | None = None,
- videos: str | list[str] | VideoInput | None = None,
- audio: str | list[str] | AudioInput | None = None,
- **kwargs,
- ) -> list[dict[str, Any]] | list[list[dict[str, Any]]]:
- """
- Generate a text given text and optionally multimodal data passed as inputs.
- Args:
- text (`str`, `list[str]`, `list[dict]`):
- The text to be used for generation. If a list of strings is passed, the length of the list should be
- the same as the number of images. Text can also follow the chat format: a list of dictionaries where
- each dictionary represents a message in a conversation. Each dictionary should have two keys: 'role'
- and 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of
- dictionary containing the text of the message and the type of the message.
- images (`str`, `list[str]`, `ImageInput`):
- The pipeline handles three types of images:
- - A string containing a HTTP(s) link pointing to an image
- - A string containing a local path to an image
- - An image loaded in PIL directly
- The pipeline accepts either a single image or a batch of images. Finally, this pipeline also supports
- the chat format (see `text`) containing images and text in this argument.
- videos (`str`, `list[str]`, `VideoInput`):
- The pipeline handles three types of videos:
- - A string containing a HTTP(s) link pointing to a video
- - A string containing a local path to a video
- - A video loaded and decoded to array format
- The pipeline accepts either a single video or a batch of videos. Finally, this pipeline also supports
- the chat format (see `text`) containing videos and text in this argument.
- audio (`str`, `list[str]`, `AudioInput`):
- The pipeline handles three types of audios:
- - A string containing a HTTP(s) link pointing to an audio
- - A string containing a local path to an audio
- - An audio loaded in PIL directly
- The pipeline accepts either a single audios or a batch of audios. Finally, this pipeline also supports
- the chat format (see `text`) containing audios and text in this argument.
- return_tensors (`bool`, *optional*, defaults to `False`):
- Returns the tensors of predictions (as token indices) in the outputs. If set to
- `True`, the decoded text is not returned.
- return_text (`bool`, *optional*):
- Returns the decoded texts in the outputs.
- return_full_text (`bool`, *optional*, defaults to `True`):
- If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
- specified at the same time as `return_text`.
- clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
- Whether or not to clean up the potential extra spaces in the text output.
- continue_final_message( `bool`, *optional*): This indicates that you want the model to continue the
- last message in the input chat rather than starting a new one, allowing you to "prefill" its response.
- By default this is `True` when the final message in the input chat has the `assistant` role and
- `False` otherwise, but you can manually override that behaviour by setting this flag.
- Return:
- A list or a list of list of `dict`: Each result comes as a dictionary with the following key (cannot
- return a combination of both `generated_text` and `generated_token_ids`):
- - **generated_text** (`str`, present when `return_text=True` and `generation_mode="text"`) -- The generated text.
- - **generated_audio** (`np.ndarray`, present when `generation_mode="audio"`) -- The generated audio.
- - **generated_image** (`PIL.Image.Image`, present when `generation_mode="image"`) -- The generated image.
- - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True` and `generation_mode="text"`) -- The token
- ids of the generated text.
- - **input_text** (`str`) -- The input text.
- """
- if images is None and text is None:
- raise ValueError("You must at least provide either text or images.")
- if isinstance(text, (list, tuple, KeyDataset)) and isinstance(text[0], (list, tuple, dict)):
- # We have one or more prompts in list-of-dicts format, so this is chat mode
- if isinstance(text[0], dict) and "role" in text[0]:
- return super().__call__(Chat(text), **kwargs)
- elif isinstance(text[0], (list, tuple)) and isinstance(text[0][0], dict) and "role" in text[0][0]:
- chats = [Chat(chat) for chat in text] # 🐈 🐈 🐈
- return super().__call__(chats, **kwargs)
- if text is not None and not (isinstance(text, str) or (isinstance(text, list) and isinstance(text[0], str))):
- """
- Supports the following format
- - {"text": text, "image": image, "video": video, "audio": audio}
- - [{"text": text, "image": image, "video": video, "audio": audio}]
- - Generator and datasets
- This is a common pattern in other multimodal pipelines, so we support it here as well.
- """
- return super().__call__(text, **kwargs)
- # encourage the user to use the chat format if supported
- if getattr(self.processor, "chat_template", None) is not None:
- logger.warning_once(
- "The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even "
- "though this model supports chat. Consider using the chat format for better results. For more "
- "information, see https://huggingface.co/docs/transformers/en/chat_templating"
- )
- return super().__call__({"text": text, "images": images, "video": videos, "audio": audio}, **kwargs)
- def preprocess(self, inputs=None, timeout=None, continue_final_message=None, **processing_kwargs):
- if isinstance(inputs, Chat):
- # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
- # because very few models support multiple separate, consecutive assistant messages
- if continue_final_message is None:
- continue_final_message = inputs.messages[-1]["role"] == "assistant"
- # Processor kwargs are passed separately from jinja kwargs to chat template
- # but it was added only in https://github.com/huggingface/transformers/pull/44881
- processor_kwargs = processing_kwargs.pop("processor_kwargs", None) or {}
- chat_template_kwargs = {
- "continue_final_message": continue_final_message,
- "return_tensors": "pt",
- "tokenize": True,
- "return_dict": True,
- "add_generation_prompt": not continue_final_message,
- "processor_kwargs": processor_kwargs,
- **processing_kwargs,
- }
- # Handle Mistral tokenizer which does not accept processing kwargs
- if self.processor.tokenizer.__class__.__name__ == "MistralCommonBackend":
- chat_template_kwargs = {
- k: v for k, v in chat_template_kwargs.items() if k in ["padding", "truncation", "max_length"]
- }
- model_inputs = self.processor.apply_chat_template(
- inputs.messages,
- **chat_template_kwargs,
- ).to(dtype=self.dtype)
- model_inputs["text"] = inputs
- return model_inputs
- # In case we only have text inputs
- if isinstance(inputs, (list, tuple, str)):
- text = inputs
- inputs = {}
- else:
- inputs = inputs.copy() # avoid in-place changes if users passed dict
- text = inputs.pop("text")
- # Feature extractor do not load audio files and expect a decoded array
- if inputs.get("audio", None) is not None and hasattr(self.processor, "feature_extractor"):
- inputs["audio"] = self.processor.feature_extractor.fetch_audio(inputs["audio"])
- # If batched text inputs, we set padding to True unless specified otherwise
- processor_kwargs = processing_kwargs.pop("processor_kwargs", None) or processing_kwargs
- if isinstance(text, (list, tuple)) and len(text) > 1:
- processor_kwargs.setdefault("padding", True)
- model_inputs = self.processor(text=text, **inputs, return_tensors="pt", **processor_kwargs).to(
- dtype=self.dtype
- )
- model_inputs["text"] = text
- return model_inputs
- def _forward(self, model_inputs, generate_kwargs=None):
- generate_kwargs = {} if generate_kwargs is None else generate_kwargs
- prompt_text = model_inputs.pop("text")
- input_ids = model_inputs.get("input_ids", model_inputs.get("decoder_input_ids"))
- # User-defined `generation_config` passed to the pipeline call take precedence
- if "generation_config" not in generate_kwargs:
- generate_kwargs["generation_config"] = self.generation_config
- generated_sequence = self.model.generate(**model_inputs, **generate_kwargs)
- return {"generated_sequence": generated_sequence, "prompt_text": prompt_text, "input_ids": input_ids}
- def postprocess(
- self,
- model_outputs,
- return_type=None,
- continue_final_message=None,
- skip_special_tokens=None,
- **postprocess_kwargs,
- ):
- input_texts = model_outputs["prompt_text"]
- input_texts = [input_texts] if isinstance(input_texts, (str, Chat)) else input_texts
- generated_sequence = model_outputs["generated_sequence"]
- input_ids = model_outputs["input_ids"]
- if return_type == ReturnType.TENSORS:
- return [
- {"input_text": input_texts[i], "generated_token_ids": generated_sequence[i]}
- for i in range(len(input_texts))
- ]
- # Decode inputs and outputs the same way to remove input text from generated text if present
- skip_special_tokens = skip_special_tokens if skip_special_tokens is not None else True
- if getattr(self.tokenizer, "response_schema", False):
- skip_special_tokens = False
- generation_mode = postprocess_kwargs["generation_mode"] or "text"
- if generation_mode == "image" and hasattr(self.model, "decode_image_tokens"):
- generated_sequence = self.model.decode_image_tokens(generated_sequence.to(self.model.device))
- generated_outputs = self.processor.post_process_multimodal_output(
- generated_sequence, skip_special_tokens=skip_special_tokens, **postprocess_kwargs
- )
- # Force consistent behavior for including the input text in the output
- if return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
- # Remove the input text from the generated text if the generated text starts with the input text
- # (accounting for the possibility of a space between the input and generated text)
- new_generated_texts = []
- postprocess_kwargs["generation_mode"] = "text"
- decoded_inputs = self.processor.post_process_multimodal_output(
- input_ids, skip_special_tokens=skip_special_tokens, **postprocess_kwargs
- )
- for text_generated, decoded_input in zip(generated_outputs, decoded_inputs):
- # There can be added characters before the input text, so we need to find the beginning of the input text in the generated text
- index_input_text = text_generated.find(decoded_input)
- # Limit the search to 2 residual characters, like spaces or new lines, to avoid removing a large part of the answer
- if 0 <= index_input_text <= 2:
- # If the input text is found, we remove it
- new_generated_texts.append(text_generated[index_input_text + len(decoded_input) :])
- else:
- new_generated_texts.append(text_generated)
- generated_outputs = new_generated_texts
- if return_type == ReturnType.FULL_TEXT:
- full_texts = []
- for prompt_text, generated_text in zip(input_texts, generated_outputs):
- if isinstance(prompt_text, str):
- generated_text = prompt_text + generated_text
- elif isinstance(prompt_text, Chat):
- if continue_final_message is None:
- # If the user passes a chat ending in an assistant message, we treat it as a prefill by
- # default because very few models support multiple separate, consecutive assistant messages
- continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
- if continue_final_message:
- # With assistant prefill, concat onto the end of the last message
- new_text = dict(prompt_text.messages[-1]["content"][-1].items())
- new_text["text"] += generated_text
- generated_text = list(prompt_text.messages)[:-1] + [
- {
- "role": prompt_text.messages[-1]["role"],
- "content": prompt_text.messages[-1]["content"][:-1] + [new_text],
- }
- ]
- else:
- # When we're not starting from a prefill, the output is a new assistant message
- if getattr(self.tokenizer, "response_schema", False):
- assistant_message = self.tokenizer.parse_response(generated_text)
- else:
- assistant_message = {"role": "assistant", "content": generated_text}
- generated_text = list(prompt_text.messages) + [assistant_message]
- full_texts.append(generated_text)
- generated_outputs = full_texts
- records = [
- {
- "input_text": input_text.messages if isinstance(input_text, Chat) else input_text,
- f"generated_{generation_mode}": generated_output,
- }
- for input_text, generated_output in zip(input_texts, generated_outputs)
- ]
- return records
|