processing_clipseg.py 3.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # Copyright 2022 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. Image/Text processor class for CLIPSeg
  16. """
  17. from ...processing_utils import ProcessorMixin
  18. from ...tokenization_utils_base import BatchEncoding
  19. from ...utils import auto_docstring
  20. @auto_docstring
  21. class CLIPSegProcessor(ProcessorMixin):
  22. def __init__(self, image_processor=None, tokenizer=None, **kwargs):
  23. super().__init__(image_processor, tokenizer)
  24. @auto_docstring
  25. def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=None, **kwargs):
  26. r"""
  27. visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
  28. The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image,
  29. NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape
  30. (C, H, W), where C is a number of channels, H and W are image height and width.
  31. Returns:
  32. [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
  33. - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
  34. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
  35. `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
  36. `None`).
  37. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
  38. """
  39. if text is None and visual_prompt is None and images is None:
  40. raise ValueError("You have to specify either text, visual prompt or images.")
  41. if text is not None and visual_prompt is not None:
  42. raise ValueError("You have to specify exactly one type of prompt. Either text or visual prompt.")
  43. output_kwargs = self._merge_kwargs(
  44. self.valid_processor_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
  45. )
  46. if text is not None:
  47. encoding = self.tokenizer(text, return_tensors=return_tensors, **output_kwargs["text_kwargs"])
  48. if visual_prompt is not None:
  49. prompt_features = self.image_processor(
  50. visual_prompt, return_tensors=return_tensors, **output_kwargs["images_kwargs"]
  51. )
  52. if images is not None:
  53. image_features = self.image_processor(
  54. images, return_tensors=return_tensors, **output_kwargs["images_kwargs"]
  55. )
  56. if visual_prompt is not None and images is not None:
  57. encoding = {
  58. "pixel_values": image_features.pixel_values,
  59. "conditional_pixel_values": prompt_features.pixel_values,
  60. }
  61. return encoding
  62. elif text is not None and images is not None:
  63. encoding["pixel_values"] = image_features.pixel_values
  64. return encoding
  65. elif text is not None:
  66. return encoding
  67. elif visual_prompt is not None:
  68. encoding = {
  69. "conditional_pixel_values": prompt_features.pixel_values,
  70. }
  71. return encoding
  72. else:
  73. return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
  74. __all__ = ["CLIPSegProcessor"]