| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299 |
- """Transforms for text rendering and augmentation on images.
- This module provides transforms for adding and manipulating text on images,
- including text augmentation techniques like word insertion, deletion, and swapping.
- """
- from __future__ import annotations
- import re
- from pathlib import Path
- from typing import Annotated, Any, Literal
- import numpy as np
- from pydantic import AfterValidator
- import albumentations.augmentations.text.functional as ftext
- from albumentations.core.bbox_utils import check_bboxes, denormalize_bboxes
- from albumentations.core.pydantic import check_range_bounds, nondecreasing
- from albumentations.core.transforms_interface import BaseTransformInitSchema, ImageOnlyTransform
- __all__ = ["TextImage"]
class TextImage(ImageOnlyTransform):
    """Render text onto images inside bounding boxes, with optional text augmentation.

    The text to draw and the boxes to draw it in are supplied at call time through
    a metadata entry (see ``metadata_key``). Each metadata item is a dict with a
    ``"bbox"`` key (normalized ``(x_min, y_min, x_max, y_max)`` coordinates) and a
    ``"text"`` key. A single dict is accepted as shorthand for a one-element list.

    Args:
        font_path (str | Path): Path to the font file to use for rendering text.
        stopwords (tuple[str, ...]): Stopwords used by the "insertion" augmentation.
        augmentations (tuple[str | None, ...]): Text augmentations to choose from.
            One entry is picked at random for every rendered bounding box:
            None: text is printed as is
            "insertion": insert random stop words into the text.
            "swap": swap random words in the text.
            "deletion": delete random words from the text.
        fraction_range (tuple[float, float]): Range for selecting a fraction of bounding boxes to modify.
        font_size_fraction_range (tuple[float, float]): Range for selecting the font size as a fraction of
            bounding box height.
        font_color (tuple[float, ...]): Font color as RGB values (e.g., (0, 0, 0) for black).
        clear_bg (bool): Whether to clear the background before rendering text.
        metadata_key (str): Key to access metadata in the parameters.
        p (float): Probability of applying the transform.

    Targets:
        image, volume

    Image types:
        uint8, float32

    References:
        doc-augmentation: https://github.com/danaaubakirova/doc-augmentation

    Examples:
        >>> from pathlib import Path
        >>> import albumentations as A
        >>> transform = A.Compose([
        ...     A.TextImage(
        ...         font_path=Path("/path/to/font.ttf"),
        ...         stopwords=("the", "is", "in"),
        ...         augmentations=("insertion", "deletion"),
        ...         fraction_range=(0.5, 1.0),
        ...         font_size_fraction_range=(0.5, 0.9),
        ...         font_color=(255, 0, 0),  # red in RGB
        ...         metadata_key="text_metadata",
        ...         p=0.5,
        ...     )
        ... ])
        >>> transformed = transform(image=my_image, text_metadata=my_metadata)
        >>> image = transformed['image']
        # This will render text on `my_image` based on the metadata provided in `my_metadata`.

    """

    class InitSchema(BaseTransformInitSchema):
        font_path: str | Path
        stopwords: tuple[str, ...]
        augmentations: tuple[str | None, ...]
        # Both ranges must be ordered (low <= high) and lie within [0, 1].
        fraction_range: Annotated[
            tuple[float, float],
            AfterValidator(nondecreasing),
            AfterValidator(check_range_bounds(0, 1)),
        ]
        font_size_fraction_range: Annotated[
            tuple[float, float],
            AfterValidator(nondecreasing),
            AfterValidator(check_range_bounds(0, 1)),
        ]
        font_color: tuple[float, ...]
        clear_bg: bool
        metadata_key: str

    def __init__(
        self,
        font_path: str | Path,
        stopwords: tuple[str, ...] = ("the", "is", "in", "at", "of"),
        augmentations: tuple[Literal["insertion", "swap", "deletion"] | None, ...] = (None,),
        fraction_range: tuple[float, float] = (1.0, 1.0),
        font_size_fraction_range: tuple[float, float] = (0.8, 0.9),
        font_color: tuple[float, ...] = (0, 0, 0),  # black in RGB
        clear_bg: bool = False,
        metadata_key: str = "textimage_metadata",
        p: float = 0.5,
    ) -> None:
        super().__init__(p=p)
        self.metadata_key = metadata_key
        self.font_path = font_path
        self.fraction_range = fraction_range
        self.stopwords = stopwords
        # Stored as a list so one entry can be drawn with `py_random.choice`.
        self.augmentations = list(augmentations)
        self.font_size_fraction_range = font_size_fraction_range
        self.font_color = font_color
        self.clear_bg = clear_bg

    @property
    def targets_as_params(self) -> list[str]:
        """Get list of targets that should be passed as parameters to transforms.

        Returns:
            list[str]: List containing the metadata key name

        """
        return [self.metadata_key]

    def random_aug(
        self,
        text: str,
        fraction: float,
        choice: Literal["insertion", "swap", "deletion"],
    ) -> str:
        """Apply a random text augmentation to the input text.

        Args:
            text (str): Original text to augment
            fraction (float): Fraction of words to modify (at least one word is always targeted)
            choice (Literal["insertion", "swap", "deletion"]): Type of augmentation to apply

        Returns:
            str: Augmented text with whitespace collapsed, or an empty string if the
                augmentation left the words unchanged

        Raises:
            ValueError: If an invalid choice is provided

        """
        words = text.split()
        num_words_to_modify = max(1, int(fraction * len(words)))

        # Whitespace-normalized form of the input, captured *before* the helpers
        # run (they may modify `words` in place). Comparing against this instead
        # of the raw `text` keeps inputs that merely contain extra whitespace
        # from being reported as "changed" when the augmentation was a no-op.
        normalized_text = " ".join(words)

        if choice == "insertion":
            result_sentence = ftext.insert_random_stopwords(words, num_words_to_modify, self.stopwords, self.py_random)
        elif choice == "swap":
            result_sentence = ftext.swap_random_words(words, num_words_to_modify, self.py_random)
        elif choice == "deletion":
            result_sentence = ftext.delete_random_words(words, num_words_to_modify, self.py_random)
        else:
            raise ValueError("Invalid choice. Choose from 'insertion', 'swap', or 'deletion'.")

        result_sentence = re.sub(" +", " ", result_sentence).strip()
        return result_sentence if result_sentence != normalized_text else ""

    def preprocess_metadata(
        self,
        image: np.ndarray,
        bbox: tuple[float, float, float, float],
        text: str,
        bbox_index: int,
    ) -> dict[str, Any]:
        """Preprocess text metadata for a single bounding box.

        Args:
            image (np.ndarray): Input image
            bbox (tuple[float, float, float, float]): Normalized bounding box coordinates
            text (str): Text to render in the bounding box
            bbox_index (int): Index of the bounding box in the original metadata

        Returns:
            dict[str, Any]: Processed metadata including font, position, and text information

        Raises:
            ImportError: If PIL.ImageFont is not installed

        """
        # Pillow is imported lazily so it is only required when this transform is used.
        try:
            from PIL import ImageFont
        except ImportError as err:
            raise ImportError(
                "ImageFont from PIL is required to use TextImage transform. Install it with `pip install Pillow`.",
            ) from err

        bbox_array = np.array([bbox])
        check_bboxes(bbox_array)
        denormalized_bbox = denormalize_bboxes(bbox_array, image.shape[:2])[0]

        x_min, y_min, x_max, y_max = (int(coord) for coord in denormalized_bbox[:4])
        bbox_height = y_max - y_min

        font_size_fraction = self.py_random.uniform(*self.font_size_fraction_range)
        # Integer truncation can yield 0 for small boxes or small fractions, and a
        # non-positive size makes `ImageFont.truetype` fail; fall back to 1 pixel.
        font_size = max(1, int(font_size_fraction * bbox_height))
        font = ImageFont.truetype(str(self.font_path), font_size)

        if not self.augmentations:
            augmented_text = text
        else:
            augmentation = self.py_random.choice(self.augmentations)
            augmented_text = text if augmentation is None else self.random_aug(text, 0.5, choice=augmentation)

        return {
            "bbox_coords": (x_min, y_min, x_max, y_max),
            "bbox_index": bbox_index,
            "original_text": text,
            "text": augmented_text,
            "font": font,
            "font_color": self.font_color,
        }

    def get_params_dependent_on_data(self, params: dict[str, Any], data: dict[str, Any]) -> dict[str, Any]:
        """Generate parameters based on input data.

        Args:
            params (dict[str, Any]): Dictionary of existing parameters
            data (dict[str, Any]): Dictionary containing input data with image and metadata

        Returns:
            dict[str, Any]: Dictionary containing the overlay data for text rendering

        """
        image = data["image"] if "image" in data else data["images"][0]
        metadata = data[self.metadata_key]

        # Nothing to render for any empty metadata value (None, [], (), {}).
        if not metadata:
            return {"overlay_data": []}

        # A single dict is shorthand for a one-element list.
        if isinstance(metadata, dict):
            metadata = [metadata]

        fraction = self.py_random.uniform(*self.fraction_range)
        num_bboxes_to_modify = int(len(metadata) * fraction)
        bbox_indices_to_update = self.py_random.sample(range(len(metadata)), num_bboxes_to_modify)

        overlay_data = [
            self.preprocess_metadata(image, metadata[bbox_index]["bbox"], metadata[bbox_index]["text"], bbox_index)
            for bbox_index in bbox_indices_to_update
        ]

        return {"overlay_data": overlay_data}

    def apply(
        self,
        img: np.ndarray,
        overlay_data: list[dict[str, Any]],
        **params: Any,
    ) -> np.ndarray:
        """Apply text rendering to the input image.

        Args:
            img (np.ndarray): Input image
            overlay_data (list[dict[str, Any]]): List of dictionaries containing text rendering information
            **params (Any): Additional parameters

        Returns:
            np.ndarray: Image with rendered text

        """
        return ftext.render_text(img, overlay_data, clear_bg=self.clear_bg)

    def apply_with_params(self, params: dict[str, Any], *args: Any, **kwargs: Any) -> dict[str, Any]:
        """Apply the transform and include overlay data in the result.

        Args:
            params (dict[str, Any]): Parameters for the transform
            *args (Any): Additional positional arguments
            **kwargs (Any): Additional keyword arguments

        Returns:
            dict[str, Any]: Dictionary containing the transformed data and simplified overlay information

        """
        res = super().apply_with_params(params, *args, **kwargs)
        # Expose every overlay field except the loaded "font" object.
        res["overlay_data"] = [
            {
                "bbox_coords": overlay["bbox_coords"],
                "text": overlay["text"],
                "original_text": overlay["original_text"],
                "bbox_index": overlay["bbox_index"],
                "font_color": overlay["font_color"],
            }
            for overlay in params["overlay_data"]
        ]
        return res
|