| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276 |
- from __future__ import annotations
- from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
- from typing_extensions import TypeAlias
- from bs4.dammit import EntitySubstitution
- if TYPE_CHECKING:
- from bs4._typing import _AttributeValue
- class Formatter(EntitySubstitution):
- """Describes a strategy to use when outputting a parse tree to a string.
- Some parts of this strategy come from the distinction between
- HTML4, HTML5, and XML. Others are configurable by the user.
- Formatters are passed in as the `formatter` argument to methods
- like `bs4.element.Tag.encode`. Most people won't need to
- think about formatters, and most people who need to think about
- them can pass in one of these predefined strings as `formatter`
- rather than making a new Formatter object:
- For HTML documents:
- * 'html' - HTML entity substitution for generic HTML documents. (default)
- * 'html5' - HTML entity substitution for HTML5 documents, as
- well as some optimizations in the way tags are rendered.
- * 'html5-4.12.0' - The version of the 'html5' formatter used prior to
- Beautiful Soup 4.13.0.
- * 'minimal' - Only make the substitutions necessary to guarantee
- valid HTML.
- * None - Do not perform any substitution. This will be faster
- but may result in invalid markup.
- For XML documents:
- * 'html' - Entity substitution for XHTML documents.
- * 'minimal' - Only make the substitutions necessary to guarantee
- valid XML. (default)
- * None - Do not perform any substitution. This will be faster
- but may result in invalid markup.
- """
- #: Constant name denoting HTML markup
- HTML: str = "html"
- #: Constant name denoting XML markup
- XML: str = "xml"
- #: Default values for the various constructor options when the
- #: markup language is HTML.
- HTML_DEFAULTS: Dict[str, Set[str]] = dict(
- cdata_containing_tags=set(["script", "style"]),
- )
- language: Optional[str] #: :meta private:
- entity_substitution: Optional[_EntitySubstitutionFunction] #: :meta private:
- void_element_close_prefix: str #: :meta private:
- cdata_containing_tags: Set[str] #: :meta private:
- indent: str #: :meta private:
- #: If this is set to true by the constructor, then attributes whose
- #: values are sent to the empty string will be treated as HTML
- #: boolean attributes. (Attributes whose value is None are always
- #: rendered this way.)
- empty_attributes_are_booleans: bool
- def _default(
- self, language: str, value: Optional[Set[str]], kwarg: str
- ) -> Set[str]:
- if value is not None:
- return value
- if language == self.XML:
- # When XML is the markup language in use, all of the
- # defaults are the empty list.
- return set()
- # Otherwise, it depends on what's in HTML_DEFAULTS.
- return self.HTML_DEFAULTS[kwarg]
- def __init__(
- self,
- language: Optional[str] = None,
- entity_substitution: Optional[_EntitySubstitutionFunction] = None,
- void_element_close_prefix: str = "/",
- cdata_containing_tags: Optional[Set[str]] = None,
- empty_attributes_are_booleans: bool = False,
- indent: Union[int,str] = 1,
- ):
- r"""Constructor.
- :param language: This should be `Formatter.XML` if you are formatting
- XML markup and `Formatter.HTML` if you are formatting HTML markup.
- :param entity_substitution: A function to call to replace special
- characters with XML/HTML entities. For examples, see
- bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
- :param void_element_close_prefix: By default, void elements
- are represented as <tag/> (XML rules) rather than <tag>
- (HTML rules). To get <tag>, pass in the empty string.
- :param cdata_containing_tags: The set of tags that are defined
- as containing CDATA in this dialect. For example, in HTML,
- <script> and <style> tags are defined as containing CDATA,
- and their contents should not be formatted.
- :param empty_attributes_are_booleans: If this is set to true,
- then attributes whose values are sent to the empty string
- will be treated as `HTML boolean
- attributes<https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_. (Attributes
- whose value is None are always rendered this way.)
- :param indent: If indent is a non-negative integer or string,
- then the contents of elements will be indented
- appropriately when pretty-printing. An indent level of 0,
- negative, or "" will only insert newlines. Using a
- positive integer indent indents that many spaces per
- level. If indent is a string (such as "\t"), that string
- is used to indent each level. The default behavior is to
- indent one space per level.
- """
- self.language = language or self.HTML
- self.entity_substitution = entity_substitution
- self.void_element_close_prefix = void_element_close_prefix
- self.cdata_containing_tags = self._default(
- self.language, cdata_containing_tags, "cdata_containing_tags"
- )
- self.empty_attributes_are_booleans = empty_attributes_are_booleans
- if indent is None:
- indent = 0
- indent_str: str
- if isinstance(indent, int):
- if indent < 0:
- indent = 0
- indent_str = " " * indent
- elif isinstance(indent, str):
- indent_str = indent
- else:
- indent_str = " "
- self.indent = indent_str
- def substitute(self, ns: str) -> str:
- """Process a string that needs to undergo entity substitution.
- This may be a string encountered in an attribute value or as
- text.
- :param ns: A string.
- :return: The same string but with certain characters replaced by named
- or numeric entities.
- """
- if not self.entity_substitution:
- return ns
- from .element import NavigableString
- if (
- isinstance(ns, NavigableString)
- and ns.parent is not None
- and ns.parent.name in self.cdata_containing_tags
- ):
- # Do nothing.
- return ns
- # Substitute.
- return self.entity_substitution(ns)
- def attribute_value(self, value: str) -> str:
- """Process the value of an attribute.
- :param ns: A string.
- :return: A string with certain characters replaced by named
- or numeric entities.
- """
- return self.substitute(value)
- def attributes(
- self, tag: bs4.element.Tag # type:ignore
- ) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
- """Reorder a tag's attributes however you want.
- By default, attributes are sorted alphabetically. This makes
- behavior consistent between Python 2 and Python 3, and preserves
- backwards compatibility with older versions of Beautiful Soup.
- If `empty_attributes_are_booleans` is True, then
- attributes whose values are set to the empty string will be
- treated as boolean attributes.
- """
- if tag.attrs is None:
- return []
- items: Iterable[Tuple[str, _AttributeValue]] = list(tag.attrs.items())
- return sorted(
- (k, (None if self.empty_attributes_are_booleans and v == "" else v))
- for k, v in items
- )
- class HTMLFormatter(Formatter):
- """A generic Formatter for HTML."""
- REGISTRY: Dict[Optional[str], HTMLFormatter] = {}
- def __init__(
- self,
- entity_substitution: Optional[_EntitySubstitutionFunction] = None,
- void_element_close_prefix: str = "/",
- cdata_containing_tags: Optional[Set[str]] = None,
- empty_attributes_are_booleans: bool = False,
- indent: Union[int,str] = 1,
- ):
- super(HTMLFormatter, self).__init__(
- self.HTML,
- entity_substitution,
- void_element_close_prefix,
- cdata_containing_tags,
- empty_attributes_are_booleans,
- indent=indent
- )
- class XMLFormatter(Formatter):
- """A generic Formatter for XML."""
- REGISTRY: Dict[Optional[str], XMLFormatter] = {}
- def __init__(
- self,
- entity_substitution: Optional[_EntitySubstitutionFunction] = None,
- void_element_close_prefix: str = "/",
- cdata_containing_tags: Optional[Set[str]] = None,
- empty_attributes_are_booleans: bool = False,
- indent: Union[int,str] = 1,
- ):
- super(XMLFormatter, self).__init__(
- self.XML,
- entity_substitution,
- void_element_close_prefix,
- cdata_containing_tags,
- empty_attributes_are_booleans,
- indent=indent,
- )
- # Set up aliases for the default formatters.
- HTMLFormatter.REGISTRY["html"] = HTMLFormatter(
- entity_substitution=EntitySubstitution.substitute_html
- )
- HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
- entity_substitution=EntitySubstitution.substitute_html5,
- void_element_close_prefix="",
- empty_attributes_are_booleans=True,
- )
- HTMLFormatter.REGISTRY["html5-4.12"] = HTMLFormatter(
- entity_substitution=EntitySubstitution.substitute_html,
- void_element_close_prefix="",
- empty_attributes_are_booleans=True,
- )
- HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
- entity_substitution=EntitySubstitution.substitute_xml
- )
- HTMLFormatter.REGISTRY[None] = HTMLFormatter(entity_substitution=None)
- XMLFormatter.REGISTRY["html"] = XMLFormatter(
- entity_substitution=EntitySubstitution.substitute_html
- )
- XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
- entity_substitution=EntitySubstitution.substitute_xml
- )
- XMLFormatter.REGISTRY[None] = XMLFormatter(entity_substitution=None)
- # Define type aliases to improve readability.
- #
- #: A function to call to replace special characters with XML or HTML
- #: entities.
- _EntitySubstitutionFunction: TypeAlias = Callable[[str], str]
- # Many of the output-centered methods take an argument that can either
- # be a Formatter object or the name of a Formatter to be looked up.
- _FormatterOrName = Union[Formatter, str]
|