formatter.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. from __future__ import annotations
  2. from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
  3. from typing_extensions import TypeAlias
  4. from bs4.dammit import EntitySubstitution
  5. if TYPE_CHECKING:
  6. from bs4._typing import _AttributeValue
  7. class Formatter(EntitySubstitution):
  8. """Describes a strategy to use when outputting a parse tree to a string.
  9. Some parts of this strategy come from the distinction between
  10. HTML4, HTML5, and XML. Others are configurable by the user.
  11. Formatters are passed in as the `formatter` argument to methods
  12. like `bs4.element.Tag.encode`. Most people won't need to
  13. think about formatters, and most people who need to think about
  14. them can pass in one of these predefined strings as `formatter`
  15. rather than making a new Formatter object:
  16. For HTML documents:
  17. * 'html' - HTML entity substitution for generic HTML documents. (default)
  18. * 'html5' - HTML entity substitution for HTML5 documents, as
  19. well as some optimizations in the way tags are rendered.
  20. * 'html5-4.12.0' - The version of the 'html5' formatter used prior to
  21. Beautiful Soup 4.13.0.
  22. * 'minimal' - Only make the substitutions necessary to guarantee
  23. valid HTML.
  24. * None - Do not perform any substitution. This will be faster
  25. but may result in invalid markup.
  26. For XML documents:
  27. * 'html' - Entity substitution for XHTML documents.
  28. * 'minimal' - Only make the substitutions necessary to guarantee
  29. valid XML. (default)
  30. * None - Do not perform any substitution. This will be faster
  31. but may result in invalid markup.
  32. """
  33. #: Constant name denoting HTML markup
  34. HTML: str = "html"
  35. #: Constant name denoting XML markup
  36. XML: str = "xml"
  37. #: Default values for the various constructor options when the
  38. #: markup language is HTML.
  39. HTML_DEFAULTS: Dict[str, Set[str]] = dict(
  40. cdata_containing_tags=set(["script", "style"]),
  41. )
  42. language: Optional[str] #: :meta private:
  43. entity_substitution: Optional[_EntitySubstitutionFunction] #: :meta private:
  44. void_element_close_prefix: str #: :meta private:
  45. cdata_containing_tags: Set[str] #: :meta private:
  46. indent: str #: :meta private:
  47. #: If this is set to true by the constructor, then attributes whose
  48. #: values are sent to the empty string will be treated as HTML
  49. #: boolean attributes. (Attributes whose value is None are always
  50. #: rendered this way.)
  51. empty_attributes_are_booleans: bool
  52. def _default(
  53. self, language: str, value: Optional[Set[str]], kwarg: str
  54. ) -> Set[str]:
  55. if value is not None:
  56. return value
  57. if language == self.XML:
  58. # When XML is the markup language in use, all of the
  59. # defaults are the empty list.
  60. return set()
  61. # Otherwise, it depends on what's in HTML_DEFAULTS.
  62. return self.HTML_DEFAULTS[kwarg]
  63. def __init__(
  64. self,
  65. language: Optional[str] = None,
  66. entity_substitution: Optional[_EntitySubstitutionFunction] = None,
  67. void_element_close_prefix: str = "/",
  68. cdata_containing_tags: Optional[Set[str]] = None,
  69. empty_attributes_are_booleans: bool = False,
  70. indent: Union[int,str] = 1,
  71. ):
  72. r"""Constructor.
  73. :param language: This should be `Formatter.XML` if you are formatting
  74. XML markup and `Formatter.HTML` if you are formatting HTML markup.
  75. :param entity_substitution: A function to call to replace special
  76. characters with XML/HTML entities. For examples, see
  77. bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
  78. :param void_element_close_prefix: By default, void elements
  79. are represented as <tag/> (XML rules) rather than <tag>
  80. (HTML rules). To get <tag>, pass in the empty string.
  81. :param cdata_containing_tags: The set of tags that are defined
  82. as containing CDATA in this dialect. For example, in HTML,
  83. <script> and <style> tags are defined as containing CDATA,
  84. and their contents should not be formatted.
  85. :param empty_attributes_are_booleans: If this is set to true,
  86. then attributes whose values are sent to the empty string
  87. will be treated as `HTML boolean
  88. attributes<https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_. (Attributes
  89. whose value is None are always rendered this way.)
  90. :param indent: If indent is a non-negative integer or string,
  91. then the contents of elements will be indented
  92. appropriately when pretty-printing. An indent level of 0,
  93. negative, or "" will only insert newlines. Using a
  94. positive integer indent indents that many spaces per
  95. level. If indent is a string (such as "\t"), that string
  96. is used to indent each level. The default behavior is to
  97. indent one space per level.
  98. """
  99. self.language = language or self.HTML
  100. self.entity_substitution = entity_substitution
  101. self.void_element_close_prefix = void_element_close_prefix
  102. self.cdata_containing_tags = self._default(
  103. self.language, cdata_containing_tags, "cdata_containing_tags"
  104. )
  105. self.empty_attributes_are_booleans = empty_attributes_are_booleans
  106. if indent is None:
  107. indent = 0
  108. indent_str: str
  109. if isinstance(indent, int):
  110. if indent < 0:
  111. indent = 0
  112. indent_str = " " * indent
  113. elif isinstance(indent, str):
  114. indent_str = indent
  115. else:
  116. indent_str = " "
  117. self.indent = indent_str
  118. def substitute(self, ns: str) -> str:
  119. """Process a string that needs to undergo entity substitution.
  120. This may be a string encountered in an attribute value or as
  121. text.
  122. :param ns: A string.
  123. :return: The same string but with certain characters replaced by named
  124. or numeric entities.
  125. """
  126. if not self.entity_substitution:
  127. return ns
  128. from .element import NavigableString
  129. if (
  130. isinstance(ns, NavigableString)
  131. and ns.parent is not None
  132. and ns.parent.name in self.cdata_containing_tags
  133. ):
  134. # Do nothing.
  135. return ns
  136. # Substitute.
  137. return self.entity_substitution(ns)
  138. def attribute_value(self, value: str) -> str:
  139. """Process the value of an attribute.
  140. :param ns: A string.
  141. :return: A string with certain characters replaced by named
  142. or numeric entities.
  143. """
  144. return self.substitute(value)
  145. def attributes(
  146. self, tag: bs4.element.Tag # type:ignore
  147. ) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
  148. """Reorder a tag's attributes however you want.
  149. By default, attributes are sorted alphabetically. This makes
  150. behavior consistent between Python 2 and Python 3, and preserves
  151. backwards compatibility with older versions of Beautiful Soup.
  152. If `empty_attributes_are_booleans` is True, then
  153. attributes whose values are set to the empty string will be
  154. treated as boolean attributes.
  155. """
  156. if tag.attrs is None:
  157. return []
  158. items: Iterable[Tuple[str, _AttributeValue]] = list(tag.attrs.items())
  159. return sorted(
  160. (k, (None if self.empty_attributes_are_booleans and v == "" else v))
  161. for k, v in items
  162. )
  163. class HTMLFormatter(Formatter):
  164. """A generic Formatter for HTML."""
  165. REGISTRY: Dict[Optional[str], HTMLFormatter] = {}
  166. def __init__(
  167. self,
  168. entity_substitution: Optional[_EntitySubstitutionFunction] = None,
  169. void_element_close_prefix: str = "/",
  170. cdata_containing_tags: Optional[Set[str]] = None,
  171. empty_attributes_are_booleans: bool = False,
  172. indent: Union[int,str] = 1,
  173. ):
  174. super(HTMLFormatter, self).__init__(
  175. self.HTML,
  176. entity_substitution,
  177. void_element_close_prefix,
  178. cdata_containing_tags,
  179. empty_attributes_are_booleans,
  180. indent=indent
  181. )
  182. class XMLFormatter(Formatter):
  183. """A generic Formatter for XML."""
  184. REGISTRY: Dict[Optional[str], XMLFormatter] = {}
  185. def __init__(
  186. self,
  187. entity_substitution: Optional[_EntitySubstitutionFunction] = None,
  188. void_element_close_prefix: str = "/",
  189. cdata_containing_tags: Optional[Set[str]] = None,
  190. empty_attributes_are_booleans: bool = False,
  191. indent: Union[int,str] = 1,
  192. ):
  193. super(XMLFormatter, self).__init__(
  194. self.XML,
  195. entity_substitution,
  196. void_element_close_prefix,
  197. cdata_containing_tags,
  198. empty_attributes_are_booleans,
  199. indent=indent,
  200. )
  201. # Set up aliases for the default formatters.
  202. HTMLFormatter.REGISTRY["html"] = HTMLFormatter(
  203. entity_substitution=EntitySubstitution.substitute_html
  204. )
  205. HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
  206. entity_substitution=EntitySubstitution.substitute_html5,
  207. void_element_close_prefix="",
  208. empty_attributes_are_booleans=True,
  209. )
  210. HTMLFormatter.REGISTRY["html5-4.12"] = HTMLFormatter(
  211. entity_substitution=EntitySubstitution.substitute_html,
  212. void_element_close_prefix="",
  213. empty_attributes_are_booleans=True,
  214. )
  215. HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
  216. entity_substitution=EntitySubstitution.substitute_xml
  217. )
  218. HTMLFormatter.REGISTRY[None] = HTMLFormatter(entity_substitution=None)
  219. XMLFormatter.REGISTRY["html"] = XMLFormatter(
  220. entity_substitution=EntitySubstitution.substitute_html
  221. )
  222. XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
  223. entity_substitution=EntitySubstitution.substitute_xml
  224. )
  225. XMLFormatter.REGISTRY[None] = XMLFormatter(entity_substitution=None)
  226. # Define type aliases to improve readability.
  227. #
  228. #: A function to call to replace special characters with XML or HTML
  229. #: entities.
  230. _EntitySubstitutionFunction: TypeAlias = Callable[[str], str]
  231. # Many of the output-centered methods take an argument that can either
  232. # be a Formatter object or the name of a Formatter to be looked up.
  233. _FormatterOrName = Union[Formatter, str]