_typing.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
# Custom type aliases used throughout Beautiful Soup to improve readability.

# Notes on improvements to the type system in newer versions of Python
# that can be used once Beautiful Soup drops support for older
# versions:
#
# * ClassVar can be put on class variables now.
# * In 3.10, x|y is an accepted shorthand for Union[x,y].
# * In 3.10, TypeAlias gains capabilities that can be used to
#   improve the tree matching types (I don't remember what, exactly).
# * In 3.9 it's possible to specialize the re.Match type,
#   e.g. re.Match[str]. In 3.8 there's a typing.re namespace for this,
#   but it is deprecated and was removed in 3.13, so to support the
#   widest possible set of versions I'm not using it.
  14. from typing_extensions import (
  15. runtime_checkable,
  16. Protocol,
  17. TypeAlias,
  18. )
  19. from typing import (
  20. Any,
  21. Callable,
  22. Dict,
  23. IO,
  24. Iterable,
  25. Mapping,
  26. Optional,
  27. Pattern,
  28. TYPE_CHECKING,
  29. Union,
  30. )
  31. if TYPE_CHECKING:
  32. from bs4.element import (
  33. AttributeValueList,
  34. NamespacedAttribute,
  35. NavigableString,
  36. PageElement,
  37. ResultSet,
  38. Tag,
  39. )
  40. @runtime_checkable
  41. class _RegularExpressionProtocol(Protocol):
  42. """A protocol object which can accept either Python's built-in
  43. `re.Pattern` objects, or the similar ``Regex`` objects defined by the
  44. third-party ``regex`` package.
  45. """
  46. def search(
  47. self, string: str, pos: int = ..., endpos: int = ...
  48. ) -> Optional[Any]: ...
  49. @property
  50. def pattern(self) -> str: ...
  51. # Aliases for markup in various stages of processing.
  52. #
  53. #: The rawest form of markup: either a string, bytestring, or an open filehandle.
  54. _IncomingMarkup: TypeAlias = Union[str, bytes, IO[str], IO[bytes]]
  55. #: Markup that is in memory but has (potentially) yet to be converted
  56. #: to Unicode.
  57. _RawMarkup: TypeAlias = Union[str, bytes]
  58. # Aliases for character encodings
  59. #
  60. #: A data encoding.
  61. _Encoding: TypeAlias = str
  62. #: One or more data encodings.
  63. _Encodings: TypeAlias = Iterable[_Encoding]
  64. # Aliases for XML namespaces
  65. #
  66. #: The prefix for an XML namespace.
  67. _NamespacePrefix: TypeAlias = str
  68. #: The URL of an XML namespace
  69. _NamespaceURL: TypeAlias = str
  70. #: A mapping of prefixes to namespace URLs.
  71. _NamespaceMapping: TypeAlias = Dict[_NamespacePrefix, _NamespaceURL]
  72. #: A mapping of namespace URLs to prefixes
  73. _InvertedNamespaceMapping: TypeAlias = Dict[_NamespaceURL, _NamespacePrefix]
  74. # Aliases for the attribute values associated with HTML/XML tags.
  75. #
  76. #: The value associated with an HTML or XML attribute. This is the
  77. #: relatively unprocessed value Beautiful Soup expects to come from a
  78. #: `TreeBuilder`.
  79. _RawAttributeValue: TypeAlias = str
  80. #: A dictionary of names to `_RawAttributeValue` objects. This is how
  81. #: Beautiful Soup expects a `TreeBuilder` to represent a tag's
  82. #: attribute values.
  83. _RawAttributeValues: TypeAlias = (
  84. "Mapping[Union[str, NamespacedAttribute], _RawAttributeValue]"
  85. )
  86. #: An attribute value in its final form, as stored in the
  87. # `Tag` class, after it has been processed and (in some cases)
  88. # split into a list of strings.
  89. _AttributeValue: TypeAlias = Union[str, "AttributeValueList"]
  90. #: A dictionary of names to :py:data:`_AttributeValue` objects. This is what
  91. #: a tag's attributes look like after processing.
  92. _AttributeValues: TypeAlias = Dict[str, _AttributeValue]
  93. #: The methods that deal with turning :py:data:`_RawAttributeValue` into
  94. #: :py:data:`_AttributeValue` may be called several times, even after the values
  95. #: are already processed (e.g. when cloning a tag), so they need to
  96. #: be able to acommodate both possibilities.
  97. _RawOrProcessedAttributeValues: TypeAlias = Union[_RawAttributeValues, _AttributeValues]
  98. #: A number of tree manipulation methods can take either a `PageElement` or a
  99. #: normal Python string (which will be converted to a `NavigableString`).
  100. _InsertableElement: TypeAlias = Union["PageElement", str]
  101. # Aliases to represent the many possibilities for matching bits of a
  102. # parse tree.
  103. #
  104. # This is very complicated because we're applying a formal type system
  105. # to some very DWIM code. The types we end up with will be the types
  106. # of the arguments to the SoupStrainer constructor and (more
  107. # familiarly to Beautiful Soup users) the find* methods.
  108. #: A function that takes a PageElement and returns a yes-or-no answer.
  109. _PageElementMatchFunction: TypeAlias = Callable[["PageElement"], bool]
  110. #: A function that takes the raw parsed ingredients of a markup tag
  111. #: and returns a yes-or-no answer.
  112. # Not necessary at the moment.
  113. # _AllowTagCreationFunction:TypeAlias = Callable[[Optional[str], str, Optional[_RawAttributeValues]], bool]
  114. #: A function that takes the raw parsed ingredients of a markup string node
  115. #: and returns a yes-or-no answer.
  116. # Not necessary at the moment.
  117. # _AllowStringCreationFunction:TypeAlias = Callable[[Optional[str]], bool]
  118. #: A function that takes a `Tag` and returns a yes-or-no answer.
  119. #: A `TagNameMatchRule` expects this kind of function, if you're
  120. #: going to pass it a function.
  121. _TagMatchFunction: TypeAlias = Callable[["Tag"], bool]
  122. #: A function that takes a string (or None) and returns a yes-or-no
  123. #: answer. An `AttributeValueMatchRule` expects this kind of function, if
  124. #: you're going to pass it a function.
  125. _NullableStringMatchFunction: TypeAlias = Callable[[Optional[str]], bool]
  126. #: A function that takes a string and returns a yes-or-no answer. A
  127. # `StringMatchRule` expects this kind of function, if you're going to
  128. # pass it a function.
  129. _StringMatchFunction: TypeAlias = Callable[[str], bool]
  130. #: Either a tag name, an attribute value or a string can be matched
  131. #: against a string, bytestring, regular expression, or a boolean.
  132. _BaseStrainable: TypeAlias = Union[str, bytes, Pattern[str], bool]
  133. #: A tag can be matched either with the `_BaseStrainable` options, or
  134. #: using a function that takes the `Tag` as its sole argument.
  135. _BaseStrainableElement: TypeAlias = Union[_BaseStrainable, _TagMatchFunction]
  136. #: A tag's attribute value can be matched either with the
  137. #: `_BaseStrainable` options, or using a function that takes that
  138. #: value as its sole argument.
  139. _BaseStrainableAttribute: TypeAlias = Union[_BaseStrainable, _NullableStringMatchFunction]
  140. #: A tag can be matched using either a single criterion or a list of
  141. #: criteria.
  142. _StrainableElement: TypeAlias = Union[
  143. _BaseStrainableElement, Iterable[_BaseStrainableElement]
  144. ]
  145. #: An attribute value can be matched using either a single criterion
  146. #: or a list of criteria.
  147. _StrainableAttribute: TypeAlias = Union[
  148. _BaseStrainableAttribute, Iterable[_BaseStrainableAttribute]
  149. ]
  150. #: An string can be matched using the same techniques as
  151. #: an attribute value.
  152. _StrainableString: TypeAlias = _StrainableAttribute
  153. #: A dictionary may be used to match against multiple attribute vlaues at once.
  154. _StrainableAttributes: TypeAlias = Dict[str, _StrainableAttribute]
  155. #: Many Beautiful soup methods return a PageElement or an ResultSet of
  156. #: PageElements. A PageElement is either a Tag or a NavigableString.
  157. #: These convenience aliases make it easier for IDE users to see which methods
  158. #: are available on the objects they're dealing with.
  159. _OneElement: TypeAlias = Union["PageElement", "Tag", "NavigableString"]
  160. _AtMostOneElement: TypeAlias = Optional[_OneElement]
  161. _AtMostOneTag: TypeAlias = Optional["Tag"]
  162. _AtMostOneNavigableString: TypeAlias = Optional["NavigableString"]
  163. _QueryResults: TypeAlias = "ResultSet[_OneElement]"
  164. _SomeTags: TypeAlias = "ResultSet[Tag]"
  165. _SomeNavigableStrings: TypeAlias = "ResultSet[NavigableString]"