__init__.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848
  1. from __future__ import annotations
  2. # Use of this source code is governed by the MIT license.
  3. __license__ = "MIT"
  4. from collections import defaultdict
  5. import re
  6. from types import ModuleType
  7. from typing import (
  8. Any,
  9. cast,
  10. Dict,
  11. Iterable,
  12. List,
  13. Optional,
  14. Pattern,
  15. Set,
  16. Tuple,
  17. Type,
  18. TYPE_CHECKING,
  19. )
  20. import warnings
  21. import sys
  22. from bs4.element import (
  23. AttributeDict,
  24. AttributeValueList,
  25. CharsetMetaAttributeValue,
  26. ContentMetaAttributeValue,
  27. RubyParenthesisString,
  28. RubyTextString,
  29. Stylesheet,
  30. Script,
  31. TemplateString,
  32. nonwhitespace_re,
  33. )
  34. # Exceptions were moved to their own module in 4.13. Import here for
  35. # backwards compatibility.
  36. from bs4.exceptions import ParserRejectedMarkup
  37. from bs4._typing import (
  38. _AttributeValues,
  39. _RawAttributeValue,
  40. )
  41. from bs4._warnings import XMLParsedAsHTMLWarning
  42. if TYPE_CHECKING:
  43. from bs4 import BeautifulSoup
  44. from bs4.element import (
  45. NavigableString,
  46. Tag,
  47. )
  48. from bs4._typing import (
  49. _AttributeValue,
  50. _Encoding,
  51. _Encodings,
  52. _RawOrProcessedAttributeValues,
  53. _RawMarkup,
  54. )
  55. __all__ = [
  56. "HTMLTreeBuilder",
  57. "SAXTreeBuilder",
  58. "TreeBuilder",
  59. "TreeBuilderRegistry",
  60. ]
  61. # Some useful features for a TreeBuilder to have.
  62. FAST = "fast"
  63. PERMISSIVE = "permissive"
  64. STRICT = "strict"
  65. XML = "xml"
  66. HTML = "html"
  67. HTML_5 = "html5"
  68. __all__ = [
  69. "TreeBuilderRegistry",
  70. "TreeBuilder",
  71. "HTMLTreeBuilder",
  72. "DetectsXMLParsedAsHTML",
  73. "ParserRejectedMarkup", # backwards compatibility only as of 4.13.0
  74. ]
  75. class TreeBuilderRegistry(object):
  76. """A way of looking up TreeBuilder subclasses by their name or by desired
  77. features.
  78. """
  79. builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
  80. builders: List[Type[TreeBuilder]]
  81. def __init__(self) -> None:
  82. self.builders_for_feature = defaultdict(list)
  83. self.builders = []
  84. def register(self, treebuilder_class: type[TreeBuilder]) -> None:
  85. """Register a treebuilder based on its advertised features.
  86. :param treebuilder_class: A subclass of `TreeBuilder`. its
  87. `TreeBuilder.features` attribute should list its features.
  88. """
  89. for feature in treebuilder_class.features:
  90. self.builders_for_feature[feature].insert(0, treebuilder_class)
  91. self.builders.insert(0, treebuilder_class)
  92. def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
  93. """Look up a TreeBuilder subclass with the desired features.
  94. :param features: A list of features to look for. If none are
  95. provided, the most recently registered TreeBuilder subclass
  96. will be used.
  97. :return: A TreeBuilder subclass, or None if there's no
  98. registered subclass with all the requested features.
  99. """
  100. if len(self.builders) == 0:
  101. # There are no builders at all.
  102. return None
  103. if len(features) == 0:
  104. # They didn't ask for any features. Give them the most
  105. # recently registered builder.
  106. return self.builders[0]
  107. # Go down the list of features in order, and eliminate any builders
  108. # that don't match every feature.
  109. feature_list = list(features)
  110. feature_list.reverse()
  111. candidates = None
  112. candidate_set = None
  113. while len(feature_list) > 0:
  114. feature = feature_list.pop()
  115. we_have_the_feature = self.builders_for_feature.get(feature, [])
  116. if len(we_have_the_feature) > 0:
  117. if candidates is None:
  118. candidates = we_have_the_feature
  119. candidate_set = set(candidates)
  120. elif candidate_set is not None:
  121. # Eliminate any candidates that don't have this feature.
  122. candidate_set = candidate_set.intersection(set(we_have_the_feature))
  123. # The only valid candidates are the ones in candidate_set.
  124. # Go through the original list of candidates and pick the first one
  125. # that's in candidate_set.
  126. if candidate_set is None or candidates is None:
  127. return None
  128. for candidate in candidates:
  129. if candidate in candidate_set:
  130. return candidate
  131. return None
#: The module-wide `TreeBuilderRegistry` instance. The `BeautifulSoup`
#: constructor will take a list of features and use it to look up
#: `TreeBuilder` classes in this registry. `register_treebuilders_from`
#: adds builders to it.
builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
  135. class TreeBuilder(object):
  136. """Turn a textual document into a Beautiful Soup object tree.
  137. This is an abstract superclass which smooths out the behavior of
  138. different parser libraries into a single, unified interface.
  139. :param multi_valued_attributes: If this is set to None, the
  140. TreeBuilder will not turn any values for attributes like
  141. 'class' into lists. Setting this to a dictionary will
  142. customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
  143. for an example.
  144. Internally, these are called "CDATA list attributes", but that
  145. probably doesn't make sense to an end-user, so the argument name
  146. is ``multi_valued_attributes``.
  147. :param preserve_whitespace_tags: A set of tags to treat
  148. the way <pre> tags are treated in HTML. Tags in this set
  149. are immune from pretty-printing; their contents will always be
  150. output as-is.
  151. :param string_containers: A dictionary mapping tag names to
  152. the classes that should be instantiated to contain the textual
  153. contents of those tags. The default is to use NavigableString
  154. for every tag, no matter what the name. You can override the
  155. default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.
  156. :param store_line_numbers: If the parser keeps track of the line
  157. numbers and positions of the original markup, that information
  158. will, by default, be stored in each corresponding
  159. :py:class:`bs4.element.Tag` object. You can turn this off by
  160. passing store_line_numbers=False; then Tag.sourcepos and
  161. Tag.sourceline will always be None. If the parser you're using
  162. doesn't keep track of this information, then store_line_numbers
  163. is irrelevant.
  164. :param attribute_dict_class: The value of a multi-valued attribute
  165. (such as HTML's 'class') willl be stored in an instance of this
  166. class. The default is Beautiful Soup's built-in
  167. `AttributeValueList`, which is a normal Python list, and you
  168. will probably never need to change it.
  169. """
  170. USE_DEFAULT: Any = object() #: :meta private:
  171. def __init__(
  172. self,
  173. multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
  174. preserve_whitespace_tags: Set[str] = USE_DEFAULT,
  175. store_line_numbers: bool = USE_DEFAULT,
  176. string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
  177. empty_element_tags: Set[str] = USE_DEFAULT,
  178. attribute_dict_class: Type[AttributeDict] = AttributeDict,
  179. attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
  180. ):
  181. self.soup = None
  182. if multi_valued_attributes is self.USE_DEFAULT:
  183. multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
  184. self.cdata_list_attributes = multi_valued_attributes
  185. if preserve_whitespace_tags is self.USE_DEFAULT:
  186. preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
  187. self.preserve_whitespace_tags = preserve_whitespace_tags
  188. if empty_element_tags is self.USE_DEFAULT:
  189. self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
  190. else:
  191. self.empty_element_tags = empty_element_tags
  192. # TODO: store_line_numbers is probably irrelevant now that
  193. # the behavior of sourceline and sourcepos has been made consistent
  194. # everywhere.
  195. if store_line_numbers == self.USE_DEFAULT:
  196. store_line_numbers = self.TRACKS_LINE_NUMBERS
  197. self.store_line_numbers = store_line_numbers
  198. if string_containers == self.USE_DEFAULT:
  199. string_containers = self.DEFAULT_STRING_CONTAINERS
  200. self.string_containers = string_containers
  201. self.attribute_dict_class = attribute_dict_class
  202. self.attribute_value_list_class = attribute_value_list_class
  203. NAME: str = "[Unknown tree builder]"
  204. ALTERNATE_NAMES: Iterable[str] = []
  205. features: Iterable[str] = []
  206. is_xml: bool = False
  207. picklable: bool = False
  208. soup: Optional[BeautifulSoup] #: :meta private:
  209. #: A tag will be considered an empty-element
  210. #: tag when and only when it has no contents.
  211. empty_element_tags: Optional[Set[str]] = None #: :meta private:
  212. cdata_list_attributes: Dict[str, Set[str]] #: :meta private:
  213. preserve_whitespace_tags: Set[str] #: :meta private:
  214. string_containers: Dict[str, Type[NavigableString]] #: :meta private:
  215. tracks_line_numbers: bool #: :meta private:
  216. #: A value for these tag/attribute combinations is a space- or
  217. #: comma-separated list of CDATA, rather than a single CDATA.
  218. DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)
  219. #: Whitespace should be preserved inside these tags.
  220. DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()
  221. #: The textual contents of tags with these names should be
  222. #: instantiated with some class other than `bs4.element.NavigableString`.
  223. DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {} # type:ignore
  224. #: By default, tags are treated as empty-element tags if they have
  225. #: no contents--that is, using XML rules. HTMLTreeBuilder
  226. #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
  227. #: HTML 4 and HTML5 standards.
  228. DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None
  229. #: Most parsers don't keep track of line numbers.
  230. TRACKS_LINE_NUMBERS: bool = False
  231. def initialize_soup(self, soup: BeautifulSoup) -> None:
  232. """The BeautifulSoup object has been initialized and is now
  233. being associated with the TreeBuilder.
  234. :param soup: A BeautifulSoup object.
  235. """
  236. self.soup = soup
  237. def reset(self) -> None:
  238. """Do any work necessary to reset the underlying parser
  239. for a new document.
  240. By default, this does nothing.
  241. """
  242. pass
  243. def can_be_empty_element(self, tag_name: str) -> bool:
  244. """Might a tag with this name be an empty-element tag?
  245. The final markup may or may not actually present this tag as
  246. self-closing.
  247. For instance: an HTMLBuilder does not consider a <p> tag to be
  248. an empty-element tag (it's not in
  249. HTMLBuilder.empty_element_tags). This means an empty <p> tag
  250. will be presented as "<p></p>", not "<p/>" or "<p>".
  251. The default implementation has no opinion about which tags are
  252. empty-element tags, so a tag will be presented as an
  253. empty-element tag if and only if it has no children.
  254. "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
  255. be left alone.
  256. :param tag_name: The name of a markup tag.
  257. """
  258. if self.empty_element_tags is None:
  259. return True
  260. return tag_name in self.empty_element_tags
  261. def feed(self, markup: _RawMarkup) -> None:
  262. """Run incoming markup through some parsing process."""
  263. raise NotImplementedError()
  264. def prepare_markup(
  265. self,
  266. markup: _RawMarkup,
  267. user_specified_encoding: Optional[_Encoding] = None,
  268. document_declared_encoding: Optional[_Encoding] = None,
  269. exclude_encodings: Optional[_Encodings] = None,
  270. ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
  271. """Run any preliminary steps necessary to make incoming markup
  272. acceptable to the parser.
  273. :param markup: The markup that's about to be parsed.
  274. :param user_specified_encoding: The user asked to try this encoding
  275. to convert the markup into a Unicode string.
  276. :param document_declared_encoding: The markup itself claims to be
  277. in this encoding. NOTE: This argument is not used by the
  278. calling code and can probably be removed.
  279. :param exclude_encodings: The user asked *not* to try any of
  280. these encodings.
  281. :yield: A series of 4-tuples: (markup, encoding, declared encoding,
  282. has undergone character replacement)
  283. Each 4-tuple represents a strategy that the parser can try
  284. to convert the document to Unicode and parse it. Each
  285. strategy will be tried in turn.
  286. By default, the only strategy is to parse the markup
  287. as-is. See `LXMLTreeBuilderForXML` and
  288. `HTMLParserTreeBuilder` for implementations that take into
  289. account the quirks of particular parsers.
  290. :meta private:
  291. """
  292. yield markup, None, None, False
  293. def test_fragment_to_document(self, fragment: str) -> str:
  294. """Wrap an HTML fragment to make it look like a document.
  295. Different parsers do this differently. For instance, lxml
  296. introduces an empty <head> tag, and html5lib
  297. doesn't. Abstracting this away lets us write simple tests
  298. which run HTML fragments through the parser and compare the
  299. results against other HTML fragments.
  300. This method should not be used outside of unit tests.
  301. :param fragment: A fragment of HTML.
  302. :return: A full HTML document.
  303. :meta private:
  304. """
  305. return fragment
  306. def set_up_substitutions(self, tag: Tag) -> bool:
  307. """Set up any substitutions that will need to be performed on
  308. a `Tag` when it's output as a string.
  309. By default, this does nothing. See `HTMLTreeBuilder` for a
  310. case where this is used.
  311. :return: Whether or not a substitution was performed.
  312. :meta private:
  313. """
  314. return False
  315. def _replace_cdata_list_attribute_values(
  316. self, tag_name: str, attrs: _RawOrProcessedAttributeValues
  317. ) -> _AttributeValues:
  318. """When an attribute value is associated with a tag that can
  319. have multiple values for that attribute, convert the string
  320. value to a list of strings.
  321. Basically, replaces class="foo bar" with class=["foo", "bar"]
  322. NOTE: This method modifies its input in place.
  323. :param tag_name: The name of a tag.
  324. :param attrs: A dictionary containing the tag's attributes.
  325. Any appropriate attribute values will be modified in place.
  326. :return: The modified dictionary that was originally passed in.
  327. """
  328. # First, cast the attrs dict to _AttributeValues. This might
  329. # not be accurate yet, but it will be by the time this method
  330. # returns.
  331. modified_attrs = cast(_AttributeValues, attrs)
  332. if not modified_attrs or not self.cdata_list_attributes:
  333. # Nothing to do.
  334. return modified_attrs
  335. # There is at least a possibility that we need to modify one of
  336. # the attribute values.
  337. universal: Set[str] = self.cdata_list_attributes.get("*", set())
  338. tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
  339. for attr in list(modified_attrs.keys()):
  340. modified_value: _AttributeValue
  341. if attr in universal or (tag_specific and attr in tag_specific):
  342. # We have a "class"-type attribute whose string
  343. # value is a whitespace-separated list of
  344. # values. Split it into a list.
  345. original_value: _AttributeValue = modified_attrs[attr]
  346. if isinstance(original_value, _RawAttributeValue):
  347. # This is a _RawAttributeValue (a string) that
  348. # needs to be split and converted to a
  349. # AttributeValueList so it can be an
  350. # _AttributeValue.
  351. modified_value = self.attribute_value_list_class(
  352. nonwhitespace_re.findall(original_value)
  353. )
  354. else:
  355. # html5lib calls setAttributes twice for the
  356. # same tag when rearranging the parse tree. On
  357. # the second call the attribute value here is
  358. # already a list. This can also happen when a
  359. # Tag object is cloned. If this happens, leave
  360. # the value alone rather than trying to split
  361. # it again.
  362. modified_value = original_value
  363. modified_attrs[attr] = modified_value
  364. return modified_attrs
  365. class SAXTreeBuilder(TreeBuilder):
  366. """A Beautiful Soup treebuilder that listens for SAX events.
  367. This is not currently used for anything, and it will be removed
  368. soon. It was a good idea, but it wasn't properly integrated into the
  369. rest of Beautiful Soup, so there have been long stretches where it
  370. hasn't worked properly.
  371. """
  372. def __init__(self, *args: Any, **kwargs: Any) -> None:
  373. warnings.warn(
  374. "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
  375. DeprecationWarning,
  376. stacklevel=2,
  377. )
  378. super(SAXTreeBuilder, self).__init__(*args, **kwargs)
  379. def feed(self, markup: _RawMarkup) -> None:
  380. raise NotImplementedError()
  381. def close(self) -> None:
  382. pass
  383. def startElement(self, name: str, attrs: Dict[str, str]) -> None:
  384. attrs = AttributeDict((key[1], value) for key, value in list(attrs.items()))
  385. # print("Start %s, %r" % (name, attrs))
  386. assert self.soup is not None
  387. self.soup.handle_starttag(name, None, None, attrs)
  388. def endElement(self, name: str) -> None:
  389. # print("End %s" % name)
  390. assert self.soup is not None
  391. self.soup.handle_endtag(name)
  392. def startElementNS(
  393. self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
  394. ) -> None:
  395. # Throw away (ns, nodeName) for now.
  396. self.startElement(nodeName, attrs)
  397. def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
  398. # Throw away (ns, nodeName) for now.
  399. self.endElement(nodeName)
  400. # handler.endElementNS((ns, node.nodeName), node.nodeName)
  401. def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
  402. # Ignore the prefix for now.
  403. pass
  404. def endPrefixMapping(self, prefix: str) -> None:
  405. # Ignore the prefix for now.
  406. # handler.endPrefixMapping(prefix)
  407. pass
  408. def characters(self, content: str) -> None:
  409. assert self.soup is not None
  410. self.soup.handle_data(content)
  411. def startDocument(self) -> None:
  412. pass
  413. def endDocument(self) -> None:
  414. pass
  415. class HTMLTreeBuilder(TreeBuilder):
  416. """This TreeBuilder knows facts about HTML, such as which tags are treated
  417. specially by the HTML standard.
  418. """
  419. #: Some HTML tags are defined as having no contents. Beautiful Soup
  420. #: treats these specially.
  421. DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = set(
  422. [
  423. # These are from HTML5.
  424. "area",
  425. "base",
  426. "br",
  427. "col",
  428. "embed",
  429. "hr",
  430. "img",
  431. "input",
  432. "keygen",
  433. "link",
  434. "menuitem",
  435. "meta",
  436. "param",
  437. "source",
  438. "track",
  439. "wbr",
  440. # These are from earlier versions of HTML and are removed in HTML5.
  441. "basefont",
  442. "bgsound",
  443. "command",
  444. "frame",
  445. "image",
  446. "isindex",
  447. "nextid",
  448. "spacer",
  449. ]
  450. )
  451. #: The HTML standard defines these tags as block-level elements. Beautiful
  452. #: Soup does not treat these elements differently from other elements,
  453. #: but it may do so eventually, and this information is available if
  454. #: you need to use it.
  455. DEFAULT_BLOCK_ELEMENTS: Set[str] = set(
  456. [
  457. "address",
  458. "article",
  459. "aside",
  460. "blockquote",
  461. "canvas",
  462. "dd",
  463. "div",
  464. "dl",
  465. "dt",
  466. "fieldset",
  467. "figcaption",
  468. "figure",
  469. "footer",
  470. "form",
  471. "h1",
  472. "h2",
  473. "h3",
  474. "h4",
  475. "h5",
  476. "h6",
  477. "header",
  478. "hr",
  479. "li",
  480. "main",
  481. "nav",
  482. "noscript",
  483. "ol",
  484. "output",
  485. "p",
  486. "pre",
  487. "section",
  488. "table",
  489. "tfoot",
  490. "ul",
  491. "video",
  492. ]
  493. )
  494. #: These HTML tags need special treatment so they can be
  495. #: represented by a string class other than `bs4.element.NavigableString`.
  496. #:
  497. #: For some of these tags, it's because the HTML standard defines
  498. #: an unusual content model for them. I made this list by going
  499. #: through the HTML spec
  500. #: (https://html.spec.whatwg.org/#metadata-content) and looking for
  501. #: "metadata content" elements that can contain strings.
  502. #:
  503. #: The Ruby tags (<rt> and <rp>) are here despite being normal
  504. #: "phrasing content" tags, because the content they contain is
  505. #: qualitatively different from other text in the document, and it
  506. #: can be useful to be able to distinguish it.
  507. #:
  508. #: TODO: Arguably <noscript> could go here but it seems
  509. #: qualitatively different from the other tags.
  510. DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = { # type:ignore
  511. "rt": RubyTextString,
  512. "rp": RubyParenthesisString,
  513. "style": Stylesheet,
  514. "script": Script,
  515. "template": TemplateString,
  516. }
  517. #: The HTML standard defines these attributes as containing a
  518. #: space-separated list of values, not a single value. That is,
  519. #: class="foo bar" means that the 'class' attribute has two values,
  520. #: 'foo' and 'bar', not the single value 'foo bar'. When we
  521. #: encounter one of these attributes, we will parse its value into
  522. #: a list of values if possible. Upon output, the list will be
  523. #: converted back into a string.
  524. DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
  525. "*": {"class", "accesskey", "dropzone"},
  526. "a": {"rel", "rev"},
  527. "link": {"rel", "rev"},
  528. "td": {"headers"},
  529. "th": {"headers"},
  530. "form": {"accept-charset"},
  531. "object": {"archive"},
  532. # These are HTML5 specific, as are *.accesskey and *.dropzone above.
  533. "area": {"rel"},
  534. "icon": {"sizes"},
  535. "iframe": {"sandbox"},
  536. "output": {"for"},
  537. }
  538. #: By default, whitespace inside these HTML tags will be
  539. #: preserved rather than being collapsed.
  540. DEFAULT_PRESERVE_WHITESPACE_TAGS: set[str] = set(["pre", "textarea"])
  541. def set_up_substitutions(self, tag: Tag) -> bool:
  542. """Replace the declared encoding in a <meta> tag with a placeholder,
  543. to be substituted when the tag is output to a string.
  544. An HTML document may come in to Beautiful Soup as one
  545. encoding, but exit in a different encoding, and the <meta> tag
  546. needs to be changed to reflect this.
  547. :return: Whether or not a substitution was performed.
  548. :meta private:
  549. """
  550. # We are only interested in <meta> tags
  551. if tag.name != "meta":
  552. return False
  553. # TODO: This cast will fail in the (very unlikely) scenario
  554. # that the programmer who instantiates the TreeBuilder
  555. # specifies meta['content'] or meta['charset'] as
  556. # cdata_list_attributes.
  557. content: Optional[str] = cast(Optional[str], tag.get("content"))
  558. charset: Optional[str] = cast(Optional[str], tag.get("charset"))
  559. # But we can accommodate meta['http-equiv'] being made a
  560. # cdata_list_attribute (again, very unlikely) without much
  561. # trouble.
  562. http_equiv: List[str] = tag.get_attribute_list("http-equiv")
  563. # We are interested in <meta> tags that say what encoding the
  564. # document was originally in. This means HTML 5-style <meta>
  565. # tags that provide the "charset" attribute. It also means
  566. # HTML 4-style <meta> tags that provide the "content"
  567. # attribute and have "http-equiv" set to "content-type".
  568. #
  569. # In both cases we will replace the value of the appropriate
  570. # attribute with a standin object that can take on any
  571. # encoding.
  572. substituted = False
  573. if charset is not None:
  574. # HTML 5 style:
  575. # <meta charset="utf8">
  576. tag["charset"] = CharsetMetaAttributeValue(charset)
  577. substituted = True
  578. elif content is not None and any(
  579. x.lower() == "content-type" for x in http_equiv
  580. ):
  581. # HTML 4 style:
  582. # <meta http-equiv="content-type" content="text/html; charset=utf8">
  583. tag["content"] = ContentMetaAttributeValue(content)
  584. substituted = True
  585. return substituted
  586. class DetectsXMLParsedAsHTML(object):
  587. """A mixin class for any class (a TreeBuilder, or some class used by a
  588. TreeBuilder) that's in a position to detect whether an XML
  589. document is being incorrectly parsed as HTML, and issue an
  590. appropriate warning.
  591. This requires being able to observe an incoming processing
  592. instruction that might be an XML declaration, and also able to
  593. observe tags as they're opened. If you can't do that for a given
  594. `TreeBuilder`, there's a less reliable implementation based on
  595. examining the raw markup.
  596. """
  597. #: Regular expression for seeing if string markup has an <html> tag.
  598. LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I)
  599. #: Regular expression for seeing if byte markup has an <html> tag.
  600. LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I)
  601. #: The start of an XML document string.
  602. XML_PREFIX: str = "<?xml"
  603. #: The start of an XML document bytestring.
  604. XML_PREFIX_B: bytes = b"<?xml"
  605. # This is typed as str, not `ProcessingInstruction`, because this
  606. # check may be run before any Beautiful Soup objects are created.
  607. _first_processing_instruction: Optional[str] #: :meta private:
  608. _root_tag_name: Optional[str] #: :meta private:
  609. @classmethod
  610. def warn_if_markup_looks_like_xml(
  611. cls, markup: Optional[_RawMarkup], stacklevel: int = 3
  612. ) -> bool:
  613. """Perform a check on some markup to see if it looks like XML
  614. that's not XHTML. If so, issue a warning.
  615. This is much less reliable than doing the check while parsing,
  616. but some of the tree builders can't do that.
  617. :param stacklevel: The stacklevel of the code calling this\
  618. function.
  619. :return: True if the markup looks like non-XHTML XML, False
  620. otherwise.
  621. """
  622. if markup is None:
  623. return False
  624. markup = markup[:500]
  625. if isinstance(markup, bytes):
  626. markup_b: bytes = markup
  627. looks_like_xml = markup_b.startswith(
  628. cls.XML_PREFIX_B
  629. ) and not cls.LOOKS_LIKE_HTML_B.search(markup)
  630. else:
  631. markup_s: str = markup
  632. looks_like_xml = markup_s.startswith(
  633. cls.XML_PREFIX
  634. ) and not cls.LOOKS_LIKE_HTML.search(markup)
  635. if looks_like_xml:
  636. cls._warn(stacklevel=stacklevel + 2)
  637. return True
  638. return False
  639. @classmethod
  640. def _warn(cls, stacklevel: int = 5) -> None:
  641. """Issue a warning about XML being parsed as HTML."""
  642. warnings.warn(
  643. XMLParsedAsHTMLWarning.MESSAGE,
  644. XMLParsedAsHTMLWarning,
  645. stacklevel=stacklevel,
  646. )
  647. def _initialize_xml_detector(self) -> None:
  648. """Call this method before parsing a document."""
  649. self._first_processing_instruction = None
  650. self._root_tag_name = None
  651. def _document_might_be_xml(self, processing_instruction: str) -> None:
  652. """Call this method when encountering an XML declaration, or a
  653. "processing instruction" that might be an XML declaration.
  654. This helps Beautiful Soup detect potential issues later, if
  655. the XML document turns out to be a non-XHTML document that's
  656. being parsed as XML.
  657. """
  658. if (
  659. self._first_processing_instruction is not None
  660. or self._root_tag_name is not None
  661. ):
  662. # The document has already started. Don't bother checking
  663. # anymore.
  664. return
  665. self._first_processing_instruction = processing_instruction
  666. # We won't know until we encounter the first tag whether or
  667. # not this is actually a problem.
  668. def _root_tag_encountered(self, name: str) -> None:
  669. """Call this when you encounter the document's root tag.
  670. This is where we actually check whether an XML document is
  671. being incorrectly parsed as HTML, and issue the warning.
  672. """
  673. if self._root_tag_name is not None:
  674. # This method was incorrectly called multiple times. Do
  675. # nothing.
  676. return
  677. self._root_tag_name = name
  678. if (
  679. name != "html"
  680. and self._first_processing_instruction is not None
  681. and self._first_processing_instruction.lower().startswith("xml ")
  682. ):
  683. # We encountered an XML declaration and then a tag other
  684. # than 'html'. This is a reliable indicator that a
  685. # non-XHTML document is being parsed as XML.
  686. self._warn(stacklevel=10)
  687. def register_treebuilders_from(module: ModuleType) -> None:
  688. """Copy TreeBuilders from the given module into this module."""
  689. this_module = sys.modules[__name__]
  690. for name in module.__all__:
  691. obj = getattr(module, name)
  692. if issubclass(obj, TreeBuilder):
  693. setattr(this_module, name, obj)
  694. this_module.__all__.append(name)
  695. # Register the builder while we're at it.
  696. this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The imports happen here, at the bottom of the module, rather than at
# the top; hence the E402 suppression.
from . import _htmlparser  # noqa: E402

register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib

    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml

    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass