_html5lib.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. # Use of this source code is governed by the MIT license.
  2. __license__ = "MIT"
  3. __all__ = [
  4. "HTML5TreeBuilder",
  5. ]
  6. from typing import (
  7. Any,
  8. cast,
  9. Dict,
  10. Iterable,
  11. Optional,
  12. Sequence,
  13. TYPE_CHECKING,
  14. Tuple,
  15. Union,
  16. )
  17. from typing_extensions import TypeAlias
  18. from bs4._typing import (
  19. _AttributeValue,
  20. _AttributeValues,
  21. _Encoding,
  22. _Encodings,
  23. _NamespaceURL,
  24. _RawMarkup,
  25. )
  26. import warnings
  27. from bs4.builder import (
  28. DetectsXMLParsedAsHTML,
  29. PERMISSIVE,
  30. HTML,
  31. HTML_5,
  32. HTMLTreeBuilder,
  33. )
  34. from bs4.element import (
  35. NamespacedAttribute,
  36. PageElement,
  37. nonwhitespace_re,
  38. )
  39. import html5lib
  40. from html5lib.constants import (
  41. namespaces,
  42. )
  43. from bs4.element import (
  44. Comment,
  45. Doctype,
  46. NavigableString,
  47. Tag,
  48. )
  49. if TYPE_CHECKING:
  50. from bs4 import BeautifulSoup
  51. from html5lib.treebuilders import base as treebuilder_base
  52. class HTML5TreeBuilder(HTMLTreeBuilder):
  53. """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
  54. build a tree.
  55. Note that `HTML5TreeBuilder` does not support some common HTML
  56. `TreeBuilder` features. Some of these features could theoretically
  57. be implemented, but at the very least it's quite difficult,
  58. because html5lib moves the parse tree around as it's being built.
  59. Specifically:
  60. * This `TreeBuilder` doesn't use different subclasses of
  61. `NavigableString` (e.g. `Script`) based on the name of the tag
  62. in which the string was found.
  63. * You can't use a `SoupStrainer` to parse only part of a document.
  64. """
  65. NAME: str = "html5lib"
  66. features: Iterable[str] = [NAME, PERMISSIVE, HTML_5, HTML]
  67. #: html5lib can tell us which line number and position in the
  68. #: original file is the source of an element.
  69. TRACKS_LINE_NUMBERS: bool = True
  70. underlying_builder: "TreeBuilderForHtml5lib" #: :meta private:
  71. user_specified_encoding: Optional[_Encoding]
  72. def prepare_markup(
  73. self,
  74. markup: _RawMarkup,
  75. user_specified_encoding: Optional[_Encoding] = None,
  76. document_declared_encoding: Optional[_Encoding] = None,
  77. exclude_encodings: Optional[_Encodings] = None,
  78. ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
  79. # Store the user-specified encoding for use later on.
  80. self.user_specified_encoding = user_specified_encoding
  81. # document_declared_encoding and exclude_encodings aren't used
  82. # ATM because the html5lib TreeBuilder doesn't use
  83. # UnicodeDammit.
  84. for variable, name in (
  85. (document_declared_encoding, "document_declared_encoding"),
  86. (exclude_encodings, "exclude_encodings"),
  87. ):
  88. if variable:
  89. warnings.warn(
  90. f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
  91. stacklevel=3,
  92. )
  93. # html5lib only parses HTML, so if it's given XML that's worth
  94. # noting.
  95. DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
  96. yield (markup, None, None, False)
  97. # These methods are defined by Beautiful Soup.
  98. def feed(self, markup: _RawMarkup) -> None:
  99. """Run some incoming markup through some parsing process,
  100. populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.
  101. """
  102. if self.soup is not None and self.soup.parse_only is not None:
  103. warnings.warn(
  104. "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
  105. stacklevel=4,
  106. )
  107. # self.underlying_builder is probably None now, but it'll be set
  108. # when html5lib calls self.create_treebuilder().
  109. parser = html5lib.HTMLParser(tree=self.create_treebuilder)
  110. assert self.underlying_builder is not None
  111. self.underlying_builder.parser = parser
  112. extra_kwargs = dict()
  113. if not isinstance(markup, str):
  114. # kwargs, specifically override_encoding, will eventually
  115. # be passed in to html5lib's
  116. # HTMLBinaryInputStream.__init__.
  117. extra_kwargs["override_encoding"] = self.user_specified_encoding
  118. doc = parser.parse(markup, **extra_kwargs) # type:ignore
  119. # Set the character encoding detected by the tokenizer.
  120. if isinstance(markup, str):
  121. # We need to special-case this because html5lib sets
  122. # charEncoding to UTF-8 if it gets Unicode input.
  123. doc.original_encoding = None
  124. else:
  125. original_encoding = parser.tokenizer.stream.charEncoding[0] # type:ignore
  126. # The encoding is an html5lib Encoding object. We want to
  127. # use a string for compatibility with other tree builders.
  128. original_encoding = original_encoding.name
  129. doc.original_encoding = original_encoding
  130. self.underlying_builder.parser = None
  131. def create_treebuilder(
  132. self, namespaceHTMLElements: bool
  133. ) -> "TreeBuilderForHtml5lib":
  134. """Called by html5lib to instantiate the kind of class it
  135. calls a 'TreeBuilder'.
  136. :param namespaceHTMLElements: Whether or not to namespace HTML elements.
  137. :meta private:
  138. """
  139. self.underlying_builder = TreeBuilderForHtml5lib(
  140. namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
  141. )
  142. return self.underlying_builder
  143. def test_fragment_to_document(self, fragment: str) -> str:
  144. """See `TreeBuilder`."""
  145. return "<html><head></head><body>%s</body></html>" % fragment
  146. class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
  147. soup: "BeautifulSoup" #: :meta private:
  148. parser: Optional[html5lib.HTMLParser] #: :meta private:
  149. def __init__(
  150. self,
  151. namespaceHTMLElements: bool,
  152. soup: Optional["BeautifulSoup"] = None,
  153. store_line_numbers: bool = True,
  154. **kwargs: Any,
  155. ):
  156. if soup:
  157. self.soup = soup
  158. else:
  159. warnings.warn(
  160. "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
  161. DeprecationWarning,
  162. stacklevel=2,
  163. )
  164. from bs4 import BeautifulSoup
  165. # TODO: Why is the parser 'html.parser' here? Using
  166. # html5lib doesn't cause an infinite loop and is more
  167. # accurate. Best to get rid of this entire section, I think.
  168. self.soup = BeautifulSoup(
  169. "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
  170. )
  171. # TODO: What are **kwargs exactly? Should they be passed in
  172. # here in addition to/instead of being passed to the BeautifulSoup
  173. # constructor?
  174. super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
  175. # This will be set later to a real html5lib HTMLParser object,
  176. # which we can use to track the current line number.
  177. self.parser = None
  178. self.store_line_numbers = store_line_numbers
  179. def documentClass(self) -> "Element":
  180. self.soup.reset()
  181. return Element(self.soup, self.soup, None)
  182. def insertDoctype(self, token: Dict[str, Any]) -> None:
  183. name: str = cast(str, token["name"])
  184. publicId: Optional[str] = cast(Optional[str], token["publicId"])
  185. systemId: Optional[str] = cast(Optional[str], token["systemId"])
  186. doctype = Doctype.for_name_and_ids(name, publicId, systemId)
  187. self.soup.object_was_parsed(doctype)
  188. def elementClass(self, name: str, namespace: str) -> "Element":
  189. sourceline: Optional[int] = None
  190. sourcepos: Optional[int] = None
  191. if self.parser is not None and self.store_line_numbers:
  192. # This represents the point immediately after the end of the
  193. # tag. We don't know when the tag started, but we do know
  194. # where it ended -- the character just before this one.
  195. sourceline, sourcepos = self.parser.tokenizer.stream.position() # type:ignore
  196. assert sourcepos is not None
  197. sourcepos = sourcepos - 1
  198. tag = self.soup.new_tag(
  199. name, namespace, sourceline=sourceline, sourcepos=sourcepos
  200. )
  201. return Element(tag, self.soup, namespace)
  202. def commentClass(self, data: str) -> "TextNode":
  203. return TextNode(Comment(data), self.soup)
  204. def fragmentClass(self) -> "Element":
  205. """This is only used by html5lib HTMLParser.parseFragment(),
  206. which is never used by Beautiful Soup, only by the html5lib
  207. unit tests. Since we don't currently hook into those tests,
  208. the implementation is left blank.
  209. """
  210. raise NotImplementedError()
  211. def getFragment(self) -> "Element":
  212. """This is only used by the html5lib unit tests. Since we
  213. don't currently hook into those tests, the implementation is
  214. left blank.
  215. """
  216. raise NotImplementedError()
  217. def appendChild(self, node: "Element") -> None:
  218. # TODO: This code is not covered by the BS4 tests, and
  219. # apparently not triggered by the html5lib test suite either.
  220. # But it doesn't seem test-specific and there are calls to it
  221. # (or a method with the same name) all over html5lib, so I'm
  222. # leaving the implementation in place rather than replacing it
  223. # with NotImplementedError()
  224. self.soup.append(node.element)
  225. def getDocument(self) -> "BeautifulSoup":
  226. return self.soup
  227. def testSerializer(self, node: "Element") -> None:
  228. """This is only used by the html5lib unit tests. Since we
  229. don't currently hook into those tests, the implementation is
  230. left blank.
  231. """
  232. raise NotImplementedError()
  233. class AttrList(object):
  234. """Represents a Tag's attributes in a way compatible with html5lib."""
  235. element: Tag
  236. attrs: _AttributeValues
  237. def __init__(self, element: Tag):
  238. self.element = element
  239. self.attrs = dict(self.element.attrs)
  240. def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
  241. return list(self.attrs.items()).__iter__()
  242. def __setitem__(self, name: str, value: _AttributeValue) -> None:
  243. # If this attribute is a multi-valued attribute for this element,
  244. # turn its value into a list.
  245. list_attr = self.element.cdata_list_attributes or {}
  246. if name in list_attr.get("*", []) or (
  247. self.element.name in list_attr
  248. and name in list_attr.get(self.element.name, [])
  249. ):
  250. # A node that is being cloned may have already undergone
  251. # this procedure. Check for this and skip it.
  252. if not isinstance(value, list):
  253. assert isinstance(value, str)
  254. value = self.element.attribute_value_list_class(
  255. nonwhitespace_re.findall(value)
  256. )
  257. self.element[name] = value
  258. def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
  259. return list(self.attrs.items())
  260. def keys(self) -> Iterable[str]:
  261. return list(self.attrs.keys())
  262. def __len__(self) -> int:
  263. return len(self.attrs)
  264. def __getitem__(self, name: str) -> _AttributeValue:
  265. return self.attrs[name]
  266. def __contains__(self, name: str) -> bool:
  267. return name in list(self.attrs.keys())
  268. class BeautifulSoupNode(treebuilder_base.Node):
  269. # A node can correspond to _either_ a Tag _or_ a NavigableString.
  270. tag: Optional[Tag]
  271. string: Optional[NavigableString]
  272. soup: "BeautifulSoup"
  273. namespace: Optional[_NamespaceURL]
  274. @property
  275. def element(self) -> PageElement:
  276. assert self.tag is not None or self.string is not None
  277. if self.tag is not None:
  278. return self.tag
  279. else:
  280. assert self.string is not None
  281. return self.string
  282. @property
  283. def nodeType(self) -> int:
  284. """Return the html5lib constant corresponding to the type of
  285. the underlying DOM object.
  286. NOTE: This property is only accessed by the html5lib test
  287. suite, not by Beautiful Soup proper.
  288. """
  289. raise NotImplementedError()
  290. # TODO-TYPING: typeshed stubs are incorrect about this;
  291. # cloneNode returns a new Node, not None.
  292. def cloneNode(self) -> treebuilder_base.Node: # type:ignore
  293. raise NotImplementedError()
  294. class Element(BeautifulSoupNode):
  295. namespace: Optional[_NamespaceURL]
  296. def __init__(
  297. self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
  298. ):
  299. self.tag = element
  300. self.string = None
  301. self.soup = soup
  302. self.namespace = namespace
  303. treebuilder_base.Node.__init__(self, element.name)
  304. def appendChild(self, node: "BeautifulSoupNode") -> None:
  305. string_child: Optional[NavigableString] = None
  306. child: PageElement
  307. if type(node.string) is NavigableString:
  308. # We check for NavigableString *only* because we want to avoid
  309. # joining PreformattedStrings, such as Comments, with nearby strings.
  310. string_child = child = node.string
  311. else:
  312. child = node.element
  313. node.parent = self
  314. if (
  315. child is not None
  316. and child.parent is not None
  317. and not isinstance(child, str)
  318. ):
  319. node.element.extract()
  320. if (
  321. string_child is not None
  322. and self.tag is not None and self.tag.contents
  323. and type(self.tag.contents[-1]) is NavigableString
  324. ):
  325. # We are appending a string onto another string.
  326. # TODO This has O(n^2) performance, for input like
  327. # "a</a>a</a>a</a>..."
  328. old_element = self.tag.contents[-1]
  329. new_element = self.soup.new_string(old_element + string_child)
  330. old_element.replace_with(new_element)
  331. self.soup._most_recent_element = new_element
  332. else:
  333. if isinstance(node, str):
  334. # Create a brand new NavigableString from this string.
  335. child = self.soup.new_string(node)
  336. # Tell Beautiful Soup to act as if it parsed this element
  337. # immediately after the parent's last descendant. (Or
  338. # immediately after the parent, if it has no children.)
  339. if self.tag is not None and self.tag.contents:
  340. most_recent_element = self.tag._last_descendant(False)
  341. elif self.element.next_element is not None:
  342. # Something from further ahead in the parse tree is
  343. # being inserted into this earlier element. This is
  344. # very annoying because it means an expensive search
  345. # for the last element in the tree.
  346. most_recent_element = self.soup._last_descendant()
  347. else:
  348. most_recent_element = self.element
  349. self.soup.object_was_parsed(
  350. child, parent=self.tag, most_recent_element=most_recent_element
  351. )
  352. def getAttributes(self) -> AttrList:
  353. assert self.tag is not None
  354. return AttrList(self.tag)
  355. # An HTML5lib attribute name may either be a single string,
  356. # or a tuple (namespace, name).
  357. _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
  358. # Now we can define the type this method accepts as a dictionary
  359. # mapping those attribute names to single string values.
  360. _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]
  361. def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
  362. assert self.tag is not None
  363. if attributes is not None and len(attributes) > 0:
  364. # Replace any namespaced attributes with
  365. # NamespacedAttribute objects.
  366. for name, value in list(attributes.items()):
  367. if isinstance(name, tuple):
  368. new_name = NamespacedAttribute(*name)
  369. del attributes[name]
  370. attributes[new_name] = value
  371. # We can now cast attributes to the type of Dict
  372. # used by Beautiful Soup.
  373. normalized_attributes = cast(_AttributeValues, attributes)
  374. # Values for tags like 'class' came in as single strings;
  375. # replace them with lists of strings as appropriate.
  376. self.soup.builder._replace_cdata_list_attribute_values(
  377. self.name, normalized_attributes
  378. )
  379. # Then set the attributes on the Tag associated with this
  380. # BeautifulSoupNode.
  381. for name, value_or_values in list(normalized_attributes.items()):
  382. self.tag[name] = value_or_values
  383. # The attributes may contain variables that need substitution.
  384. # Call set_up_substitutions manually.
  385. #
  386. # The Tag constructor called this method when the Tag was created,
  387. # but we just set/changed the attributes, so call it again.
  388. self.soup.builder.set_up_substitutions(self.tag)
  389. attributes = property(getAttributes, setAttributes)
  390. def insertText(
  391. self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
  392. ) -> None:
  393. text = TextNode(self.soup.new_string(data), self.soup)
  394. if insertBefore:
  395. self.insertBefore(text, insertBefore)
  396. else:
  397. self.appendChild(text)
  398. def insertBefore(
  399. self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
  400. ) -> None:
  401. assert self.tag is not None
  402. index = self.tag.index(refNode.element)
  403. if (
  404. type(node.element) is NavigableString
  405. and self.tag.contents
  406. and type(self.tag.contents[index - 1]) is NavigableString
  407. ):
  408. # (See comments in appendChild)
  409. old_node = self.tag.contents[index - 1]
  410. assert type(old_node) is NavigableString
  411. new_str = self.soup.new_string(old_node + node.element)
  412. old_node.replace_with(new_str)
  413. else:
  414. self.tag.insert(index, node.element)
  415. node.parent = self
  416. def removeChild(self, node: "Element") -> None:
  417. node.element.extract()
  418. def reparentChildren(self, newParent: "Element") -> None:
  419. """Move all of this tag's children into another tag."""
  420. # print("MOVE", self.element.contents)
  421. # print("FROM", self.element)
  422. # print("TO", new_parent.element)
  423. element = self.tag
  424. assert element is not None
  425. new_parent_element = newParent.tag
  426. assert new_parent_element is not None
  427. # Determine what this tag's next_element will be once all the children
  428. # are removed.
  429. final_next_element = element.next_sibling
  430. new_parents_last_descendant = new_parent_element._last_descendant(False, False)
  431. if len(new_parent_element.contents) > 0:
  432. # The new parent already contains children. We will be
  433. # appending this tag's children to the end.
  434. # We can make this assertion since we know new_parent has
  435. # children.
  436. assert new_parents_last_descendant is not None
  437. new_parents_last_child = new_parent_element.contents[-1]
  438. new_parents_last_descendant_next_element = (
  439. new_parents_last_descendant.next_element
  440. )
  441. else:
  442. # The new parent contains no children.
  443. new_parents_last_child = None
  444. new_parents_last_descendant_next_element = new_parent_element.next_element
  445. to_append = element.contents
  446. if len(to_append) > 0:
  447. # Set the first child's previous_element and previous_sibling
  448. # to elements within the new parent
  449. first_child = to_append[0]
  450. if new_parents_last_descendant is not None:
  451. first_child.previous_element = new_parents_last_descendant
  452. else:
  453. first_child.previous_element = new_parent_element
  454. first_child.previous_sibling = new_parents_last_child
  455. if new_parents_last_descendant is not None:
  456. new_parents_last_descendant.next_element = first_child
  457. else:
  458. new_parent_element.next_element = first_child
  459. if new_parents_last_child is not None:
  460. new_parents_last_child.next_sibling = first_child
  461. # Find the very last element being moved. It is now the
  462. # parent's last descendant. It has no .next_sibling and
  463. # its .next_element is whatever the previous last
  464. # descendant had.
  465. last_childs_last_descendant = to_append[-1]._last_descendant(
  466. is_initialized=False, accept_self=True
  467. )
  468. # Since we passed accept_self=True into _last_descendant,
  469. # there's no possibility that the result is None.
  470. assert last_childs_last_descendant is not None
  471. last_childs_last_descendant.next_element = (
  472. new_parents_last_descendant_next_element
  473. )
  474. if new_parents_last_descendant_next_element is not None:
  475. # TODO-COVERAGE: This code has no test coverage and
  476. # I'm not sure how to get html5lib to go through this
  477. # path, but it's just the other side of the previous
  478. # line.
  479. new_parents_last_descendant_next_element.previous_element = (
  480. last_childs_last_descendant
  481. )
  482. last_childs_last_descendant.next_sibling = None
  483. for child in to_append:
  484. child.parent = new_parent_element
  485. new_parent_element.contents.append(child)
  486. # Now that this element has no children, change its .next_element.
  487. element.contents = []
  488. element.next_element = final_next_element
  489. # print("DONE WITH MOVE")
  490. # print("FROM", self.element)
  491. # print("TO", new_parent_element)
  492. # TODO-TYPING: typeshed stubs are incorrect about this;
  493. # hasContent returns a boolean, not None.
  494. def hasContent(self) -> bool: # type:ignore
  495. return self.tag is None or len(self.tag.contents) > 0
  496. # TODO-TYPING: typeshed stubs are incorrect about this;
  497. # cloneNode returns a new Node, not None.
  498. def cloneNode(self) -> treebuilder_base.Node: # type:ignore
  499. assert self.tag is not None
  500. tag = self.soup.new_tag(self.tag.name, self.namespace)
  501. node = Element(tag, self.soup, self.namespace)
  502. for key, value in self.attributes:
  503. node.attributes[key] = value
  504. return node
  505. def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
  506. if self.namespace is None:
  507. return namespaces["html"], self.name
  508. else:
  509. return self.namespace, self.name
  510. nameTuple = property(getNameTuple)
  511. class TextNode(BeautifulSoupNode):
  512. def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
  513. treebuilder_base.Node.__init__(self, None)
  514. self.tag = None
  515. self.string = element
  516. self.soup = soup