_htmlparser.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. # encoding: utf-8
  2. """Use the HTMLParser library to parse HTML files that aren't too bad."""
  3. from __future__ import annotations
  4. # Use of this source code is governed by the MIT license.
  5. __license__ = "MIT"
  6. __all__ = [
  7. "HTMLParserTreeBuilder",
  8. ]
  9. from html.parser import HTMLParser
  10. from typing import (
  11. Any,
  12. Callable,
  13. cast,
  14. Dict,
  15. Iterable,
  16. List,
  17. Optional,
  18. TYPE_CHECKING,
  19. Tuple,
  20. Type,
  21. Union,
  22. )
  23. from bs4.element import (
  24. AttributeDict,
  25. CData,
  26. Comment,
  27. Declaration,
  28. Doctype,
  29. ProcessingInstruction,
  30. )
  31. from bs4.dammit import EntitySubstitution, UnicodeDammit
  32. from bs4.builder import (
  33. DetectsXMLParsedAsHTML,
  34. HTML,
  35. HTMLTreeBuilder,
  36. STRICT,
  37. )
  38. from bs4.exceptions import ParserRejectedMarkup
  39. if TYPE_CHECKING:
  40. from bs4 import BeautifulSoup
  41. from bs4.element import NavigableString
  42. from bs4._typing import (
  43. _Encoding,
  44. _Encodings,
  45. _RawMarkup,
  46. )
#: Feature name for the standard-library parser; used below as
#: HTMLParserTreeBuilder.NAME and as one of its advertised features.
HTMLPARSER = "html.parser"

#: Signature of a callable accepted as ``on_duplicate_attribute``. It is
#: invoked as ``handler(attributes_so_far, attribute_name, newest_value)``
#: and its return value is ignored.
_DuplicateAttributeHandler = Callable[[Dict[str, str], str, str], None]
  49. class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
  50. #: Constant to handle duplicate attributes by ignoring later values
  51. #: and keeping the earlier ones.
  52. REPLACE: str = "replace"
  53. #: Constant to handle duplicate attributes by replacing earlier values
  54. #: with later ones.
  55. IGNORE: str = "ignore"
  56. """A subclass of the Python standard library's HTMLParser class, which
  57. listens for HTMLParser events and translates them into calls
  58. to Beautiful Soup's tree construction API.
  59. :param on_duplicate_attribute: A strategy for what to do if a
  60. tag includes the same attribute more than once. Accepted
  61. values are: REPLACE (replace earlier values with later
  62. ones, the default), IGNORE (keep the earliest value
  63. encountered), or a callable. A callable must take three
  64. arguments: the dictionary of attributes already processed,
  65. the name of the duplicate attribute, and the most recent value
  66. encountered.
  67. """
  68. def __init__(
  69. self,
  70. soup: BeautifulSoup,
  71. *args: Any,
  72. on_duplicate_attribute: Union[str, _DuplicateAttributeHandler] = REPLACE,
  73. **kwargs: Any,
  74. ):
  75. self.soup = soup
  76. self.on_duplicate_attribute = on_duplicate_attribute
  77. self.attribute_dict_class = soup.builder.attribute_dict_class
  78. HTMLParser.__init__(self, *args, **kwargs)
  79. # Keep a list of empty-element tags that were encountered
  80. # without an explicit closing tag. If we encounter a closing tag
  81. # of this type, we'll associate it with one of those entries.
  82. #
  83. # This isn't a stack because we don't care about the
  84. # order. It's a list of closing tags we've already handled and
  85. # will ignore, assuming they ever show up.
  86. self.already_closed_empty_element = []
  87. self._initialize_xml_detector()
  88. on_duplicate_attribute: Union[str, _DuplicateAttributeHandler]
  89. already_closed_empty_element: List[str]
  90. soup: BeautifulSoup
  91. def error(self, message: str) -> None:
  92. # NOTE: This method is required so long as Python 3.9 is
  93. # supported. The corresponding code is removed from HTMLParser
  94. # in 3.5, but not removed from ParserBase until 3.10.
  95. # https://github.com/python/cpython/issues/76025
  96. #
  97. # The original implementation turned the error into a warning,
  98. # but in every case I discovered, this made HTMLParser
  99. # immediately crash with an error message that was less
  100. # helpful than the warning. The new implementation makes it
  101. # more clear that html.parser just can't parse this
  102. # markup. The 3.10 implementation does the same, though it
  103. # raises AssertionError rather than calling a method. (We
  104. # catch this error and wrap it in a ParserRejectedMarkup.)
  105. raise ParserRejectedMarkup(message)
  106. def handle_startendtag(
  107. self, tag: str, attrs: List[Tuple[str, Optional[str]]]
  108. ) -> None:
  109. """Handle an incoming empty-element tag.
  110. html.parser only calls this method when the markup looks like
  111. <tag/>.
  112. """
  113. # `handle_empty_element` tells handle_starttag not to close the tag
  114. # just because its name matches a known empty-element tag. We
  115. # know that this is an empty-element tag, and we want to call
  116. # handle_endtag ourselves.
  117. self.handle_starttag(tag, attrs, handle_empty_element=False)
  118. self.handle_endtag(tag)
  119. def handle_starttag(
  120. self,
  121. tag: str,
  122. attrs: List[Tuple[str, Optional[str]]],
  123. handle_empty_element: bool = True,
  124. ) -> None:
  125. """Handle an opening tag, e.g. '<tag>'
  126. :param handle_empty_element: True if this tag is known to be
  127. an empty-element tag (i.e. there is not expected to be any
  128. closing tag).
  129. """
  130. # TODO: handle namespaces here?
  131. attr_dict: AttributeDict = self.attribute_dict_class()
  132. for key, value in attrs:
  133. # Change None attribute values to the empty string
  134. # for consistency with the other tree builders.
  135. if value is None:
  136. value = ""
  137. if key in attr_dict:
  138. # A single attribute shows up multiple times in this
  139. # tag. How to handle it depends on the
  140. # on_duplicate_attribute setting.
  141. on_dupe = self.on_duplicate_attribute
  142. if on_dupe == self.IGNORE:
  143. pass
  144. elif on_dupe in (None, self.REPLACE):
  145. attr_dict[key] = value
  146. else:
  147. on_dupe = cast(_DuplicateAttributeHandler, on_dupe)
  148. on_dupe(attr_dict, key, value)
  149. else:
  150. attr_dict[key] = value
  151. # print("START", tag)
  152. sourceline: Optional[int]
  153. sourcepos: Optional[int]
  154. if self.soup.builder.store_line_numbers:
  155. sourceline, sourcepos = self.getpos()
  156. else:
  157. sourceline = sourcepos = None
  158. tagObj = self.soup.handle_starttag(
  159. tag, None, None, attr_dict, sourceline=sourceline, sourcepos=sourcepos
  160. )
  161. if tagObj is not None and tagObj.is_empty_element and handle_empty_element:
  162. # Unlike other parsers, html.parser doesn't send separate end tag
  163. # events for empty-element tags. (It's handled in
  164. # handle_startendtag, but only if the original markup looked like
  165. # <tag/>.)
  166. #
  167. # So we need to call handle_endtag() ourselves. Since we
  168. # know the start event is identical to the end event, we
  169. # don't want handle_endtag() to cross off any previous end
  170. # events for tags of this name.
  171. self.handle_endtag(tag, check_already_closed=False)
  172. # But we might encounter an explicit closing tag for this tag
  173. # later on. If so, we want to ignore it.
  174. self.already_closed_empty_element.append(tag)
  175. if self._root_tag_name is None:
  176. self._root_tag_encountered(tag)
  177. def handle_endtag(self, tag: str, check_already_closed: bool = True) -> None:
  178. """Handle a closing tag, e.g. '</tag>'
  179. :param tag: A tag name.
  180. :param check_already_closed: True if this tag is expected to
  181. be the closing portion of an empty-element tag,
  182. e.g. '<tag></tag>'.
  183. """
  184. # print("END", tag)
  185. if check_already_closed and tag in self.already_closed_empty_element:
  186. # This is a redundant end tag for an empty-element tag.
  187. # We've already called handle_endtag() for it, so just
  188. # check it off the list.
  189. # print("ALREADY CLOSED", tag)
  190. self.already_closed_empty_element.remove(tag)
  191. else:
  192. self.soup.handle_endtag(tag)
  193. def handle_data(self, data: str) -> None:
  194. """Handle some textual data that shows up between tags."""
  195. self.soup.handle_data(data)
  196. def handle_charref(self, name: str) -> None:
  197. """Handle a numeric character reference by converting it to the
  198. corresponding Unicode character and treating it as textual
  199. data.
  200. :param name: Character number, possibly in hexadecimal.
  201. """
  202. # TODO: This was originally a workaround for a bug in
  203. # HTMLParser. (http://bugs.python.org/issue13633) The bug has
  204. # been fixed, but removing this code still makes some
  205. # Beautiful Soup tests fail. This needs investigation.
  206. real_name:int
  207. if name.startswith("x"):
  208. real_name = int(name.lstrip("x"), 16)
  209. elif name.startswith("X"):
  210. real_name = int(name.lstrip("X"), 16)
  211. else:
  212. real_name = int(name)
  213. data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
  214. if replacement_added:
  215. self.soup.contains_replacement_characters = True
  216. self.handle_data(data)
  217. def handle_entityref(self, name: str) -> None:
  218. """Handle a named entity reference by converting it to the
  219. corresponding Unicode character(s) and treating it as textual
  220. data.
  221. :param name: Name of the entity reference.
  222. """
  223. character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
  224. if character is not None:
  225. data = character
  226. else:
  227. # If this were XML, it would be ambiguous whether "&foo"
  228. # was an character entity reference with a missing
  229. # semicolon or the literal string "&foo". Since this is
  230. # HTML, we have a complete list of all character entity references,
  231. # and this one wasn't found, so assume it's the literal string "&foo".
  232. data = "&%s" % name
  233. self.handle_data(data)
  234. def handle_comment(self, data: str) -> None:
  235. """Handle an HTML comment.
  236. :param data: The text of the comment.
  237. """
  238. self.soup.endData()
  239. self.soup.handle_data(data)
  240. self.soup.endData(Comment)
  241. def handle_decl(self, decl: str) -> None:
  242. """Handle a DOCTYPE declaration.
  243. :param data: The text of the declaration.
  244. """
  245. self.soup.endData()
  246. decl = decl[len("DOCTYPE ") :]
  247. self.soup.handle_data(decl)
  248. self.soup.endData(Doctype)
  249. def unknown_decl(self, data: str) -> None:
  250. """Handle a declaration of unknown type -- probably a CDATA block.
  251. :param data: The text of the declaration.
  252. """
  253. cls: Type[NavigableString]
  254. if data.upper().startswith("CDATA["):
  255. cls = CData
  256. data = data[len("CDATA[") :]
  257. else:
  258. cls = Declaration
  259. self.soup.endData()
  260. self.soup.handle_data(data)
  261. self.soup.endData(cls)
  262. def handle_pi(self, data: str) -> None:
  263. """Handle a processing instruction.
  264. :param data: The text of the instruction.
  265. """
  266. self.soup.endData()
  267. self.soup.handle_data(data)
  268. self._document_might_be_xml(data)
  269. self.soup.endData(ProcessingInstruction)
  270. class HTMLParserTreeBuilder(HTMLTreeBuilder):
  271. """A Beautiful soup `bs4.builder.TreeBuilder` that uses the
  272. :py:class:`html.parser.HTMLParser` parser, found in the Python
  273. standard library.
  274. """
  275. is_xml: bool = False
  276. picklable: bool = True
  277. NAME: str = HTMLPARSER
  278. features: Iterable[str] = [NAME, HTML, STRICT]
  279. parser_args: Tuple[Iterable[Any], Dict[str, Any]]
  280. #: The html.parser knows which line number and position in the
  281. #: original file is the source of an element.
  282. TRACKS_LINE_NUMBERS: bool = True
  283. def __init__(
  284. self,
  285. parser_args: Optional[Iterable[Any]] = None,
  286. parser_kwargs: Optional[Dict[str, Any]] = None,
  287. **kwargs: Any,
  288. ):
  289. """Constructor.
  290. :param parser_args: Positional arguments to pass into
  291. the BeautifulSoupHTMLParser constructor, once it's
  292. invoked.
  293. :param parser_kwargs: Keyword arguments to pass into
  294. the BeautifulSoupHTMLParser constructor, once it's
  295. invoked.
  296. :param kwargs: Keyword arguments for the superclass constructor.
  297. """
  298. # Some keyword arguments will be pulled out of kwargs and placed
  299. # into parser_kwargs.
  300. extra_parser_kwargs = dict()
  301. for arg in ("on_duplicate_attribute",):
  302. if arg in kwargs:
  303. value = kwargs.pop(arg)
  304. extra_parser_kwargs[arg] = value
  305. super(HTMLParserTreeBuilder, self).__init__(**kwargs)
  306. parser_args = parser_args or []
  307. parser_kwargs = parser_kwargs or {}
  308. parser_kwargs.update(extra_parser_kwargs)
  309. parser_kwargs["convert_charrefs"] = False
  310. self.parser_args = (parser_args, parser_kwargs)
  311. def prepare_markup(
  312. self,
  313. markup: _RawMarkup,
  314. user_specified_encoding: Optional[_Encoding] = None,
  315. document_declared_encoding: Optional[_Encoding] = None,
  316. exclude_encodings: Optional[_Encodings] = None,
  317. ) -> Iterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]]:
  318. """Run any preliminary steps necessary to make incoming markup
  319. acceptable to the parser.
  320. :param markup: Some markup -- probably a bytestring.
  321. :param user_specified_encoding: The user asked to try this encoding.
  322. :param document_declared_encoding: The markup itself claims to be
  323. in this encoding.
  324. :param exclude_encodings: The user asked _not_ to try any of
  325. these encodings.
  326. :yield: A series of 4-tuples: (markup, encoding, declared encoding,
  327. has undergone character replacement)
  328. Each 4-tuple represents a strategy for parsing the document.
  329. This TreeBuilder uses Unicode, Dammit to convert the markup
  330. into Unicode, so the ``markup`` element of the tuple will
  331. always be a string.
  332. """
  333. if isinstance(markup, str):
  334. # Parse Unicode as-is.
  335. yield (markup, None, None, False)
  336. return
  337. # Ask UnicodeDammit to sniff the most likely encoding.
  338. known_definite_encodings: List[_Encoding] = []
  339. if user_specified_encoding:
  340. # This was provided by the end-user; treat it as a known
  341. # definite encoding per the algorithm laid out in the
  342. # HTML5 spec. (See the EncodingDetector class for
  343. # details.)
  344. known_definite_encodings.append(user_specified_encoding)
  345. user_encodings: List[_Encoding] = []
  346. if document_declared_encoding:
  347. # This was found in the document; treat it as a slightly
  348. # lower-priority user encoding.
  349. user_encodings.append(document_declared_encoding)
  350. dammit = UnicodeDammit(
  351. markup,
  352. known_definite_encodings=known_definite_encodings,
  353. user_encodings=user_encodings,
  354. is_html=True,
  355. exclude_encodings=exclude_encodings,
  356. )
  357. if dammit.unicode_markup is None:
  358. # In every case I've seen, Unicode, Dammit is able to
  359. # convert the markup into Unicode, even if it needs to use
  360. # REPLACEMENT CHARACTER. But there is a code path that
  361. # could result in unicode_markup being None, and
  362. # HTMLParser can only parse Unicode, so here we handle
  363. # that code path.
  364. raise ParserRejectedMarkup(
  365. "Could not convert input to Unicode, and html.parser will not accept bytestrings."
  366. )
  367. else:
  368. yield (
  369. dammit.unicode_markup,
  370. dammit.original_encoding,
  371. dammit.declared_html_encoding,
  372. dammit.contains_replacement_characters,
  373. )
  374. def feed(self, markup: _RawMarkup, _parser_class:type[BeautifulSoupHTMLParser] =BeautifulSoupHTMLParser) -> None:
  375. """
  376. :param markup: The markup to feed into the parser.
  377. :param _parser_class: An HTMLParser subclass to use. This is only intended for use in unit tests.
  378. """
  379. args, kwargs = self.parser_args
  380. # HTMLParser.feed will only handle str, but
  381. # BeautifulSoup.markup is allowed to be _RawMarkup, because
  382. # it's set by the yield value of
  383. # TreeBuilder.prepare_markup. Fortunately,
  384. # HTMLParserTreeBuilder.prepare_markup always yields a str
  385. # (UnicodeDammit.unicode_markup).
  386. assert isinstance(markup, str)
  387. # We know BeautifulSoup calls TreeBuilder.initialize_soup
  388. # before calling feed(), so we can assume self.soup
  389. # is set.
  390. assert self.soup is not None
  391. parser = _parser_class(self.soup, *args, **kwargs)
  392. try:
  393. parser.feed(markup)
  394. parser.close()
  395. except AssertionError as e:
  396. # html.parser raises AssertionError in rare cases to
  397. # indicate a fatal problem with the markup, especially
  398. # when there's an error in the doctype declaration.
  399. raise ParserRejectedMarkup(e)
  400. parser.already_closed_empty_element = []