html5lib_shim.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757
  1. # flake8: noqa
  2. """
  3. Shim module between Bleach and html5lib. This makes it easier to upgrade the
  4. html5lib library without having to change a lot of code.
  5. """
  6. import re
  7. import string
  8. import warnings
  9. # ignore html5lib deprecation warnings to use bleach; we are bleach
  10. # apply before we import submodules that import html5lib
  11. warnings.filterwarnings(
  12. "ignore",
  13. message="html5lib's sanitizer is deprecated",
  14. category=DeprecationWarning,
  15. module="bleach._vendor.html5lib",
  16. )
  17. from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file
  18. HTMLParser,
  19. getTreeWalker,
  20. )
  21. from bleach._vendor.html5lib import (
  22. constants,
  23. ) # noqa: E402 module level import not at top of file
  24. from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file
  25. namespaces,
  26. prefixes,
  27. )
  28. from bleach._vendor.html5lib.constants import (
  29. _ReparseException as ReparseException,
  30. ) # noqa: E402 module level import not at top of file
  31. from bleach._vendor.html5lib.filters.base import (
  32. Filter,
  33. ) # noqa: E402 module level import not at top of file
  34. from bleach._vendor.html5lib.filters.sanitizer import (
  35. allowed_protocols,
  36. allowed_css_properties,
  37. allowed_svg_properties,
  38. attr_val_is_uri,
  39. svg_attr_val_allows_ref,
  40. svg_allow_local_href,
  41. ) # noqa: E402 module level import not at top of file
  42. from bleach._vendor.html5lib.filters.sanitizer import (
  43. Filter as SanitizerFilter,
  44. ) # noqa: E402 module level import not at top of file
  45. from bleach._vendor.html5lib._inputstream import (
  46. HTMLInputStream,
  47. ) # noqa: E402 module level import not at top of file
  48. from bleach._vendor.html5lib.serializer import (
  49. escape,
  50. HTMLSerializer,
  51. ) # noqa: E402 module level import not at top of file
  52. from bleach._vendor.html5lib._tokenizer import (
  53. attributeMap,
  54. HTMLTokenizer,
  55. ) # noqa: E402 module level import not at top of file
  56. from bleach._vendor.html5lib._trie import (
  57. Trie,
  58. ) # noqa: E402 module level import not at top of file
  59. #: Map of entity name to expanded entity
  60. ENTITIES = constants.entities
  61. #: Trie of html entity string -> character representation
  62. ENTITIES_TRIE = Trie(ENTITIES)
  63. #: Token type constants--these never change
  64. TAG_TOKEN_TYPES = {
  65. constants.tokenTypes["StartTag"],
  66. constants.tokenTypes["EndTag"],
  67. constants.tokenTypes["EmptyTag"],
  68. }
  69. TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
  70. TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
  71. TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
  72. TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]
  73. #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
  74. #: https://html.spec.whatwg.org/multipage/indices.html#elements-3
  75. HTML_TAGS = frozenset(
  76. (
  77. "a",
  78. "abbr",
  79. "address",
  80. "area",
  81. "article",
  82. "aside",
  83. "audio",
  84. "b",
  85. "base",
  86. "bdi",
  87. "bdo",
  88. "blockquote",
  89. "body",
  90. "br",
  91. "button",
  92. "canvas",
  93. "caption",
  94. "cite",
  95. "code",
  96. "col",
  97. "colgroup",
  98. "data",
  99. "datalist",
  100. "dd",
  101. "del",
  102. "details",
  103. "dfn",
  104. "dialog",
  105. "div",
  106. "dl",
  107. "dt",
  108. "em",
  109. "embed",
  110. "fieldset",
  111. "figcaption",
  112. "figure",
  113. "footer",
  114. "form",
  115. "h1",
  116. "h2",
  117. "h3",
  118. "h4",
  119. "h5",
  120. "h6",
  121. "head",
  122. "header",
  123. "hgroup",
  124. "hr",
  125. "html",
  126. "i",
  127. "iframe",
  128. "img",
  129. "input",
  130. "ins",
  131. "kbd",
  132. "keygen",
  133. "label",
  134. "legend",
  135. "li",
  136. "link",
  137. "map",
  138. "mark",
  139. "menu",
  140. "meta",
  141. "meter",
  142. "nav",
  143. "noscript",
  144. "object",
  145. "ol",
  146. "optgroup",
  147. "option",
  148. "output",
  149. "p",
  150. "param",
  151. "picture",
  152. "pre",
  153. "progress",
  154. "q",
  155. "rp",
  156. "rt",
  157. "ruby",
  158. "s",
  159. "samp",
  160. "script",
  161. "section",
  162. "select",
  163. "slot",
  164. "small",
  165. "source",
  166. "span",
  167. "strong",
  168. "style",
  169. "sub",
  170. "summary",
  171. "sup",
  172. "table",
  173. "tbody",
  174. "td",
  175. "template",
  176. "textarea",
  177. "tfoot",
  178. "th",
  179. "thead",
  180. "time",
  181. "title",
  182. "tr",
  183. "track",
  184. "u",
  185. "ul",
  186. "var",
  187. "video",
  188. "wbr",
  189. )
  190. )
  191. #: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
  192. #: from mozilla on 2019.07.11
  193. #: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
  194. HTML_TAGS_BLOCK_LEVEL = frozenset(
  195. (
  196. "address",
  197. "article",
  198. "aside",
  199. "blockquote",
  200. "details",
  201. "dialog",
  202. "dd",
  203. "div",
  204. "dl",
  205. "dt",
  206. "fieldset",
  207. "figcaption",
  208. "figure",
  209. "footer",
  210. "form",
  211. "h1",
  212. "h2",
  213. "h3",
  214. "h4",
  215. "h5",
  216. "h6",
  217. "header",
  218. "hgroup",
  219. "hr",
  220. "li",
  221. "main",
  222. "nav",
  223. "ol",
  224. "p",
  225. "pre",
  226. "section",
  227. "table",
  228. "ul",
  229. )
  230. )
  231. class InputStreamWithMemory:
  232. """Wraps an HTMLInputStream to remember characters since last <
  233. This wraps existing HTMLInputStream classes to keep track of the stream
  234. since the last < which marked an open tag state.
  235. """
  236. def __init__(self, inner_stream):
  237. self._inner_stream = inner_stream
  238. self.reset = self._inner_stream.reset
  239. self.position = self._inner_stream.position
  240. self._buffer = []
  241. @property
  242. def errors(self):
  243. return self._inner_stream.errors
  244. @property
  245. def charEncoding(self):
  246. return self._inner_stream.charEncoding
  247. @property
  248. def changeEncoding(self):
  249. return self._inner_stream.changeEncoding
  250. def char(self):
  251. c = self._inner_stream.char()
  252. # char() can return None if EOF, so ignore that
  253. if c:
  254. self._buffer.append(c)
  255. return c
  256. def charsUntil(self, characters, opposite=False):
  257. chars = self._inner_stream.charsUntil(characters, opposite=opposite)
  258. self._buffer.extend(list(chars))
  259. return chars
  260. def unget(self, char):
  261. if self._buffer:
  262. self._buffer.pop(-1)
  263. return self._inner_stream.unget(char)
  264. def get_tag(self):
  265. """Returns the stream history since last '<'
  266. Since the buffer starts at the last '<' as as seen by tagOpenState(),
  267. we know that everything from that point to when this method is called
  268. is the "tag" that is being tokenized.
  269. """
  270. return "".join(self._buffer)
  271. def start_tag(self):
  272. """Resets stream history to just '<'
  273. This gets called by tagOpenState() which marks a '<' that denotes an
  274. open tag. Any time we see that, we reset the buffer.
  275. """
  276. self._buffer = ["<"]
  277. class BleachHTMLTokenizer(HTMLTokenizer):
  278. """Tokenizer that doesn't consume character entities"""
  279. def __init__(self, consume_entities=False, **kwargs):
  280. super().__init__(**kwargs)
  281. self.consume_entities = consume_entities
  282. # Wrap the stream with one that remembers the history
  283. self.stream = InputStreamWithMemory(self.stream)
  284. # Remember the last token emitted; needed for block element spacing
  285. self.emitted_last_token = None
  286. def __iter__(self):
  287. last_error_token = None
  288. for token in super().__iter__():
  289. if last_error_token is not None:
  290. if (
  291. last_error_token["data"] == "invalid-character-in-attribute-name"
  292. and token["type"] in TAG_TOKEN_TYPES
  293. and token.get("data")
  294. ):
  295. # token["data"] is an html5lib attributeMap
  296. # (OrderedDict 3.7+ and dict otherwise)
  297. # of attr name to attr value
  298. #
  299. # Remove attribute names that have ', " or < in them
  300. # because those characters are invalid for attribute names.
  301. token["data"] = attributeMap(
  302. (attr_name, attr_value)
  303. for attr_name, attr_value in token["data"].items()
  304. if (
  305. '"' not in attr_name
  306. and "'" not in attr_name
  307. and "<" not in attr_name
  308. )
  309. )
  310. last_error_token = None
  311. yield token
  312. elif (
  313. last_error_token["data"] == "expected-closing-tag-but-got-char"
  314. and self.parser.tags is not None
  315. and token["data"].lower().strip() not in self.parser.tags
  316. ):
  317. # We've got either a malformed tag or a pseudo-tag or
  318. # something that html5lib wants to turn into a malformed
  319. # comment which Bleach clean() will drop so we interfere
  320. # with the token stream to handle it more correctly.
  321. #
  322. # If this is an allowed tag, it's malformed and we just let
  323. # the html5lib parser deal with it--we don't enter into this
  324. # block.
  325. #
  326. # If this is not an allowed tag, then we convert it to
  327. # characters and it'll get escaped in the sanitizer.
  328. token["data"] = self.stream.get_tag()
  329. token["type"] = TAG_TOKEN_TYPE_CHARACTERS
  330. last_error_token = None
  331. yield token
  332. elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
  333. # If the token is a parse error, then let the last_error_token
  334. # go, and make token the new last_error_token
  335. yield last_error_token
  336. last_error_token = token
  337. else:
  338. yield last_error_token
  339. yield token
  340. last_error_token = None
  341. continue
  342. # If the token is a ParseError, we hold on to it so we can get the
  343. # next token and potentially fix it.
  344. if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
  345. last_error_token = token
  346. continue
  347. yield token
  348. if last_error_token:
  349. if last_error_token["data"] == "eof-in-tag-name":
  350. # Handle the case where the text being parsed ends with <
  351. # followed by a series of characters. It's treated as a tag
  352. # name that abruptly ends, but we should treat that like
  353. # character data
  354. yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
  355. elif last_error_token["data"] in (
  356. "duplicate-attribute",
  357. "eof-in-attribute-name",
  358. "eof-in-attribute-value-no-quotes",
  359. "expected-end-of-tag-but-got-eof",
  360. ):
  361. # Handle the case where the text being parsed ends with <
  362. # followed by characters and then space and then:
  363. #
  364. # * more characters
  365. # * more characters repeated with a space between (e.g. "abc abc")
  366. # * more characters and then a space and then an EOF (e.g. "abc def ")
  367. #
  368. # These cases are treated as a tag name followed by an
  369. # attribute that abruptly ends, but we should treat that like
  370. # character data instead.
  371. yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
  372. else:
  373. yield last_error_token
  374. def consumeEntity(self, allowedChar=None, fromAttribute=False):
  375. # If this tokenizer is set to consume entities, then we can let the
  376. # superclass do its thing.
  377. if self.consume_entities:
  378. return super().consumeEntity(allowedChar, fromAttribute)
  379. # If this tokenizer is set to not consume entities, then we don't want
  380. # to consume and convert them, so this overrides the html5lib tokenizer's
  381. # consumeEntity so that it's now a no-op.
  382. #
  383. # However, when that gets called, it's consumed an &, so we put that back in
  384. # the stream.
  385. if fromAttribute:
  386. self.currentToken["data"][-1][1] += "&"
  387. else:
  388. self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})
  389. def tagOpenState(self):
  390. # This state marks a < that is either a StartTag, EndTag, EmptyTag,
  391. # or ParseError. In all cases, we want to drop any stream history
  392. # we've collected so far and we do that by calling start_tag() on
  393. # the input stream wrapper.
  394. self.stream.start_tag()
  395. return super().tagOpenState()
  396. def emitCurrentToken(self):
  397. token = self.currentToken
  398. if (
  399. self.parser.tags is not None
  400. and token["type"] in TAG_TOKEN_TYPES
  401. and token["name"].lower() not in self.parser.tags
  402. ):
  403. # If this is a start/end/empty tag for a tag that's not in our
  404. # allowed list, then it gets stripped or escaped. In both of these
  405. # cases it gets converted to a Characters token.
  406. if self.parser.strip:
  407. if (
  408. self.emitted_last_token
  409. and token["type"] == TAG_TOKEN_TYPE_START
  410. and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
  411. ):
  412. # If this is a block level tag we're stripping, we drop it
  413. # for a newline because that's what a browser would parse
  414. # it as
  415. new_data = "\n"
  416. else:
  417. # For all other things being stripped, we throw in an empty
  418. # string token
  419. new_data = ""
  420. else:
  421. # If we're escaping the token, we want to escape the exact
  422. # original string. Since tokenizing also normalizes data
  423. # and this is a tag-like thing, we've lost some information.
  424. # So we go back through the stream to get the original
  425. # string and use that.
  426. new_data = self.stream.get_tag()
  427. new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}
  428. self.currentToken = self.emitted_last_token = new_token
  429. self.tokenQueue.append(new_token)
  430. self.state = self.dataState
  431. return
  432. self.emitted_last_token = self.currentToken
  433. super().emitCurrentToken()
  434. class BleachHTMLParser(HTMLParser):
  435. """Parser that uses BleachHTMLTokenizer"""
  436. def __init__(self, tags, strip, consume_entities, **kwargs):
  437. """
  438. :arg tags: set of allowed tags--everything else is either stripped or
  439. escaped; if None, then this doesn't look at tags at all
  440. :arg strip: whether to strip disallowed tags (True) or escape them (False);
  441. if tags=None, then this doesn't have any effect
  442. :arg consume_entities: whether to consume entities (default behavior) or
  443. leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)
  444. """
  445. self.tags = (
  446. frozenset((tag.lower() for tag in tags)) if tags is not None else None
  447. )
  448. self.strip = strip
  449. self.consume_entities = consume_entities
  450. super().__init__(**kwargs)
  451. def _parse(
  452. self, stream, innerHTML=False, container="div", scripting=True, **kwargs
  453. ):
  454. # set scripting=True to parse <noscript> as though JS is enabled to
  455. # match the expected context in browsers
  456. #
  457. # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
  458. #
  459. # Override HTMLParser so we can swap out the tokenizer for our own.
  460. self.innerHTMLMode = innerHTML
  461. self.container = container
  462. self.scripting = scripting
  463. self.tokenizer = BleachHTMLTokenizer(
  464. stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
  465. )
  466. self.reset()
  467. try:
  468. self.mainLoop()
  469. except ReparseException:
  470. self.reset()
  471. self.mainLoop()
  472. def convert_entity(value):
  473. """Convert an entity (minus the & and ; part) into what it represents
  474. This handles numeric, hex, and text entities.
  475. :arg value: the string (minus the ``&`` and ``;`` part) to convert
  476. :returns: unicode character or None if it's an ambiguous ampersand that
  477. doesn't match a character entity
  478. """
  479. if value[0] == "#":
  480. if len(value) < 2:
  481. return None
  482. if value[1] in ("x", "X"):
  483. # hex-encoded code point
  484. int_as_string, base = value[2:], 16
  485. else:
  486. # decimal code point
  487. int_as_string, base = value[1:], 10
  488. if int_as_string == "":
  489. return None
  490. code_point = int(int_as_string, base)
  491. if 0 < code_point < 0x110000:
  492. return chr(code_point)
  493. else:
  494. return None
  495. return ENTITIES.get(value, None)
  496. def convert_entities(text):
  497. """Converts all found entities in the text
  498. :arg text: the text to convert entities in
  499. :returns: unicode text with converted entities
  500. """
  501. if "&" not in text:
  502. return text
  503. new_text = []
  504. for part in next_possible_entity(text):
  505. if not part:
  506. continue
  507. if part.startswith("&"):
  508. entity = match_entity(part)
  509. if entity is not None:
  510. converted = convert_entity(entity)
  511. # If it's not an ambiguous ampersand, then replace with the
  512. # unicode character. Otherwise, we leave the entity in.
  513. if converted is not None:
  514. new_text.append(converted)
  515. remainder = part[len(entity) + 2 :]
  516. if part:
  517. new_text.append(remainder)
  518. continue
  519. new_text.append(part)
  520. return "".join(new_text)
  521. def match_entity(stream):
  522. """Returns first entity in stream or None if no entity exists
  523. Note: For Bleach purposes, entities must start with a "&" and end with a
  524. ";". This ignores ambiguous character entities that have no ";" at the end.
  525. :arg stream: the character stream
  526. :returns: the entity string without "&" or ";" if it's a valid character
  527. entity; ``None`` otherwise
  528. """
  529. # Nix the & at the beginning
  530. if stream[0] != "&":
  531. raise ValueError('Stream should begin with "&"')
  532. stream = stream[1:]
  533. stream = list(stream)
  534. possible_entity = ""
  535. end_characters = "<&=;" + string.whitespace
  536. # Handle number entities
  537. if stream and stream[0] == "#":
  538. possible_entity = "#"
  539. stream.pop(0)
  540. if stream and stream[0] in ("x", "X"):
  541. allowed = "0123456789abcdefABCDEF"
  542. possible_entity += stream.pop(0)
  543. else:
  544. allowed = "0123456789"
  545. # FIXME(willkg): Do we want to make sure these are valid number
  546. # entities? This doesn't do that currently.
  547. while stream and stream[0] not in end_characters:
  548. c = stream.pop(0)
  549. if c not in allowed:
  550. break
  551. possible_entity += c
  552. if possible_entity and stream and stream[0] == ";":
  553. return possible_entity
  554. return None
  555. # Handle character entities
  556. while stream and stream[0] not in end_characters:
  557. c = stream.pop(0)
  558. possible_entity += c
  559. if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
  560. # If it's not a prefix, then it's not an entity and we're
  561. # out
  562. return None
  563. if possible_entity and stream and stream[0] == ";":
  564. return possible_entity
  565. return None
  566. AMP_SPLIT_RE = re.compile("(&)")
  567. def next_possible_entity(text):
  568. """Takes a text and generates a list of possible entities
  569. :arg text: the text to look at
  570. :returns: generator where each part (except the first) starts with an
  571. "&"
  572. """
  573. for i, part in enumerate(AMP_SPLIT_RE.split(text)):
  574. if i == 0:
  575. yield part
  576. elif i % 2 == 0:
  577. yield "&" + part
  578. class BleachHTMLSerializer(HTMLSerializer):
  579. """HTMLSerializer that undoes & -> &amp; in attributes and sets
  580. escape_rcdata to True
  581. """
  582. # per the HTMLSerializer.__init__ docstring:
  583. #
  584. # Whether to escape characters that need to be
  585. # escaped within normal elements within rcdata elements such as
  586. # style.
  587. #
  588. escape_rcdata = True
  589. def escape_base_amp(self, stoken):
  590. """Escapes just bare & in HTML attribute values"""
  591. # First, undo escaping of &. We need to do this because html5lib's
  592. # HTMLSerializer expected the tokenizer to consume all the character
  593. # entities and convert them to their respective characters, but the
  594. # BleachHTMLTokenizer doesn't do that. For example, this fixes
  595. # &amp;entity; back to &entity; .
  596. stoken = stoken.replace("&amp;", "&")
  597. # However, we do want all bare & that are not marking character
  598. # entities to be changed to &amp;, so let's do that carefully here.
  599. for part in next_possible_entity(stoken):
  600. if not part:
  601. continue
  602. if part.startswith("&"):
  603. entity = match_entity(part)
  604. # Only leave entities in that are not ambiguous. If they're
  605. # ambiguous, then we escape the ampersand.
  606. if entity is not None and convert_entity(entity) is not None:
  607. yield f"&{entity};"
  608. # Length of the entity plus 2--one for & at the beginning
  609. # and one for ; at the end
  610. part = part[len(entity) + 2 :]
  611. if part:
  612. yield part
  613. continue
  614. yield part.replace("&", "&amp;")
  615. def serialize(self, treewalker, encoding=None):
  616. """Wrap HTMLSerializer.serialize and conver & to &amp; in attribute values
  617. Note that this converts & to &amp; in attribute values where the & isn't
  618. already part of an unambiguous character entity.
  619. """
  620. in_tag = False
  621. after_equals = False
  622. for stoken in super().serialize(treewalker, encoding):
  623. if in_tag:
  624. if stoken == ">":
  625. in_tag = False
  626. elif after_equals:
  627. if stoken != '"':
  628. yield from self.escape_base_amp(stoken)
  629. after_equals = False
  630. continue
  631. elif stoken == "=":
  632. after_equals = True
  633. yield stoken
  634. else:
  635. if stoken.startswith("<"):
  636. in_tag = True
  637. yield stoken