sanitizer.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638
  1. from itertools import chain
  2. import re
  3. import warnings
  4. from xml.sax.saxutils import unescape
  5. from bleach import html5lib_shim
  6. from bleach import parse_shim
  7. #: Set of allowed tags
  8. ALLOWED_TAGS = frozenset(
  9. (
  10. "a",
  11. "abbr",
  12. "acronym",
  13. "b",
  14. "blockquote",
  15. "code",
  16. "em",
  17. "i",
  18. "li",
  19. "ol",
  20. "strong",
  21. "ul",
  22. )
  23. )
  24. #: Map of allowed attributes by tag
  25. ALLOWED_ATTRIBUTES = {
  26. "a": ["href", "title"],
  27. "abbr": ["title"],
  28. "acronym": ["title"],
  29. }
  30. #: Set of allowed protocols
  31. ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto"))
  32. #: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
  33. INVISIBLE_CHARACTERS = "".join(
  34. [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
  35. )
  36. #: Regexp for characters that are invisible
  37. INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
  38. #: String to replace invisible characters with. This can be a character, a
  39. #: string, or even a function that takes a Python re matchobj
  40. INVISIBLE_REPLACEMENT_CHAR = "?"
  41. class NoCssSanitizerWarning(UserWarning):
  42. pass
  43. class Cleaner:
  44. """Cleaner for cleaning HTML fragments of malicious content
  45. This cleaner is a security-focused function whose sole purpose is to remove
  46. malicious content from a string such that it can be displayed as content in
  47. a web page.
  48. To use::
  49. from bleach.sanitizer import Cleaner
  50. cleaner = Cleaner()
  51. for text in all_the_yucky_things:
  52. sanitized = cleaner.clean(text)
  53. .. Note::
  54. This cleaner is not designed to use to transform content to be used in
  55. non-web-page contexts.
  56. .. Warning::
  57. This cleaner is not thread-safe--the html parser has internal state.
  58. Create a separate cleaner per thread!
  59. """
  60. def __init__(
  61. self,
  62. tags=ALLOWED_TAGS,
  63. attributes=ALLOWED_ATTRIBUTES,
  64. protocols=ALLOWED_PROTOCOLS,
  65. strip=False,
  66. strip_comments=True,
  67. filters=None,
  68. css_sanitizer=None,
  69. ):
  70. """Initializes a Cleaner
  71. :arg set tags: set of allowed tags; defaults to
  72. ``bleach.sanitizer.ALLOWED_TAGS``
  73. :arg dict attributes: allowed attributes; can be a callable, list or dict;
  74. defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
  75. :arg set protocols: set of allowed protocols for links; defaults
  76. to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
  77. :arg bool strip: whether or not to strip disallowed elements
  78. :arg bool strip_comments: whether or not to strip HTML comments
  79. :arg list filters: list of html5lib Filter classes to pass streamed content through
  80. .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
  81. .. Warning::
  82. Using filters changes the output of ``bleach.Cleaner.clean``.
  83. Make sure the way the filters change the output are secure.
  84. :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
  85. sanitizing style attribute values and style text; defaults to None
  86. """
  87. self.tags = tags
  88. self.attributes = attributes
  89. self.protocols = protocols
  90. self.strip = strip
  91. self.strip_comments = strip_comments
  92. self.filters = filters or []
  93. self.css_sanitizer = css_sanitizer
  94. self.parser = html5lib_shim.BleachHTMLParser(
  95. tags=self.tags,
  96. strip=self.strip,
  97. consume_entities=False,
  98. namespaceHTMLElements=False,
  99. )
  100. self.walker = html5lib_shim.getTreeWalker("etree")
  101. self.serializer = html5lib_shim.BleachHTMLSerializer(
  102. quote_attr_values="always",
  103. omit_optional_tags=False,
  104. escape_lt_in_attrs=True,
  105. # We want to leave entities as they are without escaping or
  106. # resolving or expanding
  107. resolve_entities=False,
  108. # Bleach has its own sanitizer, so don't use the html5lib one
  109. sanitize=False,
  110. # clean preserves attr order
  111. alphabetical_attributes=False,
  112. )
  113. if css_sanitizer is None:
  114. # FIXME(willkg): this doesn't handle when attributes or an
  115. # attributes value is a callable
  116. attributes_values = []
  117. if isinstance(attributes, list):
  118. attributes_values = attributes
  119. elif isinstance(attributes, dict):
  120. attributes_values = []
  121. for values in attributes.values():
  122. if isinstance(values, (list, tuple)):
  123. attributes_values.extend(values)
  124. if "style" in attributes_values:
  125. warnings.warn(
  126. "'style' attribute specified, but css_sanitizer not set.",
  127. category=NoCssSanitizerWarning,
  128. )
  129. def clean(self, text):
  130. """Cleans text and returns sanitized result as unicode
  131. :arg str text: text to be cleaned
  132. :returns: sanitized text as unicode
  133. :raises TypeError: if ``text`` is not a text type
  134. """
  135. if not isinstance(text, str):
  136. message = (
  137. f"argument cannot be of {text.__class__.__name__!r} type, "
  138. + "must be of text type"
  139. )
  140. raise TypeError(message)
  141. if not text:
  142. return ""
  143. dom = self.parser.parseFragment(text)
  144. filtered = BleachSanitizerFilter(
  145. source=self.walker(dom),
  146. allowed_tags=self.tags,
  147. attributes=self.attributes,
  148. strip_disallowed_tags=self.strip,
  149. strip_html_comments=self.strip_comments,
  150. css_sanitizer=self.css_sanitizer,
  151. allowed_protocols=self.protocols,
  152. )
  153. # Apply any filters after the BleachSanitizerFilter
  154. for filter_class in self.filters:
  155. filtered = filter_class(source=filtered)
  156. return self.serializer.render(filtered)
  157. def attribute_filter_factory(attributes):
  158. """Generates attribute filter function for the given attributes value
  159. The attributes value can take one of several shapes. This returns a filter
  160. function appropriate to the attributes value. One nice thing about this is
  161. that there's less if/then shenanigans in the ``allow_token`` method.
  162. """
  163. if callable(attributes):
  164. return attributes
  165. if isinstance(attributes, dict):
  166. def _attr_filter(tag, attr, value):
  167. if tag in attributes:
  168. attr_val = attributes[tag]
  169. if callable(attr_val):
  170. return attr_val(tag, attr, value)
  171. if attr in attr_val:
  172. return True
  173. if "*" in attributes:
  174. attr_val = attributes["*"]
  175. if callable(attr_val):
  176. return attr_val(tag, attr, value)
  177. return attr in attr_val
  178. return False
  179. return _attr_filter
  180. if isinstance(attributes, list):
  181. def _attr_filter(tag, attr, value):
  182. return attr in attributes
  183. return _attr_filter
  184. raise ValueError("attributes needs to be a callable, a list or a dict")
  185. class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
  186. """html5lib Filter that sanitizes text
  187. This filter can be used anywhere html5lib filters can be used.
  188. """
  189. def __init__(
  190. self,
  191. source,
  192. allowed_tags=ALLOWED_TAGS,
  193. attributes=ALLOWED_ATTRIBUTES,
  194. allowed_protocols=ALLOWED_PROTOCOLS,
  195. attr_val_is_uri=html5lib_shim.attr_val_is_uri,
  196. svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref,
  197. svg_allow_local_href=html5lib_shim.svg_allow_local_href,
  198. strip_disallowed_tags=False,
  199. strip_html_comments=True,
  200. css_sanitizer=None,
  201. ):
  202. """Creates a BleachSanitizerFilter instance
  203. :arg source: html5lib TreeWalker stream as an html5lib TreeWalker
  204. :arg set allowed_tags: set of allowed tags; defaults to
  205. ``bleach.sanitizer.ALLOWED_TAGS``
  206. :arg dict attributes: allowed attributes; can be a callable, list or dict;
  207. defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
  208. :arg set allowed_protocols: set of allowed protocols for links; defaults
  209. to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
  210. :arg attr_val_is_uri: set of attributes that have URI values
  211. :arg svg_attr_val_allows_ref: set of SVG attributes that can have
  212. references
  213. :arg svg_allow_local_href: set of SVG elements that can have local
  214. hrefs
  215. :arg bool strip_disallowed_tags: whether or not to strip disallowed
  216. tags
  217. :arg bool strip_html_comments: whether or not to strip HTML comments
  218. :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
  219. sanitizing style attribute values and style text; defaults to None
  220. """
  221. # NOTE(willkg): This is the superclass of
  222. # html5lib.filters.sanitizer.Filter. We call this directly skipping the
  223. # __init__ for html5lib.filters.sanitizer.Filter because that does
  224. # things we don't need to do and kicks up the deprecation warning for
  225. # using Sanitizer.
  226. html5lib_shim.Filter.__init__(self, source)
  227. self.allowed_tags = frozenset(allowed_tags)
  228. self.allowed_protocols = frozenset(allowed_protocols)
  229. self.attr_filter = attribute_filter_factory(attributes)
  230. self.strip_disallowed_tags = strip_disallowed_tags
  231. self.strip_html_comments = strip_html_comments
  232. self.attr_val_is_uri = attr_val_is_uri
  233. self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
  234. self.css_sanitizer = css_sanitizer
  235. self.svg_allow_local_href = svg_allow_local_href
  236. def sanitize_stream(self, token_iterator):
  237. for token in token_iterator:
  238. ret = self.sanitize_token(token)
  239. if not ret:
  240. continue
  241. if isinstance(ret, list):
  242. yield from ret
  243. else:
  244. yield ret
  245. def merge_characters(self, token_iterator):
  246. """Merge consecutive Characters tokens in a stream"""
  247. characters_buffer = []
  248. for token in token_iterator:
  249. if characters_buffer:
  250. if token["type"] == "Characters":
  251. characters_buffer.append(token)
  252. continue
  253. else:
  254. # Merge all the characters tokens together into one and then
  255. # operate on it.
  256. new_token = {
  257. "data": "".join(
  258. [char_token["data"] for char_token in characters_buffer]
  259. ),
  260. "type": "Characters",
  261. }
  262. characters_buffer = []
  263. yield new_token
  264. elif token["type"] == "Characters":
  265. characters_buffer.append(token)
  266. continue
  267. yield token
  268. new_token = {
  269. "data": "".join([char_token["data"] for char_token in characters_buffer]),
  270. "type": "Characters",
  271. }
  272. yield new_token
  273. def __iter__(self):
  274. return self.merge_characters(
  275. self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
  276. )
  277. def sanitize_token(self, token):
  278. """Sanitize a token either by HTML-encoding or dropping.
  279. Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
  280. ['attribute', 'pairs'], 'tag': callable}.
  281. Here callable is a function with two arguments of attribute name and
  282. value. It should return true of false.
  283. Also gives the option to strip tags instead of encoding.
  284. :arg dict token: token to sanitize
  285. :returns: token or list of tokens
  286. """
  287. token_type = token["type"]
  288. if token_type in ["StartTag", "EndTag", "EmptyTag"]:
  289. if token["name"] in self.allowed_tags:
  290. return self.allow_token(token)
  291. elif self.strip_disallowed_tags:
  292. return None
  293. else:
  294. return self.disallowed_token(token)
  295. elif token_type == "Comment":
  296. if not self.strip_html_comments:
  297. # call lxml.sax.saxutils to escape &, <, and > in addition to " and '
  298. token["data"] = html5lib_shim.escape(
  299. token["data"], entities={'"': "&quot;", "'": "&#x27;"}
  300. )
  301. return token
  302. else:
  303. return None
  304. elif token_type == "Characters":
  305. return self.sanitize_characters(token)
  306. else:
  307. return token
  308. def sanitize_characters(self, token):
  309. """Handles Characters tokens
  310. Our overridden tokenizer doesn't do anything with entities. However,
  311. that means that the serializer will convert all ``&`` in Characters
  312. tokens to ``&amp;``.
  313. Since we don't want that, we extract entities here and convert them to
  314. Entity tokens so the serializer will let them be.
  315. :arg token: the Characters token to work on
  316. :returns: a list of tokens
  317. """
  318. data = token.get("data", "")
  319. if not data:
  320. return token
  321. data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
  322. token["data"] = data
  323. # If there isn't a & in the data, we can return now
  324. if "&" not in data:
  325. return token
  326. new_tokens = []
  327. # For each possible entity that starts with a "&", we try to extract an
  328. # actual entity and re-tokenize accordingly
  329. for part in html5lib_shim.next_possible_entity(data):
  330. if not part:
  331. continue
  332. if part.startswith("&"):
  333. entity = html5lib_shim.match_entity(part)
  334. if entity is not None:
  335. if entity == "amp":
  336. # LinkifyFilter can't match urls across token boundaries
  337. # which is problematic with &amp; since that shows up in
  338. # querystrings all the time. This special-cases &amp;
  339. # and converts it to a & and sticks it in as a
  340. # Characters token. It'll get merged with surrounding
  341. # tokens in the BleachSanitizerfilter.__iter__ and
  342. # escaped in the serializer.
  343. new_tokens.append({"type": "Characters", "data": "&"})
  344. else:
  345. new_tokens.append({"type": "Entity", "name": entity})
  346. # Length of the entity plus 2--one for & at the beginning
  347. # and one for ; at the end
  348. remainder = part[len(entity) + 2 :]
  349. if remainder:
  350. new_tokens.append({"type": "Characters", "data": remainder})
  351. continue
  352. new_tokens.append({"type": "Characters", "data": part})
  353. return new_tokens
  354. def sanitize_uri_value(self, value, allowed_protocols):
  355. """Checks a uri value to see if it's allowed
  356. :arg value: the uri value to sanitize
  357. :arg allowed_protocols: set of allowed protocols
  358. :returns: allowed value or None
  359. """
  360. # NOTE(willkg): This transforms the value into a normalized one that's
  361. # easier to match and verify, but shouldn't get returned since it's
  362. # vastly different than the original value.
  363. # Convert all character entities in the value
  364. normalized_uri = html5lib_shim.convert_entities(value)
  365. # Nix backtick, space characters, and control characters
  366. normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)
  367. # Remove REPLACEMENT characters
  368. normalized_uri = normalized_uri.replace("\ufffd", "")
  369. # Lowercase it--this breaks the value, but makes it easier to match
  370. # against
  371. normalized_uri = normalized_uri.lower()
  372. try:
  373. # Drop attributes with uri values that have protocols that aren't
  374. # allowed
  375. parsed = parse_shim.urlparse(normalized_uri)
  376. except ValueError:
  377. # URI is impossible to parse, therefore it's not allowed
  378. return None
  379. if parsed.scheme:
  380. # If urlparse found a scheme, check that
  381. if parsed.scheme in allowed_protocols:
  382. return value
  383. else:
  384. # Allow uris that are just an anchor
  385. if normalized_uri.startswith("#"):
  386. return value
  387. # Handle protocols that urlparse doesn't recognize like "myprotocol"
  388. if (
  389. ":" in normalized_uri
  390. and normalized_uri.split(":")[0] in allowed_protocols
  391. ):
  392. return value
  393. # If there's no protocol/scheme specified, then assume it's "http" or
  394. # "https" and see if that's allowed
  395. if "http" in allowed_protocols or "https" in allowed_protocols:
  396. return value
  397. return None
  398. def allow_token(self, token):
  399. """Handles the case where we're allowing the tag"""
  400. if "data" in token:
  401. # Loop through all the attributes and drop the ones that are not
  402. # allowed, are unsafe or break other rules. Additionally, fix
  403. # attribute values that need fixing.
  404. #
  405. # At the end of this loop, we have the final set of attributes
  406. # we're keeping.
  407. attrs = {}
  408. for namespaced_name, val in token["data"].items():
  409. namespace, name = namespaced_name
  410. # Drop attributes that are not explicitly allowed
  411. #
  412. # NOTE(willkg): We pass in the attribute name--not a namespaced
  413. # name.
  414. if not self.attr_filter(token["name"], name, val):
  415. continue
  416. # Drop attributes with uri values that use a disallowed protocol
  417. # Sanitize attributes with uri values
  418. if namespaced_name in self.attr_val_is_uri:
  419. new_value = self.sanitize_uri_value(val, self.allowed_protocols)
  420. if new_value is None:
  421. continue
  422. val = new_value
  423. # Drop values in svg attrs with non-local IRIs
  424. if namespaced_name in self.svg_attr_val_allows_ref:
  425. new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
  426. new_val = new_val.strip()
  427. if not new_val:
  428. continue
  429. else:
  430. # Replace the val with the unescaped version because
  431. # it's a iri
  432. val = new_val
  433. # Drop href and xlink:href attr for svg elements with non-local IRIs
  434. if (None, token["name"]) in self.svg_allow_local_href:
  435. if namespaced_name in [
  436. (None, "href"),
  437. (html5lib_shim.namespaces["xlink"], "href"),
  438. ]:
  439. if re.search(r"^\s*[^#\s]", val):
  440. continue
  441. # If it's a style attribute, sanitize it
  442. if namespaced_name == (None, "style"):
  443. if self.css_sanitizer:
  444. val = self.css_sanitizer.sanitize_css(val)
  445. else:
  446. # FIXME(willkg): if style is allowed, but no
  447. # css_sanitizer was set up, then this is probably a
  448. # mistake and we should raise an error here
  449. #
  450. # For now, we're going to set the value to "" because
  451. # there was no sanitizer set
  452. val = ""
  453. # At this point, we want to keep the attribute, so add it in
  454. attrs[namespaced_name] = val
  455. token["data"] = attrs
  456. return token
  457. def disallowed_token(self, token):
  458. token_type = token["type"]
  459. if token_type == "EndTag":
  460. token["data"] = f"</{token['name']}>"
  461. elif token["data"]:
  462. assert token_type in ("StartTag", "EmptyTag")
  463. attrs = []
  464. for (ns, name), v in token["data"].items():
  465. # If we end up with a namespace, but no name, switch them so we
  466. # have a valid name to use.
  467. if ns and not name:
  468. ns, name = name, ns
  469. # Figure out namespaced name if the namespace is appropriate
  470. # and exists; if the ns isn't in prefixes, then drop it.
  471. if ns is None or ns not in html5lib_shim.prefixes:
  472. namespaced_name = name
  473. else:
  474. namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}"
  475. # NOTE(willkg): HTMLSerializer escapes attribute values
  476. # already, so if we do it here (like HTMLSerializer does),
  477. # then we end up double-escaping.
  478. attrs.append(f' {namespaced_name}="{v}"')
  479. token["data"] = f"<{token['name']}{''.join(attrs)}>"
  480. else:
  481. token["data"] = f"<{token['name']}>"
  482. if token.get("selfClosing"):
  483. token["data"] = f"{token['data'][:-1]}/>"
  484. token["type"] = "Characters"
  485. del token["name"]
  486. return token