escape.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. #
  2. # Copyright 2009 Facebook
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  5. # not use this file except in compliance with the License. You may obtain
  6. # a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. # License for the specific language governing permissions and limitations
  14. # under the License.
  15. """Escaping/unescaping methods for HTML, JSON, URLs, and others.
  16. Also includes a few other miscellaneous string manipulation functions that
  17. have crept in over time.
  18. Many functions in this module have near-equivalents in the standard library
  19. (the differences mainly relate to handling of bytes and unicode strings,
  20. and were more relevant in Python 2). In new code, the standard library
  21. functions are encouraged instead of this module where applicable. See the
  22. docstrings on each function for details.
  23. """
  24. import html
  25. import json
  26. import re
  27. import urllib.parse
  28. from tornado.util import unicode_type
  29. import typing
  30. from typing import Union, Any, Optional, Dict, List, Callable
  31. def xhtml_escape(value: Union[str, bytes]) -> str:
  32. """Escapes a string so it is valid within HTML or XML.
  33. Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
  34. When used in attribute values the escaped strings must be enclosed
  35. in quotes.
  36. Equivalent to `html.escape` except that this function always returns
  37. type `str` while `html.escape` returns `bytes` if its input is `bytes`.
  38. .. versionchanged:: 3.2
  39. Added the single quote to the list of escaped characters.
  40. .. versionchanged:: 6.4
  41. Now simply wraps `html.escape`. This is equivalent to the old behavior
  42. except that single quotes are now escaped as ``&#x27;`` instead of
  43. ``&#39;`` and performance may be different.
  44. """
  45. return html.escape(to_unicode(value))
  46. def xhtml_unescape(value: Union[str, bytes]) -> str:
  47. """Un-escapes an XML-escaped string.
  48. Equivalent to `html.unescape` except that this function always returns
  49. type `str` while `html.unescape` returns `bytes` if its input is `bytes`.
  50. .. versionchanged:: 6.4
  51. Now simply wraps `html.unescape`. This changes behavior for some inputs
  52. as required by the HTML 5 specification
  53. https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
  54. Some invalid inputs such as surrogates now raise an error, and numeric
  55. references to certain ISO-8859-1 characters are now handled correctly.
  56. """
  57. return html.unescape(to_unicode(value))
  58. # The fact that json_encode wraps json.dumps is an implementation detail.
  59. # Please see https://github.com/tornadoweb/tornado/pull/706
  60. # before sending a pull request that adds **kwargs to this function.
  61. def json_encode(value: Any) -> str:
  62. """JSON-encodes the given Python object.
  63. Equivalent to `json.dumps` with the additional guarantee that the output
  64. will never contain the character sequence ``</`` which can be problematic
  65. when JSON is embedded in an HTML ``<script>`` tag.
  66. """
  67. # JSON permits but does not require forward slashes to be escaped.
  68. # This is useful when json data is emitted in a <script> tag
  69. # in HTML, as it prevents </script> tags from prematurely terminating
  70. # the JavaScript. Some json libraries do this escaping by default,
  71. # although python's standard library does not, so we do it here.
  72. # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
  73. return json.dumps(value).replace("</", "<\\/")
  74. def json_decode(value: Union[str, bytes]) -> Any:
  75. """Returns Python objects for the given JSON string.
  76. Supports both `str` and `bytes` inputs. Equvalent to `json.loads`.
  77. """
  78. return json.loads(value)
  79. def squeeze(value: str) -> str:
  80. """Replace all sequences of whitespace chars with a single space."""
  81. return re.sub(r"[\x00-\x20]+", " ", value).strip()
  82. def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
  83. """Returns a URL-encoded version of the given value.
  84. Equivalent to either `urllib.parse.quote_plus` or `urllib.parse.quote` depending on the ``plus``
  85. argument.
  86. If ``plus`` is true (the default), spaces will be represented as ``+`` and slashes will be
  87. represented as ``%2F``. This is appropriate for query strings. If ``plus`` is false, spaces
  88. will be represented as ``%20`` and slashes are left as-is. This is appropriate for the path
  89. component of a URL. Note that the default of ``plus=True`` is effectively the
  90. reverse of Python's urllib module.
  91. .. versionadded:: 3.1
  92. The ``plus`` argument
  93. """
  94. quote = urllib.parse.quote_plus if plus else urllib.parse.quote
  95. return quote(value)
  96. @typing.overload
  97. def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
  98. pass
  99. @typing.overload
  100. def url_unescape(
  101. value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
  102. ) -> str:
  103. pass
  104. def url_unescape(
  105. value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
  106. ) -> Union[str, bytes]:
  107. """Decodes the given value from a URL.
  108. The argument may be either a byte or unicode string.
  109. If encoding is None, the result will be a byte string and this function is equivalent to
  110. `urllib.parse.unquote_to_bytes` if ``plus=False``. Otherwise, the result is a unicode string in
  111. the specified encoding and this function is equivalent to either `urllib.parse.unquote_plus` or
  112. `urllib.parse.unquote` except that this function also accepts `bytes` as input.
  113. If ``plus`` is true (the default), plus signs will be interpreted as spaces (literal plus signs
  114. must be represented as "%2B"). This is appropriate for query strings and form-encoded values
  115. but not for the path component of a URL. Note that this default is the reverse of Python's
  116. urllib module.
  117. .. versionadded:: 3.1
  118. The ``plus`` argument
  119. """
  120. if encoding is None:
  121. if plus:
  122. # unquote_to_bytes doesn't have a _plus variant
  123. value = to_basestring(value).replace("+", " ")
  124. return urllib.parse.unquote_to_bytes(value)
  125. else:
  126. unquote = urllib.parse.unquote_plus if plus else urllib.parse.unquote
  127. return unquote(to_basestring(value), encoding=encoding)
  128. def parse_qs_bytes(
  129. qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
  130. ) -> Dict[str, List[bytes]]:
  131. """Parses a query string like urlparse.parse_qs,
  132. but takes bytes and returns the values as byte strings.
  133. Keys still become type str (interpreted as latin1 in python3!)
  134. because it's too painful to keep them as byte strings in
  135. python3 and in practice they're nearly always ascii anyway.
  136. """
  137. # This is gross, but python3 doesn't give us another way.
  138. # Latin1 is the universal donor of character encodings.
  139. if isinstance(qs, bytes):
  140. qs = qs.decode("latin1")
  141. result = urllib.parse.parse_qs(
  142. qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
  143. )
  144. encoded = {}
  145. for k, v in result.items():
  146. encoded[k] = [i.encode("latin1") for i in v]
  147. return encoded
  148. _UTF8_TYPES = (bytes, type(None))
  149. @typing.overload
  150. def utf8(value: bytes) -> bytes:
  151. pass
  152. @typing.overload
  153. def utf8(value: str) -> bytes:
  154. pass
  155. @typing.overload
  156. def utf8(value: None) -> None:
  157. pass
  158. def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:
  159. """Converts a string argument to a byte string.
  160. If the argument is already a byte string or None, it is returned unchanged.
  161. Otherwise it must be a unicode string and is encoded as utf8.
  162. """
  163. if isinstance(value, _UTF8_TYPES):
  164. return value
  165. if not isinstance(value, unicode_type):
  166. raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
  167. return value.encode("utf-8")
  168. _TO_UNICODE_TYPES = (unicode_type, type(None))
  169. @typing.overload
  170. def to_unicode(value: str) -> str:
  171. pass
  172. @typing.overload
  173. def to_unicode(value: bytes) -> str:
  174. pass
  175. @typing.overload
  176. def to_unicode(value: None) -> None:
  177. pass
  178. def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:
  179. """Converts a string argument to a unicode string.
  180. If the argument is already a unicode string or None, it is returned
  181. unchanged. Otherwise it must be a byte string and is decoded as utf8.
  182. """
  183. if isinstance(value, _TO_UNICODE_TYPES):
  184. return value
  185. if not isinstance(value, bytes):
  186. raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
  187. return value.decode("utf-8")
  188. # to_unicode was previously named _unicode not because it was private,
  189. # but to avoid conflicts with the built-in unicode() function/type
  190. _unicode = to_unicode
  191. # When dealing with the standard library across python 2 and 3 it is
  192. # sometimes useful to have a direct conversion to the native string type
  193. native_str = to_unicode
  194. to_basestring = to_unicode
  195. def recursive_unicode(obj: Any) -> Any:
  196. """Walks a simple data structure, converting byte strings to unicode.
  197. Supports lists, tuples, and dictionaries.
  198. """
  199. if isinstance(obj, dict):
  200. return {recursive_unicode(k): recursive_unicode(v) for (k, v) in obj.items()}
  201. elif isinstance(obj, list):
  202. return list(recursive_unicode(i) for i in obj)
  203. elif isinstance(obj, tuple):
  204. return tuple(recursive_unicode(i) for i in obj)
  205. elif isinstance(obj, bytes):
  206. return to_unicode(obj)
  207. else:
  208. return obj
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
#
# Capture groups:
#   1 - the whole matched URL
#   2 - the protocol name before the colon (unset for bare ``www.`` matches)
#   3 - the one to three slashes that follow the protocol's colon
# Inside the URL body the only ``&`` sequences accepted are ``&amp;`` and
# ``&quot;``; a bare ``&`` ends the match.
_URL_RE = re.compile(
    to_unicode(
        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""" # noqa: E501
    )
)
  221. def linkify(
  222. text: Union[str, bytes],
  223. shorten: bool = False,
  224. extra_params: Union[str, Callable[[str], str]] = "",
  225. require_protocol: bool = False,
  226. permitted_protocols: List[str] = ["http", "https"],
  227. ) -> str:
  228. """Converts plain text into HTML with links.
  229. For example: ``linkify("Hello http://tornadoweb.org!")`` would return
  230. ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``
  231. Parameters:
  232. * ``shorten``: Long urls will be shortened for display.
  233. * ``extra_params``: Extra text to include in the link tag, or a callable
  234. taking the link as an argument and returning the extra text
  235. e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
  236. or::
  237. def extra_params_cb(url):
  238. if url.startswith("http://example.com"):
  239. return 'class="internal"'
  240. else:
  241. return 'class="external" rel="nofollow"'
  242. linkify(text, extra_params=extra_params_cb)
  243. * ``require_protocol``: Only linkify urls which include a protocol. If
  244. this is False, urls such as www.facebook.com will also be linkified.
  245. * ``permitted_protocols``: List (or set) of protocols which should be
  246. linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
  247. "mailto"])``. It is very unsafe to include protocols such as
  248. ``javascript``.
  249. """
  250. if extra_params and not callable(extra_params):
  251. extra_params = " " + extra_params.strip()
  252. def make_link(m: typing.Match) -> str:
  253. url = m.group(1)
  254. proto = m.group(2)
  255. if require_protocol and not proto:
  256. return url # not protocol, no linkify
  257. if proto and proto not in permitted_protocols:
  258. return url # bad protocol, no linkify
  259. href = m.group(1)
  260. if not proto:
  261. href = "http://" + href # no proto specified, use http
  262. if callable(extra_params):
  263. params = " " + extra_params(href).strip()
  264. else:
  265. params = extra_params
  266. # clip long urls. max_len is just an approximation
  267. max_len = 30
  268. if shorten and len(url) > max_len:
  269. before_clip = url
  270. if proto:
  271. proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for :
  272. else:
  273. proto_len = 0
  274. parts = url[proto_len:].split("/")
  275. if len(parts) > 1:
  276. # Grab the whole host part plus the first bit of the path
  277. # The path is usually not that interesting once shortened
  278. # (no more slug, etc), so it really just provides a little
  279. # extra indication of shortening.
  280. url = (
  281. url[:proto_len]
  282. + parts[0]
  283. + "/"
  284. + parts[1][:8].split("?")[0].split(".")[0]
  285. )
  286. if len(url) > max_len * 1.5: # still too long
  287. url = url[:max_len]
  288. if url != before_clip:
  289. amp = url.rfind("&")
  290. # avoid splitting html char entities
  291. if amp > max_len - 5:
  292. url = url[:amp]
  293. url += "..."
  294. if len(url) >= len(before_clip):
  295. url = before_clip
  296. else:
  297. # full url is visible on mouse-over (for those who don't
  298. # have a status bar, such as Safari by default)
  299. params += ' title="%s"' % href
  300. return f'<a href="{href}"{params}>{url}</a>'
  301. # First HTML-escape so that our strings are all safe.
  302. # The regex is modified to avoid character entites other than &amp; so
  303. # that we won't pick up &quot;, etc.
  304. text = _unicode(xhtml_escape(text))
  305. return _URL_RE.sub(make_link, text)