| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401 |
- #
- # Copyright 2009 Facebook
- #
- # Licensed under the Apache License, Version 2.0 (the "License"); you may
- # not use this file except in compliance with the License. You may obtain
- # a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- # License for the specific language governing permissions and limitations
- # under the License.
- """Escaping/unescaping methods for HTML, JSON, URLs, and others.
- Also includes a few other miscellaneous string manipulation functions that
- have crept in over time.
- Many functions in this module have near-equivalents in the standard library
- (the differences mainly relate to handling of bytes and unicode strings,
- and were more relevant in Python 2). In new code, the standard library
- functions are encouraged instead of this module where applicable. See the
- docstrings on each function for details.
- """
- import html
- import json
- import re
- import urllib.parse
- from tornado.util import unicode_type
- import typing
- from typing import Union, Any, Optional, Dict, List, Callable
def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escapes a string so it is valid within HTML or XML.

    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
    When used in attribute values the escaped strings must be enclosed
    in quotes.

    Equivalent to `html.escape` except that the input is first passed
    through `to_unicode`, so `bytes` input is accepted as well and the
    result is always of type `str`.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.

    .. versionchanged:: 6.4

       Now simply wraps `html.escape`. This is equivalent to the old behavior
       except that single quotes are now escaped as ``&#x27;`` instead of
       ``&#39;`` and performance may be different.
    """
    # Normalize to str first; html.escape then does the actual escaping.
    text = to_unicode(value)
    return html.escape(text)
def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Un-escapes an XML-escaped string.

    Equivalent to `html.unescape` except that the input is first passed
    through `to_unicode`, so `bytes` input is accepted as well and the
    result is always of type `str`.

    .. versionchanged:: 6.4

       Now simply wraps `html.unescape`. This changes behavior for some inputs
       as required by the HTML 5 specification
       https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state

       Some invalid inputs such as surrogates now raise an error, and numeric
       references to certain ISO-8859-1 characters are now handled correctly.
    """
    # Normalize to str first; html.unescape then resolves the references.
    text = to_unicode(value)
    return html.unescape(text)
- # The fact that json_encode wraps json.dumps is an implementation detail.
- # Please see https://github.com/tornadoweb/tornado/pull/706
- # before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
    """JSON-encodes the given Python object.

    Behaves like `json.dumps`, with one extra guarantee: the output never
    contains the character sequence ``</``, which can be problematic when
    JSON is embedded in an HTML ``<script>`` tag.
    """
    serialized = json.dumps(value)
    # JSON allows (but does not require) forward slashes to be escaped.
    # Escaping the slash in every "</" keeps a literal "</script>" inside
    # the data from prematurely terminating the surrounding <script> element.
    # Some JSON libraries do this by default; Python's standard library
    # does not, so it is done here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return serialized.replace("</", "<\\/")
def json_decode(value: Union[str, bytes]) -> Any:
    """Returns Python objects for the given JSON string.

    Supports both `str` and `bytes` inputs. Equivalent to `json.loads`.
    """
    decoded = json.loads(value)
    return decoded
def squeeze(value: str) -> str:
    """Collapse each run of whitespace/control characters into one space.

    Every run of characters in the range ``\\x00``-``\\x20`` is replaced by
    a single space, and the result is stripped at both ends.
    """
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()
def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
    """Returns a URL-encoded version of the given value.

    Equivalent to either `urllib.parse.quote_plus` or `urllib.parse.quote`
    depending on the ``plus`` argument.

    If ``plus`` is true (the default), spaces will be represented as ``+``
    and slashes will be represented as ``%2F``. This is appropriate for query
    strings. If ``plus`` is false, spaces will be represented as ``%20`` and
    slashes are left as-is. This is appropriate for the path component of a
    URL. Note that the default of ``plus=True`` is effectively the reverse of
    Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    if plus:
        return urllib.parse.quote_plus(value)
    return urllib.parse.quote(value)
@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
    pass


@typing.overload
def url_unescape(
    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
    pass


def url_unescape(
    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
    """Decodes the given value from a URL.

    The argument may be either a byte or unicode string.

    If ``encoding`` is None, the result will be a byte string and this
    function is equivalent to `urllib.parse.unquote_to_bytes` if
    ``plus=False``. Otherwise, the result is a unicode string in the
    specified encoding and this function is equivalent to either
    `urllib.parse.unquote_plus` or `urllib.parse.unquote` except that this
    function also accepts `bytes` as input.

    If ``plus`` is true (the default), plus signs will be interpreted as
    spaces (literal plus signs must be represented as "%2B"). This is
    appropriate for query strings and form-encoded values but not for the
    path component of a URL. Note that this default is the reverse of
    Python's urllib module.

    .. versionadded:: 3.1
       The ``plus`` argument
    """
    if encoding is not None:
        # Text result: pick the stdlib decoder matching the ``plus`` mode.
        decoder = urllib.parse.unquote_plus if plus else urllib.parse.unquote
        return decoder(to_basestring(value), encoding=encoding)

    # Bytes result. unquote_to_bytes has no "_plus" variant, so the
    # plus-to-space translation is done by hand before unquoting.
    raw = value
    if plus:
        raw = to_basestring(value).replace("+", " ")
    return urllib.parse.unquote_to_bytes(raw)
def parse_qs_bytes(
    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
    """Parses a query string like `urllib.parse.parse_qs`, but accepts
    bytes and returns the values as byte strings.

    Keys still become type str (interpreted as latin1 in python3!)
    because it's too painful to keep them as byte strings in
    python3 and in practice they're nearly always ascii anyway.
    """
    # Latin1 maps every byte value to exactly one code point, so decoding
    # the input with it, parsing as text, and re-encoding the values with it
    # recovers the original (percent-decoded) bytes.
    text = qs.decode("latin1") if isinstance(qs, bytes) else qs
    parsed = urllib.parse.parse_qs(
        text, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
    )
    return {
        key: [item.encode("latin1") for item in values]
        for key, values in parsed.items()
    }
# Types that utf8() hands back untouched.
_UTF8_TYPES = (bytes, type(None))


@typing.overload
def utf8(value: bytes) -> bytes:
    pass


@typing.overload
def utf8(value: str) -> bytes:
    pass


@typing.overload
def utf8(value: None) -> None:
    pass


def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:
    """Converts a string argument to a byte string.

    A byte string or None is returned unchanged. A unicode string is
    encoded as utf8. Any other type raises `TypeError`.
    """
    if not isinstance(value, _UTF8_TYPES):
        if isinstance(value, unicode_type):
            return value.encode("utf-8")
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value
# Types that to_unicode() hands back untouched.
_TO_UNICODE_TYPES = (unicode_type, type(None))


@typing.overload
def to_unicode(value: str) -> str:
    pass


@typing.overload
def to_unicode(value: bytes) -> str:
    pass


@typing.overload
def to_unicode(value: None) -> None:
    pass


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:
    """Converts a string argument to a unicode string.

    A unicode string or None is returned unchanged. A byte string is
    decoded as utf8. Any other type raises `TypeError`.
    """
    if not isinstance(value, _TO_UNICODE_TYPES):
        if isinstance(value, bytes):
            return value.decode("utf-8")
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value


# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
native_str = to_unicode
to_basestring = to_unicode
def recursive_unicode(obj: Any) -> Any:
    """Walks a simple data structure, converting byte strings to unicode.

    Dictionaries (keys and values), lists and tuples are rebuilt with their
    contents converted recursively; byte strings go through `to_unicode`;
    every other object is returned as-is.
    """
    if isinstance(obj, dict):
        return {
            recursive_unicode(key): recursive_unicode(val)
            for key, val in obj.items()
        }
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    if isinstance(obj, bytes):
        return to_unicode(obj)
    return obj
# An earlier version used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
#
# The pattern is applied to text that linkify() has already HTML-escaped, so
# a literal "&" or '"' never appears in the input. Inside a URL the only
# character entities accepted are ``&amp;`` and ``&quot;``; any other entity
# (``&lt;``, ``&gt;``, ...) ends the match so escaped markup around a URL is
# not pulled into the link.
#
# Groups: 1 = the whole URL, 2 = the protocol name (if any), 3 = the slashes
# following the protocol's colon.
#
# A raw string literal is used so that backslashes are not processed as
# escapes.
_URL_RE = re.compile(
    r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""  # noqa: E501
)
def linkify(
    text: Union[str, bytes],
    shorten: bool = False,
    extra_params: Union[str, Callable[[str], str]] = "",
    require_protocol: bool = False,
    permitted_protocols: List[str] = ["http", "https"],
) -> str:
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
      taking the link as an argument and returning the extra text
      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
      or::

          def extra_params_cb(url):
              if url.startswith("http://example.com"):
                  return 'class="internal"'
              else:
                  return 'class="external" rel="nofollow"'
          linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
      this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
      "mailto"])``. It is very unsafe to include protocols such as
      ``javascript``.
    """
    # A non-callable, non-empty extra_params is normalized once up front to
    # " <params>" so it can be spliced directly after the href attribute.
    static_params = extra_params
    if static_params and not callable(static_params):
        static_params = " " + static_params.strip()

    # Display-length threshold used when shortening; just an approximation.
    display_limit = 30

    def make_link(match: typing.Match) -> str:
        matched = match.group(1)
        scheme = match.group(2)

        # Leave the text untouched when a protocol is required but missing,
        # or when the protocol is not one of the permitted ones.
        if (require_protocol and not scheme) or (
            scheme and scheme not in permitted_protocols
        ):
            return matched

        # No protocol specified: assume http for the link target.
        href = matched if scheme else "http://" + matched

        if callable(extra_params):
            attrs = " " + extra_params(href).strip()
        else:
            attrs = static_params

        label = matched
        if shorten and len(label) > display_limit:
            # Length of "proto:" plus the slashes that follow (+1 for ":").
            prefix_len = (
                (len(scheme) + 1 + len(match.group(3) or "")) if scheme else 0
            )
            pieces = label[prefix_len:].split("/")
            if len(pieces) > 1:
                # Keep the whole host part plus the first bit of the path.
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                path_hint = pieces[1][:8].split("?")[0].split(".")[0]
                label = label[:prefix_len] + pieces[0] + "/" + path_hint

            if len(label) > display_limit * 1.5:  # still too long
                label = label[:display_limit]

            if label != matched:
                # Avoid cutting an HTML character entity in half.
                last_amp = label.rfind("&")
                if last_amp > display_limit - 5:
                    label = label[:last_amp]
                label += "..."

                if len(label) < len(matched):
                    # Full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default).
                    attrs += ' title="%s"' % href
                else:
                    # The "shortened" form is no shorter; show the original.
                    label = matched

        return f'<a href="{href}"{attrs}>{label}</a>'

    # HTML-escape first so that every string handled below is already safe;
    # _URL_RE is then applied to the escaped text.
    escaped = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, escaped)
|