bytes.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. from webencodings import UTF8, decode, lookup
  2. from .parser import parse_stylesheet
  3. def decode_stylesheet_bytes(css_bytes, protocol_encoding=None,
  4. environment_encoding=None):
  5. """Determine the character encoding of a CSS stylesheet and decode it.
  6. This is based on the presence of a :abbr:`BOM (Byte Order Mark)`,
  7. a ``@charset`` rule, and encoding meta-information.
  8. :type css_bytes: :obj:`bytes`
  9. :param css_bytes: A CSS byte string.
  10. :type protocol_encoding: :obj:`str`
  11. :param protocol_encoding:
  12. The encoding label, if any, defined by HTTP or equivalent protocol.
  13. (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
  14. :type environment_encoding: :class:`webencodings.Encoding`
  15. :param environment_encoding:
  16. The `environment encoding
  17. <https://www.w3.org/TR/css-syntax/#environment-encoding>`_, if any.
  18. :returns:
  19. A 2-tuple of a decoded Unicode string and the
  20. :class:`webencodings.Encoding` object that was used.
  21. """
  22. # https://drafts.csswg.org/css-syntax/#the-input-byte-stream
  23. if protocol_encoding:
  24. fallback = lookup(protocol_encoding)
  25. if fallback:
  26. return decode(css_bytes, fallback)
  27. if css_bytes.startswith(b'@charset "'):
  28. # 10 is len(b'@charset "')
  29. # 100 is arbitrary so that no encoding label is more than 100-10 bytes.
  30. end_quote = css_bytes.find(b'"', 10, 100)
  31. if end_quote != -1 and css_bytes.startswith(b'";', end_quote):
  32. fallback = lookup(css_bytes[10:end_quote].decode('latin1'))
  33. if fallback:
  34. if fallback.name in ('utf-16be', 'utf-16le'):
  35. return decode(css_bytes, UTF8)
  36. return decode(css_bytes, fallback)
  37. if environment_encoding:
  38. return decode(css_bytes, environment_encoding)
  39. return decode(css_bytes, UTF8)
  40. def parse_stylesheet_bytes(css_bytes, protocol_encoding=None,
  41. environment_encoding=None,
  42. skip_comments=False, skip_whitespace=False):
  43. """Parse :diagram:`stylesheet` from bytes,
  44. determining the character encoding as web browsers do.
  45. This is used when reading a file or fetching a URL.
  46. The character encoding is determined from the initial bytes
  47. (a :abbr:`BOM (Byte Order Mark)` or a ``@charset`` rule)
  48. as well as the parameters. The ultimate fallback is UTF-8.
  49. :type css_bytes: :obj:`bytes`
  50. :param css_bytes: A CSS byte string.
  51. :type protocol_encoding: :obj:`str`
  52. :param protocol_encoding:
  53. The encoding label, if any, defined by HTTP or equivalent protocol.
  54. (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
  55. :type environment_encoding: :class:`webencodings.Encoding`
  56. :param environment_encoding:
  57. The `environment encoding`_, if any.
  58. :type skip_comments: :obj:`bool`
  59. :param skip_comments:
  60. Ignore CSS comments at the top-level of the stylesheet.
  61. If the input is a string, ignore all comments.
  62. :type skip_whitespace: :obj:`bool`
  63. :param skip_whitespace:
  64. Ignore whitespace at the top-level of the stylesheet.
  65. Whitespace is still preserved
  66. in the :attr:`~tinycss2.ast.QualifiedRule.prelude`
  67. and the :attr:`~tinycss2.ast.QualifiedRule.content` of rules.
  68. :returns:
  69. A ``(rules, encoding)`` tuple.
  70. * ``rules`` is a list of
  71. :class:`~tinycss2.ast.QualifiedRule`,
  72. :class:`~tinycss2.ast.AtRule`,
  73. :class:`~tinycss2.ast.Comment` (if ``skip_comments`` is false),
  74. :class:`~tinycss2.ast.WhitespaceToken`
  75. (if ``skip_whitespace`` is false),
  76. and :class:`~tinycss2.ast.ParseError` objects.
  77. * ``encoding`` is the :class:`webencodings.Encoding` object
  78. that was used.
  79. If ``rules`` contains an ``@import`` rule, this is
  80. the `environment encoding`_ for the imported stylesheet.
  81. .. _environment encoding:
  82. https://www.w3.org/TR/css-syntax/#environment-encoding
  83. .. code-block:: python
  84. response = urlopen('http://example.net/foo.css')
  85. rules, encoding = parse_stylesheet_bytes(
  86. css_bytes=response.read(),
  87. # Python 3.x
  88. protocol_encoding=response.info().get_content_type().get_param('charset'),
  89. # Python 2.x
  90. protocol_encoding=response.info().gettype().getparam('charset'),
  91. )
  92. for rule in rules:
  93. ...
  94. """
  95. css_unicode, encoding = decode_stylesheet_bytes(
  96. css_bytes, protocol_encoding, environment_encoding)
  97. stylesheet = parse_stylesheet(css_unicode, skip_comments, skip_whitespace)
  98. return stylesheet, encoding