tokenizer.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. import re
  2. import sys
  3. from webencodings import ascii_lower
  4. from .ast import ( # isort: skip
  5. AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,
  6. HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError,
  7. PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken,
  8. WhitespaceToken)
  9. from .serializer import serialize_string_value, serialize_url
# CSS numeric literal: optional sign, optional "digits followed by a dot",
# at least one digit, optional exponent. The two capturing groups (the
# fractional prefix and the exponent) are both empty only for plain integers.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')
# Hexadecimal part of a CSS escape: one to six hex digits (group 1), plus at
# most one trailing space, newline or tab that is swallowed with the escape.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')
  12. def parse_component_value_list(css, skip_comments=False):
  13. """Parse a list of component values.
  14. :type css: :obj:`str`
  15. :param css: A CSS string.
  16. :type skip_comments: :obj:`bool`
  17. :param skip_comments:
  18. Ignore CSS comments.
  19. The return values (and recursively its blocks and functions)
  20. will not contain any :class:`~tinycss2.ast.Comment` object.
  21. :returns: A list of :term:`component values`.
  22. """
  23. css = (css.replace('\0', '\uFFFD')
  24. # This turns out to be faster than a regexp:
  25. .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
  26. length = len(css)
  27. token_start_pos = pos = 0 # Character index in the css source.
  28. line = 1 # First line is line 1.
  29. last_newline = -1
  30. root = tokens = []
  31. end_char = None # Pop the stack when encountering this character.
  32. stack = [] # Stack of nested blocks: (tokens, end_char) tuples.
  33. while pos < length:
  34. newline = css.rfind('\n', token_start_pos, pos)
  35. if newline != -1:
  36. line += 1 + css.count('\n', token_start_pos, newline)
  37. last_newline = newline
  38. # First character in a line is in column 1.
  39. column = pos - last_newline
  40. token_start_pos = pos
  41. c = css[pos]
  42. if c in ' \n\t':
  43. pos += 1
  44. while css.startswith((' ', '\n', '\t'), pos):
  45. pos += 1
  46. value = css[token_start_pos:pos]
  47. tokens.append(WhitespaceToken(line, column, value))
  48. continue
  49. elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
  50. css[pos + 2] in '0123456789abcdefABCDEF?'):
  51. start, end, pos = _consume_unicode_range(css, pos + 2)
  52. tokens.append(UnicodeRangeToken(line, column, start, end))
  53. continue
  54. elif css.startswith('-->', pos): # Check before identifiers
  55. tokens.append(LiteralToken(line, column, '-->'))
  56. pos += 3
  57. continue
  58. elif _is_ident_start(css, pos):
  59. value, pos = _consume_ident(css, pos)
  60. if not css.startswith('(', pos): # Not a function
  61. tokens.append(IdentToken(line, column, value))
  62. continue
  63. pos += 1 # Skip the '('
  64. if ascii_lower(value) == 'url':
  65. url_pos = pos
  66. while css.startswith((' ', '\n', '\t'), url_pos):
  67. url_pos += 1
  68. if url_pos >= length or css[url_pos] not in ('"', "'"):
  69. value, pos, error = _consume_url(css, pos)
  70. if value is not None:
  71. repr = 'url({})'.format(serialize_url(value))
  72. if error is not None:
  73. error_key = error[0]
  74. if error_key == 'eof-in-string':
  75. repr = repr[:-2]
  76. else:
  77. assert error_key == 'eof-in-url'
  78. repr = repr[:-1]
  79. tokens.append(URLToken(line, column, value, repr))
  80. if error is not None:
  81. tokens.append(ParseError(line, column, *error))
  82. continue
  83. arguments = []
  84. tokens.append(FunctionBlock(line, column, value, arguments))
  85. stack.append((tokens, end_char))
  86. end_char = ')'
  87. tokens = arguments
  88. continue
  89. match = _NUMBER_RE.match(css, pos)
  90. if match:
  91. pos = match.end()
  92. repr_ = css[token_start_pos:pos]
  93. value = float(repr_)
  94. int_value = int(repr_) if not any(match.groups()) else None
  95. if pos < length and _is_ident_start(css, pos):
  96. unit, pos = _consume_ident(css, pos)
  97. tokens.append(DimensionToken(
  98. line, column, value, int_value, repr_, unit))
  99. elif css.startswith('%', pos):
  100. pos += 1
  101. tokens.append(PercentageToken(line, column, value, int_value, repr_))
  102. else:
  103. tokens.append(NumberToken(line, column, value, int_value, repr_))
  104. elif c == '@':
  105. pos += 1
  106. if pos < length and _is_ident_start(css, pos):
  107. value, pos = _consume_ident(css, pos)
  108. tokens.append(AtKeywordToken(line, column, value))
  109. else:
  110. tokens.append(LiteralToken(line, column, '@'))
  111. elif c == '#':
  112. pos += 1
  113. if pos < length and (
  114. css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
  115. '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
  116. ord(css[pos]) > 0x7F or # Non-ASCII
  117. # Valid escape:
  118. (css[pos] == '\\' and not css.startswith('\\\n', pos))):
  119. is_identifier = _is_ident_start(css, pos)
  120. value, pos = _consume_ident(css, pos)
  121. tokens.append(HashToken(line, column, value, is_identifier))
  122. else:
  123. tokens.append(LiteralToken(line, column, '#'))
  124. elif c == '{':
  125. content = []
  126. tokens.append(CurlyBracketsBlock(line, column, content))
  127. stack.append((tokens, end_char))
  128. end_char = '}'
  129. tokens = content
  130. pos += 1
  131. elif c == '[':
  132. content = []
  133. tokens.append(SquareBracketsBlock(line, column, content))
  134. stack.append((tokens, end_char))
  135. end_char = ']'
  136. tokens = content
  137. pos += 1
  138. elif c == '(':
  139. content = []
  140. tokens.append(ParenthesesBlock(line, column, content))
  141. stack.append((tokens, end_char))
  142. end_char = ')'
  143. tokens = content
  144. pos += 1
  145. elif c == end_char: # Matching }, ] or )
  146. # The top-level end_char is None (never equal to a character),
  147. # so we never get here if the stack is empty.
  148. tokens, end_char = stack.pop()
  149. pos += 1
  150. elif c in '}])':
  151. tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
  152. pos += 1
  153. elif c in ('"', "'"):
  154. value, pos, error = _consume_quoted_string(css, pos)
  155. if value is not None:
  156. repr = '"{}"'.format(serialize_string_value(value))
  157. if error is not None:
  158. repr = repr[:-1]
  159. tokens.append(StringToken(line, column, value, repr))
  160. if error is not None:
  161. tokens.append(ParseError(line, column, *error))
  162. elif css.startswith('/*', pos): # Comment
  163. pos = css.find('*/', pos + 2)
  164. if pos == -1:
  165. if not skip_comments:
  166. tokens.append(Comment(line, column, css[token_start_pos + 2:]))
  167. break
  168. if not skip_comments:
  169. tokens.append(Comment(line, column, css[token_start_pos + 2:pos]))
  170. pos += 2
  171. elif css.startswith('<!--', pos):
  172. tokens.append(LiteralToken(line, column, '<!--'))
  173. pos += 4
  174. elif css.startswith('||', pos):
  175. tokens.append(LiteralToken(line, column, '||'))
  176. pos += 2
  177. elif c in '~|^$*':
  178. pos += 1
  179. if css.startswith('=', pos):
  180. pos += 1
  181. tokens.append(LiteralToken(line, column, c + '='))
  182. else:
  183. tokens.append(LiteralToken(line, column, c))
  184. else:
  185. tokens.append(LiteralToken(line, column, c))
  186. pos += 1
  187. return root
  188. def _is_name_start(css, pos):
  189. """Return true if the given character is a name-start code point."""
  190. # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
  191. c = css[pos]
  192. return (
  193. c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' or
  194. ord(c) > 0x7F)
  195. def _is_ident_start(css, pos):
  196. """Return True if the given position is the start of a CSS identifier."""
  197. # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
  198. if _is_name_start(css, pos):
  199. return True
  200. elif css[pos] == '-':
  201. pos += 1
  202. return (
  203. # Name-start code point or hyphen:
  204. (pos < len(css) and (_is_name_start(css, pos) or css[pos] == '-')) or
  205. # Valid escape:
  206. (css.startswith('\\', pos) and not css.startswith('\\\n', pos)))
  207. elif css[pos] == '\\':
  208. return not css.startswith('\\\n', pos)
  209. return False
  210. def _consume_ident(css, pos):
  211. """Return (unescaped_value, new_pos).
  212. Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.
  213. """
  214. # http://dev.w3.org/csswg/css-syntax/#consume-a-name
  215. chunks = []
  216. length = len(css)
  217. start_pos = pos
  218. while pos < length:
  219. c = css[pos]
  220. if c in ('abcdefghijklmnopqrstuvwxyz-_0123456789'
  221. 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') or ord(c) > 0x7F:
  222. pos += 1
  223. elif c == '\\' and not css.startswith('\\\n', pos):
  224. # Valid escape
  225. chunks.append(css[start_pos:pos])
  226. c, pos = _consume_escape(css, pos + 1)
  227. chunks.append(c)
  228. start_pos = pos
  229. else:
  230. break
  231. chunks.append(css[start_pos:pos])
  232. return ''.join(chunks), pos
  233. def _consume_quoted_string(css, pos):
  234. """Return (unescaped_value, new_pos)."""
  235. # https://drafts.csswg.org/css-syntax/#consume-a-string-token
  236. error = None
  237. quote = css[pos]
  238. assert quote in ('"', "'")
  239. pos += 1
  240. chunks = []
  241. length = len(css)
  242. start_pos = pos
  243. while pos < length:
  244. c = css[pos]
  245. if c == quote:
  246. chunks.append(css[start_pos:pos])
  247. pos += 1
  248. break
  249. elif c == '\\':
  250. chunks.append(css[start_pos:pos])
  251. pos += 1
  252. if pos < length:
  253. if css[pos] == '\n': # Ignore escaped newlines
  254. pos += 1
  255. else:
  256. c, pos = _consume_escape(css, pos)
  257. chunks.append(c)
  258. # else: Escaped EOF, do nothing
  259. start_pos = pos
  260. elif c == '\n': # Unescaped newline
  261. return None, pos, ('bad-string', 'Bad string token')
  262. else:
  263. pos += 1
  264. else:
  265. error = ('eof-in-string', 'EOF in string')
  266. chunks.append(css[start_pos:pos])
  267. return ''.join(chunks), pos, error
  268. def _consume_escape(css, pos):
  269. r"""Return (unescaped_char, new_pos).
  270. Assumes a valid escape: pos is just after '\' and not followed by '\n'.
  271. """
  272. # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
  273. hex_match = _HEX_ESCAPE_RE.match(css, pos)
  274. if hex_match:
  275. codepoint = int(hex_match.group(1), 16)
  276. return (
  277. chr(codepoint) if 0 < codepoint <= sys.maxunicode else '\uFFFD',
  278. hex_match.end())
  279. elif pos < len(css):
  280. return css[pos], pos + 1
  281. else:
  282. return '\uFFFD', pos
  283. def _consume_url(css, pos):
  284. """Return (unescaped_url, new_pos)
  285. The given pos is assumed to be just after the '(' of 'url('.
  286. """
  287. error = None
  288. length = len(css)
  289. # https://drafts.csswg.org/css-syntax/#consume-a-url-token
  290. # Skip whitespace
  291. while css.startswith((' ', '\n', '\t'), pos):
  292. pos += 1
  293. if pos >= length: # EOF
  294. return '', pos, ('eof-in-url', 'EOF in URL')
  295. c = css[pos]
  296. if c in ('"', "'"):
  297. value, pos, error = _consume_quoted_string(css, pos)
  298. elif c == ')':
  299. return '', pos + 1, error
  300. else:
  301. chunks = []
  302. start_pos = pos
  303. while 1:
  304. if pos >= length: # EOF
  305. chunks.append(css[start_pos:pos])
  306. return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
  307. c = css[pos]
  308. if c == ')':
  309. chunks.append(css[start_pos:pos])
  310. pos += 1
  311. return ''.join(chunks), pos, error
  312. elif c in ' \n\t':
  313. chunks.append(css[start_pos:pos])
  314. value = ''.join(chunks)
  315. pos += 1
  316. break
  317. elif c == '\\' and not css.startswith('\\\n', pos):
  318. # Valid escape
  319. chunks.append(css[start_pos:pos])
  320. c, pos = _consume_escape(css, pos + 1)
  321. chunks.append(c)
  322. start_pos = pos
  323. elif (c in
  324. '"\'('
  325. # https://drafts.csswg.org/css-syntax/#non-printable-character
  326. '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
  327. '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
  328. '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
  329. value = None # Parse error
  330. pos += 1
  331. break
  332. else:
  333. pos += 1
  334. if value is not None:
  335. while css.startswith((' ', '\n', '\t'), pos):
  336. pos += 1
  337. if pos < length:
  338. if css[pos] == ')':
  339. return value, pos + 1, error
  340. else:
  341. if error is None:
  342. error = ('eof-in-url', 'EOF in URL')
  343. return value, pos, error
  344. # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
  345. while pos < length:
  346. if css.startswith('\\)', pos):
  347. pos += 2
  348. elif css[pos] == ')':
  349. pos += 1
  350. break
  351. else:
  352. pos += 1
  353. return None, pos, ('bad-url', 'bad URL token')
  354. def _consume_unicode_range(css, pos):
  355. """Return (range, new_pos)
  356. The given pos is assume to be just after the '+' of 'U+' or 'u+'.
  357. """
  358. # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
  359. length = len(css)
  360. start_pos = pos
  361. max_pos = min(pos + 6, length)
  362. while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
  363. pos += 1
  364. start = css[start_pos:pos]
  365. start_pos = pos
  366. # Same max_pos as before: total of hex digits and question marks <= 6
  367. while pos < max_pos and css[pos] == '?':
  368. pos += 1
  369. question_marks = pos - start_pos
  370. if question_marks:
  371. end = start + 'F' * question_marks
  372. start = start + '0' * question_marks
  373. elif (pos + 1 < length and css[pos] == '-' and
  374. css[pos + 1] in '0123456789abcdefABCDEF'):
  375. pos += 1
  376. start_pos = pos
  377. max_pos = min(pos + 6, length)
  378. while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
  379. pos += 1
  380. end = css[start_pos:pos]
  381. else:
  382. end = start
  383. return int(start, 16), int(end, 16), pos