| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419 |
- import re
- import sys
- from webencodings import ascii_lower
- from .ast import ( # isort: skip
- AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,
- HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError,
- PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken,
- WhitespaceToken)
- from .serializer import serialize_string_value, serialize_url
# Matches a CSS numeric literal: an optional sign, an optional "digits."
# prefix (group 1), at least one digit, then an optional exponent (group 2).
# Both groups are empty for a plain integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')
# Matches the part of a hexadecimal escape that follows the backslash:
# one to six hex digits (group 1) and at most one trailing space, newline
# or tab, which belongs to the escape.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')
def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    The input is first normalized: NUL characters become U+FFFD and the
    ``\\r\\n``, ``\\r`` and form-feed line endings all become ``\\n``.
    The text is then scanned once from left to right. Blocks and functions
    are nested through an explicit stack, so a block that is still open at
    the end of the input is simply returned with whatever content was read.
    Problems such as unmatched closing brackets, bad strings and bad URLs
    are reported in-line as :class:`~tinycss2.ast.ParseError` objects
    rather than raised.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The return values (and recursively its blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    css = (css.replace('\0', '\uFFFD')
           # This turns out to be faster than a regexp:
           .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
    length = len(css)
    token_start_pos = pos = 0  # Character index in the css source.
    line = 1  # First line is line 1.
    last_newline = -1
    # ``tokens`` is the list currently being filled; it starts as the root
    # list and is swapped for a block's content list while inside a block.
    root = tokens = []
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Update the line counter with the newlines contained in the
        # previous token (the text between token_start_pos and pos).
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        c = css[pos]

        # The first four branches end with ``continue``; everything that
        # falls out of this chain is tried as a number and then against
        # the remaining single-token branches below.
        if c in ' \n\t':
            # A run of whitespace becomes a single token.
            pos += 1
            while css.startswith((' ', '\n', '\t'), pos):
                pos += 1
            value = css[token_start_pos:pos]
            tokens.append(WhitespaceToken(line, column, value))
            continue
        elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            # "U+" followed by a hex digit or "?": unicode-range.
            start, end, pos = _consume_unicode_range(css, pos + 2)
            tokens.append(UnicodeRangeToken(line, column, start, end))
            continue
        elif css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue
        elif _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            if ascii_lower(value) == 'url':
                # Look past leading whitespace: "url(" followed by a quote
                # is handled as a regular function with a string argument,
                # anything else is an unquoted URL token.
                url_pos = pos
                while css.startswith((' ', '\n', '\t'), url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    if value is not None:
                        # NOTE: ``repr`` shadows the builtin inside this
                        # function.
                        repr = 'url({})'.format(serialize_url(value))
                        if error is not None:
                            # The input ended early: trim the closing
                            # characters the serialization added.
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                repr = repr[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                repr = repr[:-1]
                        tokens.append(URLToken(line, column, value, repr))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            # Function: following tokens go into its argument list until
            # the matching ')'.
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        match = _NUMBER_RE.match(css, pos)
        if match:
            pos = match.end()
            repr_ = css[token_start_pos:pos]
            value = float(repr_)
            # The regex groups are the fractional prefix and the exponent;
            # the number is an integer only when both are absent.
            int_value = int(repr_) if not any(match.groups()) else None
            if pos < length and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, repr_, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(PercentageToken(line, column, value, int_value, repr_))
            else:
                tokens.append(NumberToken(line, column, value, int_value, repr_))
        elif c == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                tokens.append(LiteralToken(line, column, '@'))
        elif c == '#':
            pos += 1
            # A hash needs at least one name character or valid escape.
            if pos < length and (
                    css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                                '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                    ord(css[pos]) > 0x7F or  # Non-ASCII
                    # Valid escape:
                    (css[pos] == '\\' and not css.startswith('\\\n', pos))):
                # Record whether the name would also be a valid identifier.
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif c == '{':
            # Opening bracket: push the current list and start filling the
            # new block's content until the matching closer.
            content = []
            tokens.append(CurlyBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = '}'
            tokens = content
            pos += 1
        elif c == '[':
            content = []
            tokens.append(SquareBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ']'
            tokens = content
            pos += 1
        elif c == '(':
            content = []
            tokens.append(ParenthesesBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = content
            pos += 1
        elif c == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif c in '}])':
            # A closer that does not match the innermost open block.
            tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
            pos += 1
        elif c in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            if value is not None:
                repr = '"{}"'.format(serialize_string_value(value))
                if error is not None:
                    # Unterminated string: drop the closing quote that the
                    # serialization added.
                    repr = repr[:-1]
                tokens.append(StringToken(line, column, value, repr))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            pos = css.find('*/', pos + 2)
            if pos == -1:
                # Unterminated comment: it extends to the end of the input,
                # so tokenization stops here.
                if not skip_comments:
                    tokens.append(Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(Comment(line, column, css[token_start_pos + 2:pos]))
            pos += 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif c in '~|^$*':
            # These characters combine with a following '=' into a single
            # two-character literal.
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, c + '='))
            else:
                tokens.append(LiteralToken(line, column, c))
        else:
            # Any other character is a one-character literal.
            tokens.append(LiteralToken(line, column, c))
            pos += 1
    return root
- def _is_name_start(css, pos):
- """Return true if the given character is a name-start code point."""
- # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
- c = css[pos]
- return (
- c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' or
- ord(c) > 0x7F)
def _is_ident_start(css, pos):
    """Tell whether a CSS identifier starts at ``pos``.

    An identifier starts with a name-start code point, with a valid escape
    (a backslash not followed by a newline), or with a hyphen that is
    itself followed by a name-start code point, another hyphen or a valid
    escape.
    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True
    first = css[pos]
    if first == '\\':
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False
    # Leading hyphen: the decision depends on what follows it.
    following = pos + 1
    if following < len(css) and (
            _is_name_start(css, following) or css[following] == '-'):
        return True
    return (
        css.startswith('\\', following) and
        not css.startswith('\\\n', following))
- def _consume_ident(css, pos):
- """Return (unescaped_value, new_pos).
- Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.
- """
- # http://dev.w3.org/csswg/css-syntax/#consume-a-name
- chunks = []
- length = len(css)
- start_pos = pos
- while pos < length:
- c = css[pos]
- if c in ('abcdefghijklmnopqrstuvwxyz-_0123456789'
- 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') or ord(c) > 0x7F:
- pos += 1
- elif c == '\\' and not css.startswith('\\\n', pos):
- # Valid escape
- chunks.append(css[start_pos:pos])
- c, pos = _consume_escape(css, pos + 1)
- chunks.append(c)
- start_pos = pos
- else:
- break
- chunks.append(css[start_pos:pos])
- return ''.join(chunks), pos
- def _consume_quoted_string(css, pos):
- """Return (unescaped_value, new_pos)."""
- # https://drafts.csswg.org/css-syntax/#consume-a-string-token
- error = None
- quote = css[pos]
- assert quote in ('"', "'")
- pos += 1
- chunks = []
- length = len(css)
- start_pos = pos
- while pos < length:
- c = css[pos]
- if c == quote:
- chunks.append(css[start_pos:pos])
- pos += 1
- break
- elif c == '\\':
- chunks.append(css[start_pos:pos])
- pos += 1
- if pos < length:
- if css[pos] == '\n': # Ignore escaped newlines
- pos += 1
- else:
- c, pos = _consume_escape(css, pos)
- chunks.append(c)
- # else: Escaped EOF, do nothing
- start_pos = pos
- elif c == '\n': # Unescaped newline
- return None, pos, ('bad-string', 'Bad string token')
- else:
- pos += 1
- else:
- error = ('eof-in-string', 'EOF in string')
- chunks.append(css[start_pos:pos])
- return ''.join(chunks), pos, error
def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits and at most one trailing
    whitespace character) yields the character with that code point. It
    yields U+FFFD instead when the code point is zero, is a surrogate, or
    is above the maximum code point. Any other character stands for
    itself, and an escape at the very end of the input yields U+FFFD.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        if (codepoint == 0 or
                # Surrogates are not characters: a lone one cannot be
                # encoded later and the specification maps it to U+FFFD.
                0xD800 <= codepoint <= 0xDFFF or
                codepoint > sys.maxunicode):
            char = '\uFFFD'
        else:
            char = chr(codepoint)
        return char, hex_match.end()
    if pos < len(css):
        return css[pos], pos + 1
    # Escaped EOF.
    return '\uFFFD', pos
- def _consume_url(css, pos):
- """Return (unescaped_url, new_pos)
- The given pos is assumed to be just after the '(' of 'url('.
- """
- error = None
- length = len(css)
- # https://drafts.csswg.org/css-syntax/#consume-a-url-token
- # Skip whitespace
- while css.startswith((' ', '\n', '\t'), pos):
- pos += 1
- if pos >= length: # EOF
- return '', pos, ('eof-in-url', 'EOF in URL')
- c = css[pos]
- if c in ('"', "'"):
- value, pos, error = _consume_quoted_string(css, pos)
- elif c == ')':
- return '', pos + 1, error
- else:
- chunks = []
- start_pos = pos
- while 1:
- if pos >= length: # EOF
- chunks.append(css[start_pos:pos])
- return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
- c = css[pos]
- if c == ')':
- chunks.append(css[start_pos:pos])
- pos += 1
- return ''.join(chunks), pos, error
- elif c in ' \n\t':
- chunks.append(css[start_pos:pos])
- value = ''.join(chunks)
- pos += 1
- break
- elif c == '\\' and not css.startswith('\\\n', pos):
- # Valid escape
- chunks.append(css[start_pos:pos])
- c, pos = _consume_escape(css, pos + 1)
- chunks.append(c)
- start_pos = pos
- elif (c in
- '"\'('
- # https://drafts.csswg.org/css-syntax/#non-printable-character
- '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
- '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
- '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
- value = None # Parse error
- pos += 1
- break
- else:
- pos += 1
- if value is not None:
- while css.startswith((' ', '\n', '\t'), pos):
- pos += 1
- if pos < length:
- if css[pos] == ')':
- return value, pos + 1, error
- else:
- if error is None:
- error = ('eof-in-url', 'EOF in URL')
- return value, pos, error
- # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
- while pos < length:
- if css.startswith('\\)', pos):
- pos += 2
- elif css[pos] == ')':
- pos += 1
- break
- else:
- pos += 1
- return None, pos, ('bad-url', 'bad URL token')
- def _consume_unicode_range(css, pos):
- """Return (range, new_pos)
- The given pos is assume to be just after the '+' of 'U+' or 'u+'.
- """
- # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
- length = len(css)
- start_pos = pos
- max_pos = min(pos + 6, length)
- while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
- pos += 1
- start = css[start_pos:pos]
- start_pos = pos
- # Same max_pos as before: total of hex digits and question marks <= 6
- while pos < max_pos and css[pos] == '?':
- pos += 1
- question_marks = pos - start_pos
- if question_marks:
- end = start + 'F' * question_marks
- start = start + '0' * question_marks
- elif (pos + 1 < length and css[pos] == '-' and
- css[pos + 1] in '0123456789abcdefABCDEF'):
- pos += 1
- start_pos = pos
- max_pos = min(pos + 6, length)
- while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
- pos += 1
- end = css[start_pos:pos]
- else:
- end = start
- return int(start, 16), int(end, 16), pos
|