| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706 |
- # ------------------------------------------------------------------------------
- # pycparser: c_lexer.py
- #
- # CLexer class: lexer for the C language
- #
- # Eli Bendersky [https://eli.thegreenplace.net/]
- # License: BSD
- # ------------------------------------------------------------------------------
- import re
- from dataclasses import dataclass
- from enum import Enum
- from typing import Callable, Dict, List, Optional, Tuple
- @dataclass(slots=True)
- class _Token:
- type: str
- value: str
- lineno: int
- column: int
- class CLexer:
- """A standalone lexer for C.
- Parameters for construction:
- error_func:
- Called with (msg, line, column) on lexing errors.
- on_lbrace_func:
- Called when an LBRACE token is produced (used for scope tracking).
- on_rbrace_func:
- Called when an RBRACE token is produced (used for scope tracking).
- type_lookup_func:
- Called with an identifier name; expected to return True if it is
- a typedef name and should be tokenized as TYPEID.
- Call input(text) to initialize lexing, and then keep calling token() to
- get the next token, until it returns None (at end of input).
- """
- def __init__(
- self,
- error_func: Callable[[str, int, int], None],
- on_lbrace_func: Callable[[], None],
- on_rbrace_func: Callable[[], None],
- type_lookup_func: Callable[[str], bool],
- ) -> None:
- self.error_func = error_func
- self.on_lbrace_func = on_lbrace_func
- self.on_rbrace_func = on_rbrace_func
- self.type_lookup_func = type_lookup_func
- self._init_state()
- def input(self, text: str, filename: str = "") -> None:
- """Initialize the lexer to the given input text.
- filename is an optional name identifying the file from which the input
- comes. The lexer can modify it if #line directives are encountered.
- """
- self._init_state()
- self._lexdata = text
- self._filename = filename
- def _init_state(self) -> None:
- self._lexdata = ""
- self._filename = ""
- self._pos = 0
- self._line_start = 0
- self._pending_tok: Optional[_Token] = None
- self._lineno = 1
- @property
- def filename(self) -> str:
- return self._filename
- def token(self) -> Optional[_Token]:
- # Lexing strategy overview:
- #
- # - We maintain a current position (self._pos), line number, and the
- # byte offset of the current line start. The lexer is a simple loop
- # that skips whitespace/newlines and emits one token per call.
- # - A small amount of logic is handled manually before regex matching:
- #
- # * Preprocessor-style directives: if we see '#', we check whether
- # it's a #line or #pragma directive and consume it inline. #line
- # updates lineno/filename and produces no tokens. #pragma can yield
- # both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
- # so we stash the PPPRAGMASTR as _pending_tok to return on the next
- # token() call. Otherwise we return PPHASH.
- # * Newlines update lineno/line-start tracking so tokens can record
- # accurate columns.
- #
- # - The bulk of tokens are recognized in _match_token:
- #
- # * _regex_rules: regex patterns for identifiers, literals, and other
- # complex tokens (including error-producing patterns). The lexer
- # uses a combined _regex_master to scan options at the same time.
- # * _fixed_tokens: exact string matches for operators and punctuation,
- # resolved by longest match.
- #
- # - Error patterns call the error callback and advance minimally, which
- # keeps lexing resilient while reporting useful diagnostics.
- text = self._lexdata
- n = len(text)
- if self._pending_tok is not None:
- tok = self._pending_tok
- self._pending_tok = None
- return tok
- while self._pos < n:
- match text[self._pos]:
- case " " | "\t":
- self._pos += 1
- case "\n":
- self._lineno += 1
- self._pos += 1
- self._line_start = self._pos
- case "#":
- if _line_pattern.match(text, self._pos + 1):
- self._pos += 1
- self._handle_ppline()
- continue
- if _pragma_pattern.match(text, self._pos + 1):
- self._pos += 1
- toks = self._handle_pppragma()
- if len(toks) > 1:
- self._pending_tok = toks[1]
- if len(toks) > 0:
- return toks[0]
- continue
- tok = self._make_token("PPHASH", "#", self._pos)
- self._pos += 1
- return tok
- case _:
- if tok := self._match_token():
- return tok
- else:
- continue
- def _match_token(self) -> Optional[_Token]:
- """Match one token at the current position.
- Returns a Token on success, or None if no token could be matched and
- an error was reported. This method always advances _pos by the matched
- length, or by 1 on error/no-match.
- """
- text = self._lexdata
- pos = self._pos
- # We pick the longest match between:
- # - the master regex (identifiers, literals, error patterns, etc.)
- # - fixed operator/punctuator literals from the bucket for text[pos]
- #
- # The longest match is required to ensure we properly lex something
- # like ".123" (a floating-point constant) as a single entity (with
- # FLOAT_CONST), rather than a PERIOD followed by a number.
- #
- # The fixed-literal buckets are already length-sorted, so within that
- # bucket we can take the first match. However, we still compare its
- # length to the regex match because the regex may have matched a longer
- # token that should take precedence.
- best = None
- if m := _regex_master.match(text, pos):
- tok_type = m.lastgroup
- # All master-regex alternatives are named; lastgroup shouldn't be None.
- assert tok_type is not None
- value = m.group(tok_type)
- length = len(value)
- action, msg = _regex_actions[tok_type]
- best = (length, tok_type, value, action, msg)
- if bucket := _fixed_tokens_by_first.get(text[pos]):
- for entry in bucket:
- if text.startswith(entry.literal, pos):
- length = len(entry.literal)
- if best is None or length > best[0]:
- best = (
- length,
- entry.tok_type,
- entry.literal,
- _RegexAction.TOKEN,
- None,
- )
- break
- if best is None:
- msg = f"Illegal character {repr(text[pos])}"
- self._error(msg, pos)
- self._pos += 1
- return None
- length, tok_type, value, action, msg = best
- if action == _RegexAction.ERROR:
- if tok_type == "BAD_CHAR_CONST":
- msg = f"Invalid char constant {value}"
- # All other ERROR rules provide a message.
- assert msg is not None
- self._error(msg, pos)
- self._pos += max(1, length)
- return None
- if action == _RegexAction.ID:
- tok_type = _keyword_map.get(value, "ID")
- if tok_type == "ID" and self.type_lookup_func(value):
- tok_type = "TYPEID"
- tok = self._make_token(tok_type, value, pos)
- self._pos += length
- if tok.type == "LBRACE":
- self.on_lbrace_func()
- elif tok.type == "RBRACE":
- self.on_rbrace_func()
- return tok
- def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
- """Create a Token at an absolute input position.
- Expects tok_type/value and the absolute byte offset pos in the current
- input. Does not advance lexer state; callers manage _pos themselves.
- Returns a Token with lineno/column computed from current line tracking.
- """
- column = pos - self._line_start + 1
- tok = _Token(tok_type, value, self._lineno, column)
- return tok
- def _error(self, msg: str, pos: int) -> None:
- column = pos - self._line_start + 1
- self.error_func(msg, self._lineno, column)
- def _handle_ppline(self) -> None:
- # Since #line directives aren't supposed to return tokens but should
- # only affect the lexer's state (update line/filename for coords), this
- # method does a bit of parsing on its own. It doesn't return anything,
- # but its side effect is to update self._pos past the directive, and
- # potentially update self._lineno and self._filename, based on the
- # directive's contents.
- #
- # Accepted #line forms from preprocessors:
- # - "#line 66 \"kwas\\df.h\""
- # - "# 9"
- # - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
- # - "# 1 \"file.h\" 3"
- # Errors we must report:
- # - "#line \"file.h\"" (filename before line number)
- # - "#line df" (garbage instead of number/string)
- #
- # We scan the directive line once (after an optional 'line' keyword),
- # validating the order: NUMBER, optional STRING, then any NUMBERs.
- # The NUMBERs tail is only accepted if a filename STRING was present.
- text = self._lexdata
- n = len(text)
- line_end = text.find("\n", self._pos)
- if line_end == -1:
- line_end = n
- line = text[self._pos : line_end]
- pos = 0
- line_len = len(line)
- def skip_ws() -> None:
- nonlocal pos
- while pos < line_len and line[pos] in " \t":
- pos += 1
- skip_ws()
- if line.startswith("line", pos):
- pos += 4
- def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
- if pp_line is None:
- self._error("line number missing in #line", self._pos + line_len)
- else:
- self._lineno = int(pp_line)
- if pp_filename is not None:
- self._filename = pp_filename
- self._pos = line_end + 1
- self._line_start = self._pos
- def fail(msg: str, offset: int) -> None:
- self._error(msg, self._pos + offset)
- self._pos = line_end + 1
- self._line_start = self._pos
- skip_ws()
- if pos >= line_len:
- success(None, None)
- return
- if line[pos] == '"':
- fail("filename before line number in #line", pos)
- return
- m = re.match(_decimal_constant, line[pos:])
- if not m:
- fail("invalid #line directive", pos)
- return
- pp_line = m.group(0)
- pos += len(pp_line)
- skip_ws()
- if pos >= line_len:
- success(pp_line, None)
- return
- if line[pos] != '"':
- fail("invalid #line directive", pos)
- return
- m = re.match(_string_literal, line[pos:])
- if not m:
- fail("invalid #line directive", pos)
- return
- pp_filename = m.group(0).lstrip('"').rstrip('"')
- pos += len(m.group(0))
- # Consume arbitrary sequence of numeric flags after the directive
- while True:
- skip_ws()
- if pos >= line_len:
- break
- m = re.match(_decimal_constant, line[pos:])
- if not m:
- fail("invalid #line directive", pos)
- return
- pos += len(m.group(0))
- success(pp_line, pp_filename)
- def _handle_pppragma(self) -> List[_Token]:
- # Parse a full #pragma line; returns a list of tokens with 1 or 2
- # tokens - PPPRAGMA and an optional PPPRAGMASTR. If an empty list is
- # returned, it means an error occurred, or we're at the end of input.
- #
- # Examples:
- # - "#pragma" -> PPPRAGMA only
- # - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
- # - "# pragma omp parallel private(th_id)" -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
- # - "#\tpragma {pack: 2, smack: 3}" -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
- text = self._lexdata
- n = len(text)
- pos = self._pos
- while pos < n and text[pos] in " \t":
- pos += 1
- if pos >= n:
- self._pos = pos
- return []
- if not text.startswith("pragma", pos):
- self._error("invalid #pragma directive", pos)
- self._pos = pos + 1
- return []
- pragma_pos = pos
- pos += len("pragma")
- toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]
- while pos < n and text[pos] in " \t":
- pos += 1
- start = pos
- while pos < n and text[pos] != "\n":
- pos += 1
- if pos > start:
- toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
- if pos < n and text[pos] == "\n":
- self._lineno += 1
- pos += 1
- self._line_start = pos
- self._pos = pos
- return toks
- ##
- ## Reserved keywords
- ##
# Token types of all reserved words, in upper case. The spelling that appears
# in C source is derived from these names below.
_keywords: Tuple[str, ...] = (
    "AUTO", "BREAK", "CASE", "CHAR", "CONST",
    "CONTINUE", "DEFAULT", "DO", "DOUBLE", "ELSE",
    "ENUM", "EXTERN", "FLOAT", "FOR", "GOTO",
    "IF", "INLINE", "INT", "LONG", "REGISTER",
    "OFFSETOF", "RESTRICT", "RETURN", "SHORT", "SIGNED",
    "SIZEOF", "STATIC", "STRUCT", "SWITCH", "TYPEDEF",
    "UNION", "UNSIGNED", "VOID", "VOLATILE", "WHILE",
    "__INT128",
    "_BOOL", "_COMPLEX", "_NORETURN", "_THREAD_LOCAL", "_STATIC_ASSERT",
    "_ATOMIC", "_ALIGNOF", "_ALIGNAS", "_PRAGMA",
)

# Maps the source spelling of each keyword to its token type.
_keyword_map: Dict[str, str] = {}

for keyword in _keywords:
    if keyword[:1] == "_" and keyword[1:2].isalpha():
        # Keywords from newer C standards are mixed-case (_Bool, _Alignas,
        # ...): underscore, one capital letter, the rest in lower case.
        _keyword_map["_" + keyword[1:].capitalize()] = keyword
    else:
        # Everything else, including __int128, is spelled in lower case.
        _keyword_map[keyword.lower()] = keyword
##
## Regexes for use in tokens
##
# These are plain pattern strings (not compiled). They are concatenated into
# larger patterns below, so each fragment must stay valid when embedded.

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
# Optional u/l/ll suffix in the accepted upper/lower-case combinations.
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
# Note: this is a bare top-level alternation and it includes the optional
# integer suffix, so a match may contain more than just digits.
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

# A leading-zero constant that runs into an 8 or 9.
_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
# (only the comment openers are matched, so they can be reported as errors)
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: all letters and the punctuation characters . _ ~ ! = & ^ - are allowed
# as escape chars to support #line directives with Windows paths as filenames
# (..\..\dir\file).
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
#   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
#   decimal_escape = r"""(\d+)"""
#   hex_escape = r"""(x[0-9a-fA-F]+)"""
#   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking: (https://github.com/eliben/pycparser/issues/61)
#
# - \x was removed from simple_escape, unless it is not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex.
# - decimal_escape allows one or more decimal characters, but requires that the
#   next character (if any) is not a decimal.
# - bad_escape excludes all decimal digits (0-9, not only 0-7), to avoid
#   conflicting with the permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# The lookahead-based _escape_sequence above might be slow for strings. Since
# every valid escape (including \x) may be followed by zero or more ordinary
# characters inside a string, simple_escape + decimal_escape + hex_escape are
# collapsed there into "a backslash plus one permitted character":
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

# One character inside a character constant: anything except quote, backslash
# and newline, or a full escape sequence.
_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
# An opening quote that reaches a newline or the end of input unclosed.
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
# Too many characters, an empty constant (''), or one starting with a bad escape.
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
# A string that contains at least one bad escape somewhere inside it.
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
# Bare alternation: ".5" / "1.5" or "1." -- always wrapped in a group by users.
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
# Hexadecimal floats: the binary (p/P) exponent is mandatory in this pattern.
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)
class _RegexAction(Enum):
    """What the lexer does with text matched by a regex rule."""

    # Emit the matched text as a token of the rule's type.
    TOKEN = 0
    # Identifier: the final token type is decided by keyword/typedef lookup.
    ID = 1
    # Report a lexing error and skip the matched text.
    ERROR = 2
@dataclass(frozen=True)
class _RegexRule:
    """One regex-based lexing rule (immutable)."""

    # Name of the token emitted for a match.
    tok_type: str
    # The raw regex (no anchors) to match at the current position.
    regex_pattern: str
    # TOKEN for normal tokens, ID for identifiers, ERROR to report an error.
    action: _RegexAction
    # Message used for ERROR entries; None otherwise.
    error_message: Optional[str]
# Regex-based lexing rules. The order is significant: the rules are joined
# into one alternation below, and the regex engine takes the first alternative
# that matches at the current position (e.g. BAD_CONST_OCT is tried before
# INT_CONST_OCT, and ID comes last).
_regex_rules: List[_RegexRule] = [
    # --- unsupported constructs and malformed strings ---
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    # --- string literals ---
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    # --- numeric constants ---
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    # --- character constants ---
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    # BAD_CHAR_CONST has no fixed message; the lexer builds one from the
    # matched text.
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    # --- identifiers and keywords ---
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]

# tok_type -> (action, error message), looked up after a master-regex match.
_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
# One named group per rule, in rule order.
_regex_pattern_parts: List[str] = []

for _rule in _regex_rules:
    _regex_pattern_parts.append(
        "(?P<" + _rule.tok_type + ">" + _rule.regex_pattern + ")"
    )
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)

# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
@dataclass(frozen=True)
class _FixedToken:
    """An operator or punctuator recognised by exact text (immutable)."""

    # Name of the token emitted for this literal.
    tok_type: str
    # The exact source text, e.g. ">>=".
    literal: str
# All operators and punctuators that are matched by exact text.
_fixed_tokens: List[_FixedToken] = [
    # three characters
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    # two characters
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    # one character
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}

for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)

for _bucket in _fixed_tokens_by_first.values():
    # Longest literal first; the sort is stable, so literals of equal length
    # keep their table order.
    _bucket.sort(key=lambda item: -len(item.literal))
# Both patterns are matched at the position right after a '#' to decide
# whether it starts a directive the lexer handles itself.
#
# The keyword must not run into further word characters ("#linear" and
# "#pragmatic" are not directives). A negative lookahead is used instead of
# \W so that a bare "#line" or "#pragma" at the very end of the input, with
# no following character at all, is still recognised as a directive.
#
# A line directive is either the 'line' keyword or the bare "# <digits>" form.
_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line(?!\w))|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma(?!\w)")
|