c_lexer.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706
  1. # ------------------------------------------------------------------------------
  2. # pycparser: c_lexer.py
  3. #
  4. # CLexer class: lexer for the C language
  5. #
  6. # Eli Bendersky [https://eli.thegreenplace.net/]
  7. # License: BSD
  8. # ------------------------------------------------------------------------------
  9. import re
  10. from dataclasses import dataclass
  11. from enum import Enum
  12. from typing import Callable, Dict, List, Optional, Tuple
  13. @dataclass(slots=True)
  14. class _Token:
  15. type: str
  16. value: str
  17. lineno: int
  18. column: int
  19. class CLexer:
  20. """A standalone lexer for C.
  21. Parameters for construction:
  22. error_func:
  23. Called with (msg, line, column) on lexing errors.
  24. on_lbrace_func:
  25. Called when an LBRACE token is produced (used for scope tracking).
  26. on_rbrace_func:
  27. Called when an RBRACE token is produced (used for scope tracking).
  28. type_lookup_func:
  29. Called with an identifier name; expected to return True if it is
  30. a typedef name and should be tokenized as TYPEID.
  31. Call input(text) to initialize lexing, and then keep calling token() to
  32. get the next token, until it returns None (at end of input).
  33. """
  34. def __init__(
  35. self,
  36. error_func: Callable[[str, int, int], None],
  37. on_lbrace_func: Callable[[], None],
  38. on_rbrace_func: Callable[[], None],
  39. type_lookup_func: Callable[[str], bool],
  40. ) -> None:
  41. self.error_func = error_func
  42. self.on_lbrace_func = on_lbrace_func
  43. self.on_rbrace_func = on_rbrace_func
  44. self.type_lookup_func = type_lookup_func
  45. self._init_state()
  46. def input(self, text: str, filename: str = "") -> None:
  47. """Initialize the lexer to the given input text.
  48. filename is an optional name identifying the file from which the input
  49. comes. The lexer can modify it if #line directives are encountered.
  50. """
  51. self._init_state()
  52. self._lexdata = text
  53. self._filename = filename
  54. def _init_state(self) -> None:
  55. self._lexdata = ""
  56. self._filename = ""
  57. self._pos = 0
  58. self._line_start = 0
  59. self._pending_tok: Optional[_Token] = None
  60. self._lineno = 1
  61. @property
  62. def filename(self) -> str:
  63. return self._filename
  64. def token(self) -> Optional[_Token]:
  65. # Lexing strategy overview:
  66. #
  67. # - We maintain a current position (self._pos), line number, and the
  68. # byte offset of the current line start. The lexer is a simple loop
  69. # that skips whitespace/newlines and emits one token per call.
  70. # - A small amount of logic is handled manually before regex matching:
  71. #
  72. # * Preprocessor-style directives: if we see '#', we check whether
  73. # it's a #line or #pragma directive and consume it inline. #line
  74. # updates lineno/filename and produces no tokens. #pragma can yield
  75. # both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
  76. # so we stash the PPPRAGMASTR as _pending_tok to return on the next
  77. # token() call. Otherwise we return PPHASH.
  78. # * Newlines update lineno/line-start tracking so tokens can record
  79. # accurate columns.
  80. #
  81. # - The bulk of tokens are recognized in _match_token:
  82. #
  83. # * _regex_rules: regex patterns for identifiers, literals, and other
  84. # complex tokens (including error-producing patterns). The lexer
  85. # uses a combined _regex_master to scan options at the same time.
  86. # * _fixed_tokens: exact string matches for operators and punctuation,
  87. # resolved by longest match.
  88. #
  89. # - Error patterns call the error callback and advance minimally, which
  90. # keeps lexing resilient while reporting useful diagnostics.
  91. text = self._lexdata
  92. n = len(text)
  93. if self._pending_tok is not None:
  94. tok = self._pending_tok
  95. self._pending_tok = None
  96. return tok
  97. while self._pos < n:
  98. match text[self._pos]:
  99. case " " | "\t":
  100. self._pos += 1
  101. case "\n":
  102. self._lineno += 1
  103. self._pos += 1
  104. self._line_start = self._pos
  105. case "#":
  106. if _line_pattern.match(text, self._pos + 1):
  107. self._pos += 1
  108. self._handle_ppline()
  109. continue
  110. if _pragma_pattern.match(text, self._pos + 1):
  111. self._pos += 1
  112. toks = self._handle_pppragma()
  113. if len(toks) > 1:
  114. self._pending_tok = toks[1]
  115. if len(toks) > 0:
  116. return toks[0]
  117. continue
  118. tok = self._make_token("PPHASH", "#", self._pos)
  119. self._pos += 1
  120. return tok
  121. case _:
  122. if tok := self._match_token():
  123. return tok
  124. else:
  125. continue
  126. def _match_token(self) -> Optional[_Token]:
  127. """Match one token at the current position.
  128. Returns a Token on success, or None if no token could be matched and
  129. an error was reported. This method always advances _pos by the matched
  130. length, or by 1 on error/no-match.
  131. """
  132. text = self._lexdata
  133. pos = self._pos
  134. # We pick the longest match between:
  135. # - the master regex (identifiers, literals, error patterns, etc.)
  136. # - fixed operator/punctuator literals from the bucket for text[pos]
  137. #
  138. # The longest match is required to ensure we properly lex something
  139. # like ".123" (a floating-point constant) as a single entity (with
  140. # FLOAT_CONST), rather than a PERIOD followed by a number.
  141. #
  142. # The fixed-literal buckets are already length-sorted, so within that
  143. # bucket we can take the first match. However, we still compare its
  144. # length to the regex match because the regex may have matched a longer
  145. # token that should take precedence.
  146. best = None
  147. if m := _regex_master.match(text, pos):
  148. tok_type = m.lastgroup
  149. # All master-regex alternatives are named; lastgroup shouldn't be None.
  150. assert tok_type is not None
  151. value = m.group(tok_type)
  152. length = len(value)
  153. action, msg = _regex_actions[tok_type]
  154. best = (length, tok_type, value, action, msg)
  155. if bucket := _fixed_tokens_by_first.get(text[pos]):
  156. for entry in bucket:
  157. if text.startswith(entry.literal, pos):
  158. length = len(entry.literal)
  159. if best is None or length > best[0]:
  160. best = (
  161. length,
  162. entry.tok_type,
  163. entry.literal,
  164. _RegexAction.TOKEN,
  165. None,
  166. )
  167. break
  168. if best is None:
  169. msg = f"Illegal character {repr(text[pos])}"
  170. self._error(msg, pos)
  171. self._pos += 1
  172. return None
  173. length, tok_type, value, action, msg = best
  174. if action == _RegexAction.ERROR:
  175. if tok_type == "BAD_CHAR_CONST":
  176. msg = f"Invalid char constant {value}"
  177. # All other ERROR rules provide a message.
  178. assert msg is not None
  179. self._error(msg, pos)
  180. self._pos += max(1, length)
  181. return None
  182. if action == _RegexAction.ID:
  183. tok_type = _keyword_map.get(value, "ID")
  184. if tok_type == "ID" and self.type_lookup_func(value):
  185. tok_type = "TYPEID"
  186. tok = self._make_token(tok_type, value, pos)
  187. self._pos += length
  188. if tok.type == "LBRACE":
  189. self.on_lbrace_func()
  190. elif tok.type == "RBRACE":
  191. self.on_rbrace_func()
  192. return tok
  193. def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
  194. """Create a Token at an absolute input position.
  195. Expects tok_type/value and the absolute byte offset pos in the current
  196. input. Does not advance lexer state; callers manage _pos themselves.
  197. Returns a Token with lineno/column computed from current line tracking.
  198. """
  199. column = pos - self._line_start + 1
  200. tok = _Token(tok_type, value, self._lineno, column)
  201. return tok
  202. def _error(self, msg: str, pos: int) -> None:
  203. column = pos - self._line_start + 1
  204. self.error_func(msg, self._lineno, column)
  205. def _handle_ppline(self) -> None:
  206. # Since #line directives aren't supposed to return tokens but should
  207. # only affect the lexer's state (update line/filename for coords), this
  208. # method does a bit of parsing on its own. It doesn't return anything,
  209. # but its side effect is to update self._pos past the directive, and
  210. # potentially update self._lineno and self._filename, based on the
  211. # directive's contents.
  212. #
  213. # Accepted #line forms from preprocessors:
  214. # - "#line 66 \"kwas\\df.h\""
  215. # - "# 9"
  216. # - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
  217. # - "# 1 \"file.h\" 3"
  218. # Errors we must report:
  219. # - "#line \"file.h\"" (filename before line number)
  220. # - "#line df" (garbage instead of number/string)
  221. #
  222. # We scan the directive line once (after an optional 'line' keyword),
  223. # validating the order: NUMBER, optional STRING, then any NUMBERs.
  224. # The NUMBERs tail is only accepted if a filename STRING was present.
  225. text = self._lexdata
  226. n = len(text)
  227. line_end = text.find("\n", self._pos)
  228. if line_end == -1:
  229. line_end = n
  230. line = text[self._pos : line_end]
  231. pos = 0
  232. line_len = len(line)
  233. def skip_ws() -> None:
  234. nonlocal pos
  235. while pos < line_len and line[pos] in " \t":
  236. pos += 1
  237. skip_ws()
  238. if line.startswith("line", pos):
  239. pos += 4
  240. def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
  241. if pp_line is None:
  242. self._error("line number missing in #line", self._pos + line_len)
  243. else:
  244. self._lineno = int(pp_line)
  245. if pp_filename is not None:
  246. self._filename = pp_filename
  247. self._pos = line_end + 1
  248. self._line_start = self._pos
  249. def fail(msg: str, offset: int) -> None:
  250. self._error(msg, self._pos + offset)
  251. self._pos = line_end + 1
  252. self._line_start = self._pos
  253. skip_ws()
  254. if pos >= line_len:
  255. success(None, None)
  256. return
  257. if line[pos] == '"':
  258. fail("filename before line number in #line", pos)
  259. return
  260. m = re.match(_decimal_constant, line[pos:])
  261. if not m:
  262. fail("invalid #line directive", pos)
  263. return
  264. pp_line = m.group(0)
  265. pos += len(pp_line)
  266. skip_ws()
  267. if pos >= line_len:
  268. success(pp_line, None)
  269. return
  270. if line[pos] != '"':
  271. fail("invalid #line directive", pos)
  272. return
  273. m = re.match(_string_literal, line[pos:])
  274. if not m:
  275. fail("invalid #line directive", pos)
  276. return
  277. pp_filename = m.group(0).lstrip('"').rstrip('"')
  278. pos += len(m.group(0))
  279. # Consume arbitrary sequence of numeric flags after the directive
  280. while True:
  281. skip_ws()
  282. if pos >= line_len:
  283. break
  284. m = re.match(_decimal_constant, line[pos:])
  285. if not m:
  286. fail("invalid #line directive", pos)
  287. return
  288. pos += len(m.group(0))
  289. success(pp_line, pp_filename)
  290. def _handle_pppragma(self) -> List[_Token]:
  291. # Parse a full #pragma line; returns a list of tokens with 1 or 2
  292. # tokens - PPPRAGMA and an optional PPPRAGMASTR. If an empty list is
  293. # returned, it means an error occurred, or we're at the end of input.
  294. #
  295. # Examples:
  296. # - "#pragma" -> PPPRAGMA only
  297. # - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
  298. # - "# pragma omp parallel private(th_id)" -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
  299. # - "#\tpragma {pack: 2, smack: 3}" -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
  300. text = self._lexdata
  301. n = len(text)
  302. pos = self._pos
  303. while pos < n and text[pos] in " \t":
  304. pos += 1
  305. if pos >= n:
  306. self._pos = pos
  307. return []
  308. if not text.startswith("pragma", pos):
  309. self._error("invalid #pragma directive", pos)
  310. self._pos = pos + 1
  311. return []
  312. pragma_pos = pos
  313. pos += len("pragma")
  314. toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]
  315. while pos < n and text[pos] in " \t":
  316. pos += 1
  317. start = pos
  318. while pos < n and text[pos] != "\n":
  319. pos += 1
  320. if pos > start:
  321. toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
  322. if pos < n and text[pos] == "\n":
  323. self._lineno += 1
  324. pos += 1
  325. self._line_start = pos
  326. self._pos = pos
  327. return toks
  328. ##
  329. ## Reserved keywords
  330. ##
  331. _keywords: Tuple[str, ...] = (
  332. "AUTO",
  333. "BREAK",
  334. "CASE",
  335. "CHAR",
  336. "CONST",
  337. "CONTINUE",
  338. "DEFAULT",
  339. "DO",
  340. "DOUBLE",
  341. "ELSE",
  342. "ENUM",
  343. "EXTERN",
  344. "FLOAT",
  345. "FOR",
  346. "GOTO",
  347. "IF",
  348. "INLINE",
  349. "INT",
  350. "LONG",
  351. "REGISTER",
  352. "OFFSETOF",
  353. "RESTRICT",
  354. "RETURN",
  355. "SHORT",
  356. "SIGNED",
  357. "SIZEOF",
  358. "STATIC",
  359. "STRUCT",
  360. "SWITCH",
  361. "TYPEDEF",
  362. "UNION",
  363. "UNSIGNED",
  364. "VOID",
  365. "VOLATILE",
  366. "WHILE",
  367. "__INT128",
  368. "_BOOL",
  369. "_COMPLEX",
  370. "_NORETURN",
  371. "_THREAD_LOCAL",
  372. "_STATIC_ASSERT",
  373. "_ATOMIC",
  374. "_ALIGNOF",
  375. "_ALIGNAS",
  376. "_PRAGMA",
  377. )
  378. _keyword_map: Dict[str, str] = {}
  379. for keyword in _keywords:
  380. # Keywords from new C standard are mixed-case, like _Bool, _Alignas, etc.
  381. if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
  382. _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
  383. else:
  384. _keyword_map[keyword.lower()] = keyword
  385. ##
  386. ## Regexes for use in tokens
  387. ##
  388. # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
  389. _identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"
  390. _hex_prefix = "0[xX]"
  391. _hex_digits = "[0-9a-fA-F]+"
  392. _bin_prefix = "0[bB]"
  393. _bin_digits = "[01]+"
  394. # integer constants (K&R2: A.2.5.1)
  395. _integer_suffix_opt = (
  396. r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
  397. )
  398. _decimal_constant = (
  399. "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
  400. )
  401. _octal_constant = "0[0-7]*" + _integer_suffix_opt
  402. _hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
  403. _bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt
  404. _bad_octal_constant = "0[0-7]*[89]"
  405. # comments are not supported
  406. _unsupported_c_style_comment = r"\/\*"
  407. _unsupported_cxx_style_comment = r"\/\/"
  408. # character constants (K&R2: A.2.5.2)
  409. # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  410. # directives with Windows paths as filenames (..\..\dir\file)
  411. # For the same reason, decimal_escape allows all digit sequences. We want to
  412. # parse all correct code, even if it means to sometimes parse incorrect
  413. # code.
  414. #
  415. # The original regexes were taken verbatim from the C syntax definition,
  416. # and were later modified to avoid worst-case exponential running time.
  417. #
  418. # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  419. # decimal_escape = r"""(\d+)"""
  420. # hex_escape = r"""(x[0-9a-fA-F]+)"""
  421. # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
  422. #
  423. # The following modifications were made to avoid the ambiguity that allowed
  424. # backtracking: (https://github.com/eliben/pycparser/issues/61)
  425. #
  426. # - \x was removed from simple_escape, unless it was not followed by a hex
  427. # digit, to avoid ambiguity with hex_escape.
  428. # - hex_escape allows one or more hex characters, but requires that the next
  429. # character(if any) is not hex
  430. # - decimal_escape allows one or more decimal characters, but requires that the
  431. # next character(if any) is not a decimal
  432. # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
  433. # permissive decimal_escape.
  434. #
  435. # Without this change, python's `re` module would recursively try parsing each
  436. # ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
  437. # `\1`+`23`, `\12`+`3`, and `\123`.
  438. _simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
  439. _decimal_escape = r"""(\d+)(?!\d)"""
  440. _hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
  441. _bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
  442. _escape_sequence = (
  443. r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
  444. )
  445. # This complicated regex with lookahead might be slow for strings, so because
  446. # all of the valid escapes (including \x) allowed
  447. # 0 or more non-escaped characters after the first character,
  448. # simple_escape+decimal_escape+hex_escape got simplified to
  449. _escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
  450. _cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
  451. _char_const = "'" + _cconst_char + "'"
  452. _wchar_const = "L" + _char_const
  453. _u8char_const = "u8" + _char_const
  454. _u16char_const = "u" + _char_const
  455. _u32char_const = "U" + _char_const
  456. _multicharacter_constant = "'" + _cconst_char + "{2,4}'"
  457. _unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
  458. _bad_char_const = (
  459. r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
  460. )
  461. # string literals (K&R2: A.2.6)
  462. _string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
  463. _string_literal = '"' + _string_char + '*"'
  464. _wstring_literal = "L" + _string_literal
  465. _u8string_literal = "u8" + _string_literal
  466. _u16string_literal = "u" + _string_literal
  467. _u32string_literal = "U" + _string_literal
  468. _bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'
  469. # floating constants (K&R2: A.2.5.3)
  470. _exponent_part = r"""([eE][-+]?[0-9]+)"""
  471. _fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  472. _floating_constant = (
  473. "(((("
  474. + _fractional_constant
  475. + ")"
  476. + _exponent_part
  477. + "?)|([0-9]+"
  478. + _exponent_part
  479. + "))[FfLl]?)"
  480. )
  481. _binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
  482. _hex_fractional_constant = (
  483. "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
  484. )
  485. _hex_floating_constant = (
  486. "("
  487. + _hex_prefix
  488. + "("
  489. + _hex_digits
  490. + "|"
  491. + _hex_fractional_constant
  492. + ")"
  493. + _binary_exponent_part
  494. + "[FfLl]?)"
  495. )
  496. class _RegexAction(Enum):
  497. TOKEN = 0
  498. ID = 1
  499. ERROR = 2
  500. @dataclass(frozen=True)
  501. class _RegexRule:
  502. # tok_type: name of the token emitted for a match
  503. # regex_pattern: the raw regex (no anchors) to match at the current position
  504. # action: TOKEN for normal tokens, ID for identifiers, ERROR to report
  505. # error_message: message used for ERROR entries
  506. tok_type: str
  507. regex_pattern: str
  508. action: _RegexAction
  509. error_message: Optional[str]
  510. _regex_rules: List[_RegexRule] = [
  511. _RegexRule(
  512. "UNSUPPORTED_C_STYLE_COMMENT",
  513. _unsupported_c_style_comment,
  514. _RegexAction.ERROR,
  515. "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
  516. ),
  517. _RegexRule(
  518. "UNSUPPORTED_CXX_STYLE_COMMENT",
  519. _unsupported_cxx_style_comment,
  520. _RegexAction.ERROR,
  521. "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
  522. ),
  523. _RegexRule(
  524. "BAD_STRING_LITERAL",
  525. _bad_string_literal,
  526. _RegexAction.ERROR,
  527. "String contains invalid escape code",
  528. ),
  529. _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
  530. _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
  531. _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
  532. _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
  533. _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
  534. _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
  535. _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
  536. _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
  537. _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
  538. _RegexRule(
  539. "BAD_CONST_OCT",
  540. _bad_octal_constant,
  541. _RegexAction.ERROR,
  542. "Invalid octal constant",
  543. ),
  544. _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
  545. _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
  546. _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
  547. _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
  548. _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
  549. _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
  550. _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
  551. _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
  552. _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
  553. _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
  554. _RegexRule("ID", _identifier, _RegexAction.ID, None),
  555. ]
  556. _regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
  557. _regex_pattern_parts: List[str] = []
  558. for _rule in _regex_rules:
  559. _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
  560. _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
  561. # The master regex is a single alternation of all token patterns, each wrapped
  562. # in a named group. We match once at the current position and then use
  563. # `lastgroup` to recover which token kind fired; this avoids iterating over all
  564. # regexes on every character while keeping the same token-level semantics.
  565. _regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
  566. @dataclass(frozen=True)
  567. class _FixedToken:
  568. tok_type: str
  569. literal: str
  570. _fixed_tokens: List[_FixedToken] = [
  571. _FixedToken("ELLIPSIS", "..."),
  572. _FixedToken("LSHIFTEQUAL", "<<="),
  573. _FixedToken("RSHIFTEQUAL", ">>="),
  574. _FixedToken("PLUSPLUS", "++"),
  575. _FixedToken("MINUSMINUS", "--"),
  576. _FixedToken("ARROW", "->"),
  577. _FixedToken("LAND", "&&"),
  578. _FixedToken("LOR", "||"),
  579. _FixedToken("LSHIFT", "<<"),
  580. _FixedToken("RSHIFT", ">>"),
  581. _FixedToken("LE", "<="),
  582. _FixedToken("GE", ">="),
  583. _FixedToken("EQ", "=="),
  584. _FixedToken("NE", "!="),
  585. _FixedToken("TIMESEQUAL", "*="),
  586. _FixedToken("DIVEQUAL", "/="),
  587. _FixedToken("MODEQUAL", "%="),
  588. _FixedToken("PLUSEQUAL", "+="),
  589. _FixedToken("MINUSEQUAL", "-="),
  590. _FixedToken("ANDEQUAL", "&="),
  591. _FixedToken("OREQUAL", "|="),
  592. _FixedToken("XOREQUAL", "^="),
  593. _FixedToken("EQUALS", "="),
  594. _FixedToken("PLUS", "+"),
  595. _FixedToken("MINUS", "-"),
  596. _FixedToken("TIMES", "*"),
  597. _FixedToken("DIVIDE", "/"),
  598. _FixedToken("MOD", "%"),
  599. _FixedToken("OR", "|"),
  600. _FixedToken("AND", "&"),
  601. _FixedToken("NOT", "~"),
  602. _FixedToken("XOR", "^"),
  603. _FixedToken("LNOT", "!"),
  604. _FixedToken("LT", "<"),
  605. _FixedToken("GT", ">"),
  606. _FixedToken("CONDOP", "?"),
  607. _FixedToken("LPAREN", "("),
  608. _FixedToken("RPAREN", ")"),
  609. _FixedToken("LBRACKET", "["),
  610. _FixedToken("RBRACKET", "]"),
  611. _FixedToken("LBRACE", "{"),
  612. _FixedToken("RBRACE", "}"),
  613. _FixedToken("COMMA", ","),
  614. _FixedToken("PERIOD", "."),
  615. _FixedToken("SEMI", ";"),
  616. _FixedToken("COLON", ":"),
  617. ]
  618. # To avoid scanning all fixed tokens on every character, we bucket them by the
  619. # first character. When matching at position i, we only look at the bucket for
  620. # text[i], and we pre-sort that bucket by token length so the first match is
  621. # also the longest. This preserves longest-match semantics (e.g. '>>=' before
  622. # '>>' before '>') while reducing the number of comparisons.
  623. _fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
  624. for _entry in _fixed_tokens:
  625. _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
  626. for _bucket in _fixed_tokens_by_first.values():
  627. _bucket.sort(key=lambda item: len(item.literal), reverse=True)
  628. _line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
  629. _pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")