lexer.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
  1. # Lexer Implementation
  2. from abc import abstractmethod, ABC
  3. import re
  4. from typing import (
  5. TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
  6. ClassVar, TYPE_CHECKING, overload
  7. )
  8. from types import ModuleType
  9. import warnings
  10. try:
  11. import interegular
  12. except ImportError:
  13. pass
  14. if TYPE_CHECKING:
  15. from .common import LexerConf
  16. from .parsers.lalr_parser_state import ParserState
  17. from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
  18. from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
  19. from .grammar import TOKEN_DEFAULT_PRIORITY
  20. ###{standalone
  21. from contextlib import suppress
  22. from copy import copy
  23. try: # For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on
  24. has_interegular = bool(interegular)
  25. except NameError:
  26. has_interegular = False
  27. class Pattern(Serialize, ABC):
  28. "An abstraction over regular expressions."
  29. value: str
  30. flags: Collection[str]
  31. raw: Optional[str]
  32. type: ClassVar[str]
  33. def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None:
  34. self.value = value
  35. self.flags = frozenset(flags)
  36. self.raw = raw
  37. def __repr__(self):
  38. return repr(self.to_regexp())
  39. # Pattern Hashing assumes all subclasses have a different priority!
  40. def __hash__(self):
  41. return hash((type(self), self.value, self.flags))
  42. def __eq__(self, other):
  43. return type(self) == type(other) and self.value == other.value and self.flags == other.flags
  44. @abstractmethod
  45. def to_regexp(self) -> str:
  46. raise NotImplementedError()
  47. @property
  48. @abstractmethod
  49. def min_width(self) -> int:
  50. raise NotImplementedError()
  51. @property
  52. @abstractmethod
  53. def max_width(self) -> int:
  54. raise NotImplementedError()
  55. def _get_flags(self, value):
  56. for f in self.flags:
  57. value = ('(?%s:%s)' % (f, value))
  58. return value
  59. class PatternStr(Pattern):
  60. __serialize_fields__ = 'value', 'flags', 'raw'
  61. type: ClassVar[str] = "str"
  62. def to_regexp(self) -> str:
  63. return self._get_flags(re.escape(self.value))
  64. @property
  65. def min_width(self) -> int:
  66. return len(self.value)
  67. @property
  68. def max_width(self) -> int:
  69. return len(self.value)
  70. class PatternRE(Pattern):
  71. __serialize_fields__ = 'value', 'flags', 'raw', '_width'
  72. type: ClassVar[str] = "re"
  73. def to_regexp(self) -> str:
  74. return self._get_flags(self.value)
  75. _width = None
  76. def _get_width(self):
  77. if self._width is None:
  78. self._width = get_regexp_width(self.to_regexp())
  79. return self._width
  80. @property
  81. def min_width(self) -> int:
  82. return self._get_width()[0]
  83. @property
  84. def max_width(self) -> int:
  85. return self._get_width()[1]
  86. class TerminalDef(Serialize):
  87. "A definition of a terminal"
  88. __serialize_fields__ = 'name', 'pattern', 'priority'
  89. __serialize_namespace__ = PatternStr, PatternRE
  90. name: str
  91. pattern: Pattern
  92. priority: int
  93. def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None:
  94. assert isinstance(pattern, Pattern), pattern
  95. self.name = name
  96. self.pattern = pattern
  97. self.priority = priority
  98. def __repr__(self):
  99. return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
  100. def user_repr(self) -> str:
  101. if self.name.startswith('__'): # We represent a generated terminal
  102. return self.pattern.raw or self.name
  103. else:
  104. return self.name
  105. _T = TypeVar('_T', bound="Token")
  106. class Token(str):
  107. """A string with meta-information, that is produced by the lexer.
  108. When parsing text, the resulting chunks of the input that haven't been discarded,
  109. will end up in the tree as Token instances. The Token class inherits from Python's ``str``,
  110. so normal string comparisons and operations will work as expected.
  111. Attributes:
  112. type: Name of the token (as specified in grammar)
  113. value: Value of the token (redundant, as ``token.value == token`` will always be true)
  114. start_pos: The index of the token in the text
  115. line: The line of the token in the text (starting with 1)
  116. column: The column of the token in the text (starting with 1)
  117. end_line: The line where the token ends
  118. end_column: The next column after the end of the token. For example,
  119. if the token is a single character with a column value of 4,
  120. end_column will be 5.
  121. end_pos: the index where the token ends (basically ``start_pos + len(token)``)
  122. """
  123. __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')
  124. __match_args__ = ('type', 'value')
  125. type: str
  126. start_pos: Optional[int]
  127. value: Any
  128. line: Optional[int]
  129. column: Optional[int]
  130. end_line: Optional[int]
  131. end_column: Optional[int]
  132. end_pos: Optional[int]
  133. @overload
  134. def __new__(
  135. cls,
  136. type: str,
  137. value: Any,
  138. start_pos: Optional[int] = None,
  139. line: Optional[int] = None,
  140. column: Optional[int] = None,
  141. end_line: Optional[int] = None,
  142. end_column: Optional[int] = None,
  143. end_pos: Optional[int] = None
  144. ) -> 'Token':
  145. ...
  146. @overload
  147. def __new__(
  148. cls,
  149. type_: str,
  150. value: Any,
  151. start_pos: Optional[int] = None,
  152. line: Optional[int] = None,
  153. column: Optional[int] = None,
  154. end_line: Optional[int] = None,
  155. end_column: Optional[int] = None,
  156. end_pos: Optional[int] = None
  157. ) -> 'Token': ...
  158. def __new__(cls, *args, **kwargs):
  159. if "type_" in kwargs:
  160. warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning)
  161. if "type" in kwargs:
  162. raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
  163. kwargs["type"] = kwargs.pop("type_")
  164. return cls._future_new(*args, **kwargs)
  165. @classmethod
  166. def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
  167. inst = super(Token, cls).__new__(cls, value)
  168. inst.type = type
  169. inst.start_pos = start_pos
  170. inst.value = value
  171. inst.line = line
  172. inst.column = column
  173. inst.end_line = end_line
  174. inst.end_column = end_column
  175. inst.end_pos = end_pos
  176. return inst
  177. @overload
  178. def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
  179. ...
  180. @overload
  181. def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
  182. ...
  183. def update(self, *args, **kwargs):
  184. if "type_" in kwargs:
  185. warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning)
  186. if "type" in kwargs:
  187. raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
  188. kwargs["type"] = kwargs.pop("type_")
  189. return self._future_update(*args, **kwargs)
  190. def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
  191. return Token.new_borrow_pos(
  192. type if type is not None else self.type,
  193. value if value is not None else self.value,
  194. self
  195. )
  196. @classmethod
  197. def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> _T:
  198. return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)
  199. def __reduce__(self):
  200. return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))
  201. def __repr__(self):
  202. return 'Token(%r, %r)' % (self.type, self.value)
  203. def __deepcopy__(self, memo):
  204. return Token(self.type, self.value, self.start_pos, self.line, self.column)
  205. def __eq__(self, other):
  206. if isinstance(other, Token) and self.type != other.type:
  207. return False
  208. return str.__eq__(self, other)
  209. __hash__ = str.__hash__
  210. class LineCounter:
  211. "A utility class for keeping track of line & column information"
  212. __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
  213. def __init__(self, newline_char):
  214. self.newline_char = newline_char
  215. self.char_pos = 0
  216. self.line = 1
  217. self.column = 1
  218. self.line_start_pos = 0
  219. def __eq__(self, other):
  220. if not isinstance(other, LineCounter):
  221. return NotImplemented
  222. return self.char_pos == other.char_pos and self.newline_char == other.newline_char
  223. def feed(self, token: TextOrSlice, test_newline=True):
  224. """Consume a token and calculate the new line & column.
  225. As an optional optimization, set test_newline=False if token doesn't contain a newline.
  226. """
  227. if test_newline:
  228. newlines = token.count(self.newline_char)
  229. if newlines:
  230. self.line += newlines
  231. self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
  232. self.char_pos += len(token)
  233. self.column = self.char_pos - self.line_start_pos + 1
  234. class UnlessCallback:
  235. def __init__(self, scanner: 'Scanner'):
  236. self.scanner = scanner
  237. def __call__(self, t: Token):
  238. res = self.scanner.fullmatch(t.value)
  239. if res is not None:
  240. t.type = res
  241. return t
  242. class CallChain:
  243. def __init__(self, callback1, callback2, cond):
  244. self.callback1 = callback1
  245. self.callback2 = callback2
  246. self.cond = cond
  247. def __call__(self, t):
  248. t2 = self.callback1(t)
  249. return self.callback2(t) if self.cond(t2) else t2
  250. def _get_match(re_, regexp, s, flags):
  251. m = re_.match(regexp, s, flags)
  252. if m:
  253. return m.group(0)
  254. def _create_unless(terminals, g_regex_flags, re_, use_bytes):
  255. tokens_by_type = classify(terminals, lambda t: type(t.pattern))
  256. assert len(tokens_by_type) <= 2, tokens_by_type.keys()
  257. embedded_strs = set()
  258. callback = {}
  259. for retok in tokens_by_type.get(PatternRE, []):
  260. unless = []
  261. for strtok in tokens_by_type.get(PatternStr, []):
  262. if strtok.priority != retok.priority:
  263. continue
  264. s = strtok.pattern.value
  265. if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
  266. unless.append(strtok)
  267. if strtok.pattern.flags <= retok.pattern.flags:
  268. embedded_strs.add(strtok)
  269. if unless:
  270. callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes))
  271. new_terminals = [t for t in terminals if t not in embedded_strs]
  272. return new_terminals, callback
  273. class Scanner:
  274. def __init__(self, terminals, g_regex_flags, re_, use_bytes):
  275. self.terminals = terminals
  276. self.g_regex_flags = g_regex_flags
  277. self.re_ = re_
  278. self.use_bytes = use_bytes
  279. self.allowed_types = {t.name for t in self.terminals}
  280. self._mres = self._build_mres(terminals, len(terminals))
  281. def _build_mres(self, terminals, max_size):
  282. # Python sets an unreasonable group limit (currently 100) in its re module
  283. # Worse, the only way to know we reached it is by catching an AssertionError!
  284. # This function recursively tries less and less groups until it's successful.
  285. mres = []
  286. while terminals:
  287. pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size])
  288. if self.use_bytes:
  289. pattern = pattern.encode('latin-1')
  290. try:
  291. mre = self.re_.compile(pattern, self.g_regex_flags)
  292. except AssertionError: # Yes, this is what Python provides us.. :/
  293. return self._build_mres(terminals, max_size // 2)
  294. mres.append(mre)
  295. terminals = terminals[max_size:]
  296. return mres
  297. def match(self, text: TextSlice, pos):
  298. for mre in self._mres:
  299. m = mre.match(text.text, pos, text.end)
  300. if m:
  301. return m.group(0), m.lastgroup
  302. def fullmatch(self, text: str) -> Optional[str]:
  303. for mre in self._mres:
  304. m = mre.fullmatch(text)
  305. if m:
  306. return m.lastgroup
  307. return None
  308. def _regexp_has_newline(r: str):
  309. r"""Expressions that may indicate newlines in a regexp:
  310. - newlines (\n)
  311. - escaped newline (\\n)
  312. - anything but ([^...])
  313. - any-char (.) when the flag (?s) exists
  314. - spaces (\s)
  315. """
  316. return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
  317. class LexerState:
  318. """Represents the current state of the lexer as it scans the text
  319. (Lexer objects are only instantiated per grammar, not per text)
  320. """
  321. __slots__ = 'text', 'line_ctr', 'last_token'
  322. text: TextSlice
  323. line_ctr: LineCounter
  324. last_token: Optional[Token]
  325. def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
  326. if isinstance(text, TextSlice):
  327. if line_ctr is None:
  328. line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
  329. if text.start > 0:
  330. # Advance the line-count until line_ctr.char_pos == text.start
  331. line_ctr.feed(TextSlice(text.text, 0, text.start))
  332. if not (text.start <= line_ctr.char_pos <= text.end):
  333. raise ValueError("LineCounter.char_pos is out of bounds")
  334. self.text = text
  335. self.line_ctr = line_ctr
  336. self.last_token = last_token
  337. def __eq__(self, other):
  338. if not isinstance(other, LexerState):
  339. return NotImplemented
  340. return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
  341. def __copy__(self):
  342. return type(self)(self.text, copy(self.line_ctr), self.last_token)
  343. class LexerThread:
  344. """A thread that ties a lexer instance and a lexer state, to be used by the parser
  345. """
  346. def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
  347. self.lexer = lexer
  348. self.state = lexer_state
  349. @classmethod
  350. def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
  351. text = TextSlice.cast_from(text_or_slice)
  352. return cls(lexer, LexerState(text))
  353. @classmethod
  354. def from_custom_input(cls, lexer: 'Lexer', text: Any) -> 'LexerThread':
  355. return cls(lexer, LexerState(text))
  356. def lex(self, parser_state):
  357. if self.state is None:
  358. raise TypeError("Cannot lex: No text assigned to lexer state")
  359. return self.lexer.lex(self.state, parser_state)
  360. def __copy__(self):
  361. return type(self)(self.lexer, copy(self.state))
  362. _Token = Token
  363. _Callback = Callable[[Token], Token]
  364. class Lexer(ABC):
  365. """Lexer interface
  366. Method Signatures:
  367. lex(self, lexer_state, parser_state) -> Iterator[Token]
  368. """
  369. @abstractmethod
  370. def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
  371. return NotImplemented
  372. def make_lexer_state(self, text: str):
  373. "Deprecated"
  374. return LexerState(TextSlice.cast_from(text))
  375. def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
  376. if not comparator:
  377. comparator = interegular.Comparator.from_regexes(terminal_to_regexp)
  378. # When in strict mode, we only ever try to provide one example, so taking
  379. # a long time for that should be fine
  380. max_time = 2 if strict_mode else 0.2
  381. # We don't want to show too many collisions.
  382. if comparator.count_marked_pairs() >= max_collisions_to_show:
  383. return
  384. for group in classify(terminal_to_regexp, lambda t: t.priority).values():
  385. for a, b in comparator.check(group, skip_marked=True):
  386. assert a.priority == b.priority
  387. # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
  388. comparator.mark(a, b)
  389. # Notify the user
  390. message = f"Collision between Terminals {a.name} and {b.name}. "
  391. try:
  392. example = comparator.get_example_overlap(a, b, max_time).format_multiline()
  393. except ValueError:
  394. # Couldn't find an example within max_time steps.
  395. example = "No example could be found fast enough. However, the collision does still exists"
  396. if strict_mode:
  397. raise LexError(f"{message}\n{example}")
  398. logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)
  399. if comparator.count_marked_pairs() >= max_collisions_to_show:
  400. logger.warning("Found 8 regex collisions, will not check for more.")
  401. return
  402. class AbstractBasicLexer(Lexer):
  403. terminals_by_name: Dict[str, TerminalDef]
  404. @abstractmethod
  405. def __init__(self, conf: 'LexerConf', comparator=None) -> None:
  406. ...
  407. @abstractmethod
  408. def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
  409. ...
  410. def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
  411. with suppress(EOFError):
  412. while True:
  413. yield self.next_token(state, parser_state)
  414. class BasicLexer(AbstractBasicLexer):
  415. terminals: Collection[TerminalDef]
  416. ignore_types: FrozenSet[str]
  417. newline_types: FrozenSet[str]
  418. user_callbacks: Dict[str, _Callback]
  419. callback: Dict[str, _Callback]
  420. re: ModuleType
  421. def __init__(self, conf: 'LexerConf', comparator=None) -> None:
  422. terminals = list(conf.terminals)
  423. assert all(isinstance(t, TerminalDef) for t in terminals), terminals
  424. self.re = conf.re_module
  425. if not conf.skip_validation:
  426. # Sanitization
  427. terminal_to_regexp = {}
  428. for t in terminals:
  429. regexp = t.pattern.to_regexp()
  430. try:
  431. self.re.compile(regexp, conf.g_regex_flags)
  432. except self.re.error:
  433. raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
  434. if t.pattern.min_width == 0:
  435. raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
  436. if t.pattern.type == "re":
  437. terminal_to_regexp[t] = regexp
  438. if not (set(conf.ignore) <= {t.name for t in terminals}):
  439. raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))
  440. if has_interegular:
  441. _check_regex_collisions(terminal_to_regexp, comparator, conf.strict)
  442. elif conf.strict:
  443. raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.")
  444. # Init
  445. self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
  446. self.ignore_types = frozenset(conf.ignore)
  447. terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
  448. self.terminals = terminals
  449. self.user_callbacks = conf.callbacks
  450. self.g_regex_flags = conf.g_regex_flags
  451. self.use_bytes = conf.use_bytes
  452. self.terminals_by_name = conf.terminals_by_name
  453. self._scanner: Optional[Scanner] = None
  454. def _build_scanner(self) -> Scanner:
  455. terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
  456. assert all(self.callback.values())
  457. for type_, f in self.user_callbacks.items():
  458. if type_ in self.callback:
  459. # Already a callback there, probably UnlessCallback
  460. self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
  461. else:
  462. self.callback[type_] = f
  463. return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
  464. @property
  465. def scanner(self) -> Scanner:
  466. if self._scanner is None:
  467. self._scanner = self._build_scanner()
  468. return self._scanner
  469. def match(self, text, pos):
  470. return self.scanner.match(text, pos)
  471. def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
  472. line_ctr = lex_state.line_ctr
  473. while line_ctr.char_pos < lex_state.text.end:
  474. res = self.match(lex_state.text, line_ctr.char_pos)
  475. if not res:
  476. allowed = self.scanner.allowed_types - self.ignore_types
  477. if not allowed:
  478. allowed = {"<END-OF-FILE>"}
  479. raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
  480. allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
  481. state=parser_state, terminals_by_name=self.terminals_by_name)
  482. value, type_ = res
  483. ignored = type_ in self.ignore_types
  484. t = None
  485. if not ignored or type_ in self.callback:
  486. t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
  487. line_ctr.feed(value, type_ in self.newline_types)
  488. if t is not None:
  489. t.end_line = line_ctr.line
  490. t.end_column = line_ctr.column
  491. t.end_pos = line_ctr.char_pos
  492. if t.type in self.callback:
  493. t = self.callback[t.type](t)
  494. if not ignored:
  495. if not isinstance(t, Token):
  496. raise LexError("Callbacks must return a token (returned %r)" % t)
  497. lex_state.last_token = t
  498. return t
  499. # EOF
  500. raise EOFError(self)
  501. class ContextualLexer(Lexer):
  502. lexers: Dict[int, AbstractBasicLexer]
  503. root_lexer: AbstractBasicLexer
  504. BasicLexer: Type[AbstractBasicLexer] = BasicLexer
  505. def __init__(self, conf: 'LexerConf', states: Dict[int, Collection[str]], always_accept: Collection[str]=()) -> None:
  506. terminals = list(conf.terminals)
  507. terminals_by_name = conf.terminals_by_name
  508. trad_conf = copy(conf)
  509. trad_conf.terminals = terminals
  510. if has_interegular and not conf.skip_validation:
  511. comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})
  512. else:
  513. comparator = None
  514. lexer_by_tokens: Dict[FrozenSet[str], AbstractBasicLexer] = {}
  515. self.lexers = {}
  516. for state, accepts in states.items():
  517. key = frozenset(accepts)
  518. try:
  519. lexer = lexer_by_tokens[key]
  520. except KeyError:
  521. accepts = set(accepts) | set(conf.ignore) | set(always_accept)
  522. lexer_conf = copy(trad_conf)
  523. lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
  524. lexer = self.BasicLexer(lexer_conf, comparator)
  525. lexer_by_tokens[key] = lexer
  526. self.lexers[state] = lexer
  527. assert trad_conf.terminals is terminals
  528. trad_conf.skip_validation = True # We don't need to verify all terminals again
  529. self.root_lexer = self.BasicLexer(trad_conf, comparator)
  530. def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[Token]:
  531. try:
  532. while True:
  533. lexer = self.lexers[parser_state.position]
  534. yield lexer.next_token(lexer_state, parser_state)
  535. except EOFError:
  536. pass
  537. except UnexpectedCharacters as e:
  538. # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
  539. # This tests the input against the global context, to provide a nicer error.
  540. try:
  541. last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token
  542. token = self.root_lexer.next_token(lexer_state, parser_state)
  543. raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name)
  544. except UnexpectedCharacters:
  545. raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.
  546. ###}