lark.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680
  1. from abc import ABC, abstractmethod
  2. import getpass
  3. import sys, os, pickle
  4. import tempfile
  5. import types
  6. import re
  7. from typing import (
  8. TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, Sequence,
  9. Tuple, Iterable, IO, Any, TYPE_CHECKING, Collection
  10. )
  11. if TYPE_CHECKING:
  12. from .parsers.lalr_interactive_parser import InteractiveParser
  13. from .tree import ParseTree
  14. from .visitors import Transformer
  15. from typing import Literal
  16. from .parser_frontends import ParsingFrontend
  17. from .exceptions import ConfigurationError, assert_config, UnexpectedInput
  18. from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice, LarkInput
  19. from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
  20. from .tree import Tree
  21. from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
  22. from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
  23. from .parse_tree_builder import ParseTreeBuilder
  24. from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
  25. from .grammar import Rule
  26. try:
  27. import regex
  28. _has_regex = True
  29. except ImportError:
  30. _has_regex = False
  31. ###{standalone
  32. class PostLex(ABC):
  33. @abstractmethod
  34. def process(self, stream: Iterator[Token]) -> Iterator[Token]:
  35. return stream
  36. always_accept: Iterable[str] = ()
class LarkOptions(Serialize):
    """Specifies the options for Lark

    """

    # The annotations below only declare the option names and their types.
    # The values themselves are stored in the ``options`` dict built by
    # ``__init__`` and are served as attributes through ``__getattr__``.
    start: List[str]
    debug: bool
    strict: bool
    transformer: 'Optional[Transformer]'
    propagate_positions: Union[bool, str]
    maybe_placeholders: bool
    cache: Union[bool, str]
    cache_grammar: bool
    regex: bool
    g_regex_flags: int
    keep_all_tokens: bool
    tree_class: Optional[Callable[[str, List], Any]]
    parser: _ParserArgType
    lexer: _LexerArgType
    ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
    postlex: Optional[PostLex]
    priority: 'Optional[Literal["auto", "normal", "invert"]]'
    lexer_callbacks: Dict[str, Callable[[Token], Token]]
    use_bytes: bool
    ordered_sets: bool
    edit_terminals: Optional[Callable[[TerminalDef], TerminalDef]]
    import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]'
    source_path: Optional[str]
  63. OPTIONS_DOC = r"""
  64. **=== General Options ===**
  65. start
  66. The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")
  67. debug
  68. Display debug information and extra warnings. Use only when debugging (Default: ``False``)
  69. When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed.
  70. strict
  71. Throw an exception on any potential ambiguity, including shift/reduce conflicts, and regex collisions.
  72. transformer
  73. Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
  74. propagate_positions
  75. Propagates positional attributes into the 'meta' attribute of all tree branches.
  76. Sets attributes: (line, column, end_line, end_column, start_pos, end_pos,
  77. container_line, container_column, container_end_line, container_end_column)
  78. Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
  79. maybe_placeholders
  80. When ``True``, the ``[]`` operator returns ``None`` when not matched.
  81. When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all.
  82. (default= ``True``)
  83. cache
  84. Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.
  85. - When ``False``, does nothing (default)
  86. - When ``True``, caches to a temporary file in the local directory
  87. - When given a string, caches to the path pointed by the string
  88. cache_grammar
  89. For use with ``cache`` option. When ``True``, the unanalyzed grammar is also included in the cache.
  90. Useful for classes that require the ``Lark.grammar`` to be present (e.g. Reconstructor).
  91. (default= ``False``)
  92. regex
  93. When True, uses the ``regex`` module instead of the stdlib ``re``.
  94. g_regex_flags
  95. Flags that are applied to all terminals (both regex and strings)
  96. keep_all_tokens
  97. Prevent the tree builder from automagically removing "punctuation" tokens (Default: ``False``)
  98. tree_class
  99. Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``.
  100. **=== Algorithm Options ===**
  101. parser
  102. Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley").
  103. (there is also a "cyk" option for legacy)
  104. lexer
  105. Decides whether or not to use a lexer stage
  106. - "auto" (default): Choose for me based on the parser
  107. - "basic": Use a basic lexer
  108. - "contextual": Stronger lexer (only works with parser="lalr")
  109. - "dynamic": Flexible and powerful (only with parser="earley")
  110. - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
  111. ambiguity
  112. Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
  113. - "resolve": The parser will automatically choose the simplest derivation
  114. (it chooses consistently: greedy for tokens, non-greedy for rules)
  115. - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
  116. - "forest": The parser will return the root of the shared packed parse forest.
  117. **=== Misc. / Domain Specific Options ===**
  118. postlex
  119. Lexer post-processing (Default: ``None``) Only works with the basic and contextual lexers.
  120. priority
  121. How priorities should be evaluated - "auto", ``None``, "normal", "invert" (Default: "auto")
  122. lexer_callbacks
  123. Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
  124. use_bytes
  125. Accept an input of type ``bytes`` instead of ``str``.
  126. ordered_sets
  127. Should Earley use ordered-sets to achieve stable output (~10% slower than regular sets. Default: True)
  128. edit_terminals
  129. A callback for editing the terminals before parse.
  130. import_paths
  131. A List of either paths or loader functions to specify from where grammars are imported
  132. source_path
  133. Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading
  134. **=== End of Options ===**
  135. """
  136. if __doc__:
  137. __doc__ += OPTIONS_DOC
    # Adding a new option needs to be done in multiple places:
    # - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts
    # - In the docstring above. It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs
    # - As an attribute of `LarkOptions` above
    # - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded
    # - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument
    #
    # Maps each accepted option name to the value used when the caller does not
    # supply it. `__init__` iterates this dict, so any key absent from it is
    # reported as an unknown option.
    _defaults: Dict[str, Any] = {
        'debug': False,
        'strict': False,
        'keep_all_tokens': False,
        'tree_class': None,
        'cache': False,
        'cache_grammar': False,
        'postlex': None,
        'parser': 'earley',
        'lexer': 'auto',
        'transformer': None,
        'start': 'start',
        'priority': 'auto',
        'ambiguity': 'auto',
        'regex': False,
        'propagate_positions': False,
        'lexer_callbacks': {},
        'maybe_placeholders': True,
        'edit_terminals': None,
        'g_regex_flags': 0,
        'use_bytes': False,
        'ordered_sets': True,
        'import_paths': [],
        'source_path': None,
        '_plugins': {},
    }
  170. def __init__(self, options_dict: Dict[str, Any]) -> None:
  171. o = dict(options_dict)
  172. options = {}
  173. for name, default in self._defaults.items():
  174. if name in o:
  175. value = o.pop(name)
  176. if isinstance(default, bool) and name not in ('cache', 'use_bytes', 'propagate_positions'):
  177. value = bool(value)
  178. else:
  179. value = default
  180. options[name] = value
  181. if isinstance(options['start'], str):
  182. options['start'] = [options['start']]
  183. self.__dict__['options'] = options
  184. assert_config(self.parser, ('earley', 'lalr', 'cyk', None))
  185. if self.parser == 'earley' and self.transformer:
  186. raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
  187. 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')
  188. if self.cache_grammar and not self.cache:
  189. raise ConfigurationError('cache_grammar cannot be set when cache is disabled')
  190. if o:
  191. raise ConfigurationError("Unknown options: %s" % o.keys())
  192. def __getattr__(self, name: str) -> Any:
  193. try:
  194. return self.__dict__['options'][name]
  195. except KeyError as e:
  196. raise AttributeError(e)
  197. def __setattr__(self, name: str, value: str) -> None:
  198. assert_config(name, self.options.keys(), "%r isn't a valid option. Expected one of: %s")
  199. self.options[name] = value
  200. def serialize(self, memo = None) -> Dict[str, Any]:
  201. return self.options
  202. @classmethod
  203. def deserialize(cls, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]]) -> "LarkOptions":
  204. return cls(data)
# Options that can be passed to the Lark parser, even when it was loaded from cache/standalone.
# These options are only used outside of `load_grammar`.
_LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'lexer_callbacks', 'use_bytes', 'debug', 'g_regex_flags', 'regex', 'propagate_positions', 'tree_class', '_plugins'}

# Accepted values for the `priority` and `ambiguity` options, checked in `Lark.__init__`.
_VALID_PRIORITY_OPTIONS = ('auto', 'normal', 'invert', None)
_VALID_AMBIGUITY_OPTIONS = ('auto', 'resolve', 'explicit', 'forest')

# Lets the `Lark` classmethod constructors return the subclass they were called on.
_T = TypeVar('_T', bound="Lark")
class Lark(Serialize):
    """Main interface for the library.

    It's mostly a thin wrapper for the many different parsers, and for the tree constructor.

    Parameters:
        grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax)
        options: a dictionary controlling various aspects of Lark.

    Example:
        >>> Lark(r'''start: "foo" ''')
        Lark(...)
    """

    # Instance attributes, declared here for type checkers; they are assigned
    # by ``__init__`` or ``_load``.
    source_path: str
    source_grammar: str
    grammar: 'Grammar'
    options: LarkOptions
    lexer: Lexer
    parser: 'ParsingFrontend'
    terminals: Collection[TerminalDef]

    # ``__init__`` extends this per instance with 'grammar' when the
    # ``cache_grammar`` option is set.
    __serialize_fields__ = ['parser', 'rules', 'options']
  229. def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
  230. self.options = LarkOptions(options)
  231. re_module: types.ModuleType
  232. # Update which fields are serialized
  233. if self.options.cache_grammar:
  234. self.__serialize_fields__ = self.__serialize_fields__ + ['grammar']
  235. # Set regex or re module
  236. use_regex = self.options.regex
  237. if use_regex:
  238. if _has_regex:
  239. re_module = regex
  240. else:
  241. raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
  242. else:
  243. re_module = re
  244. # Some, but not all file-like objects have a 'name' attribute
  245. if self.options.source_path is None:
  246. try:
  247. self.source_path = grammar.name # type: ignore[union-attr]
  248. except AttributeError:
  249. self.source_path = '<string>'
  250. else:
  251. self.source_path = self.options.source_path
  252. # Drain file-like objects to get their contents
  253. try:
  254. read = grammar.read # type: ignore[union-attr]
  255. except AttributeError:
  256. pass
  257. else:
  258. grammar = read()
  259. cache_fn = None
  260. cache_sha256 = None
  261. if isinstance(grammar, str):
  262. self.source_grammar = grammar
  263. if self.options.use_bytes:
  264. if not grammar.isascii():
  265. raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
  266. if self.options.cache:
  267. if self.options.parser != 'lalr':
  268. raise ConfigurationError("cache only works with parser='lalr' for now")
  269. unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals', '_plugins')
  270. options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
  271. from . import __version__
  272. s = grammar + options_str + __version__ + str(sys.version_info[:2])
  273. cache_sha256 = sha256_digest(s)
  274. if isinstance(self.options.cache, str):
  275. cache_fn = self.options.cache
  276. else:
  277. if self.options.cache is not True:
  278. raise ConfigurationError("cache argument must be bool or str")
  279. try:
  280. username = getpass.getuser()
  281. except Exception:
  282. # The exception raised may be ImportError or OSError in
  283. # the future. For the cache, we don't care about the
  284. # specific reason - we just want a username.
  285. username = "unknown"
  286. cache_fn = tempfile.gettempdir() + "/.lark_%s_%s_%s_%s_%s.tmp" % (
  287. "cache_grammar" if self.options.cache_grammar else "cache", username, cache_sha256, *sys.version_info[:2])
  288. old_options = self.options
  289. try:
  290. with FS.open(cache_fn, 'rb') as f:
  291. logger.debug('Loading grammar from cache: %s', cache_fn)
  292. # Remove options that aren't relevant for loading from cache
  293. for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
  294. del options[name]
  295. file_sha256 = f.readline().rstrip(b'\n')
  296. cached_used_files = pickle.load(f)
  297. if file_sha256 == cache_sha256.encode('utf8') and verify_used_files(cached_used_files):
  298. cached_parser_data = pickle.load(f)
  299. self._load(cached_parser_data, **options)
  300. return
  301. except FileNotFoundError:
  302. # The cache file doesn't exist; parse and compose the grammar as normal
  303. pass
  304. except Exception: # We should probably narrow done which errors we catch here.
  305. logger.exception("Failed to load Lark from cache: %r. We will try to carry on.", cache_fn)
  306. # In theory, the Lark instance might have been messed up by the call to `_load`.
  307. # In practice the only relevant thing that might have been overwritten should be `options`
  308. self.options = old_options
  309. # Parse the grammar file and compose the grammars
  310. self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
  311. else:
  312. assert isinstance(grammar, Grammar)
  313. self.grammar = grammar
  314. if self.options.lexer == 'auto':
  315. if self.options.parser == 'lalr':
  316. self.options.lexer = 'contextual'
  317. elif self.options.parser == 'earley':
  318. if self.options.postlex is not None:
  319. logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. "
  320. "Consider using lalr with contextual instead of earley")
  321. self.options.lexer = 'basic'
  322. else:
  323. self.options.lexer = 'dynamic'
  324. elif self.options.parser == 'cyk':
  325. self.options.lexer = 'basic'
  326. else:
  327. assert False, self.options.parser
  328. lexer = self.options.lexer
  329. if isinstance(lexer, type):
  330. assert issubclass(lexer, Lexer) # XXX Is this really important? Maybe just ensure interface compliance
  331. else:
  332. assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
  333. if self.options.postlex is not None and 'dynamic' in lexer:
  334. raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")
  335. if self.options.ambiguity == 'auto':
  336. if self.options.parser == 'earley':
  337. self.options.ambiguity = 'resolve'
  338. else:
  339. assert_config(self.options.parser, ('earley', 'cyk'), "%r doesn't support disambiguation. Use one of these parsers instead: %s")
  340. if self.options.priority == 'auto':
  341. self.options.priority = 'normal'
  342. if self.options.priority not in _VALID_PRIORITY_OPTIONS:
  343. raise ConfigurationError("invalid priority option: %r. Must be one of %r" % (self.options.priority, _VALID_PRIORITY_OPTIONS))
  344. if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
  345. raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))
  346. if self.options.parser is None:
  347. terminals_to_keep = '*' # For lexer-only mode, keep all terminals
  348. elif self.options.postlex is not None:
  349. terminals_to_keep = set(self.options.postlex.always_accept)
  350. else:
  351. terminals_to_keep = set()
  352. # Compile the EBNF grammar into BNF
  353. self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep)
  354. if self.options.edit_terminals:
  355. for t in self.terminals:
  356. self.options.edit_terminals(t)
  357. self._terminals_dict = {t.name: t for t in self.terminals}
  358. # If the user asked to invert the priorities, negate them all here.
  359. if self.options.priority == 'invert':
  360. for rule in self.rules:
  361. if rule.options.priority is not None:
  362. rule.options.priority = -rule.options.priority
  363. for term in self.terminals:
  364. term.priority = -term.priority
  365. # Else, if the user asked to disable priorities, strip them from the
  366. # rules and terminals. This allows the Earley parsers to skip an extra forest walk
  367. # for improved performance, if you don't need them (or didn't specify any).
  368. elif self.options.priority is None:
  369. for rule in self.rules:
  370. if rule.options.priority is not None:
  371. rule.options.priority = None
  372. for term in self.terminals:
  373. term.priority = 0
  374. # TODO Deprecate lexer_callbacks?
  375. self.lexer_conf = LexerConf(
  376. self.terminals, re_module, self.ignore_tokens, self.options.postlex,
  377. self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes, strict=self.options.strict
  378. )
  379. if self.options.parser:
  380. self.parser = self._build_parser()
  381. elif lexer:
  382. self.lexer = self._build_lexer()
  383. if cache_fn:
  384. logger.debug('Saving grammar to cache: %s', cache_fn)
  385. try:
  386. with FS.open(cache_fn, 'wb') as f:
  387. assert cache_sha256 is not None
  388. f.write(cache_sha256.encode('utf8') + b'\n')
  389. pickle.dump(used_files, f)
  390. self.save(f, _LOAD_ALLOWED_OPTIONS)
  391. except IOError as e:
  392. logger.exception("Failed to save Lark to cache: %r.", cache_fn, e)
    # Append the option reference from ``LarkOptions`` to this class's
    # docstring; skipped when there is no docstring to extend.
    if __doc__:
        __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
  395. def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
  396. lexer_conf = self.lexer_conf
  397. if dont_ignore:
  398. from copy import copy
  399. lexer_conf = copy(lexer_conf)
  400. lexer_conf.ignore = ()
  401. return BasicLexer(lexer_conf)
  402. def _prepare_callbacks(self) -> None:
  403. self._callbacks = {}
  404. # we don't need these callbacks if we aren't building a tree
  405. if self.options.ambiguity != 'forest':
  406. self._parse_tree_builder = ParseTreeBuilder(
  407. self.rules,
  408. self.options.tree_class or Tree,
  409. self.options.propagate_positions,
  410. self.options.parser != 'lalr' and self.options.ambiguity == 'explicit',
  411. self.options.maybe_placeholders
  412. )
  413. self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
  414. self._callbacks.update(_get_lexer_callbacks(self.options.transformer, self.terminals))
  415. def _build_parser(self) -> "ParsingFrontend":
  416. self._prepare_callbacks()
  417. _validate_frontend_args(self.options.parser, self.options.lexer)
  418. parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
  419. return _construct_parsing_frontend(
  420. self.options.parser,
  421. self.options.lexer,
  422. self.lexer_conf,
  423. parser_conf,
  424. options=self.options
  425. )
  426. def save(self, f, exclude_options: Collection[str] = ()) -> None:
  427. """Saves the instance into the given file object
  428. Useful for caching and multiprocessing.
  429. """
  430. if self.options.parser != 'lalr':
  431. raise NotImplementedError("Lark.save() is only implemented for the LALR(1) parser.")
  432. data, m = self.memo_serialize([TerminalDef, Rule])
  433. if exclude_options:
  434. data["options"] = {n: v for n, v in data["options"].items() if n not in exclude_options}
  435. pickle.dump({'data': data, 'memo': m}, f, protocol=pickle.HIGHEST_PROTOCOL)
  436. @classmethod
  437. def load(cls: Type[_T], f) -> _T:
  438. """Loads an instance from the given file object
  439. Useful for caching and multiprocessing.
  440. """
  441. inst = cls.__new__(cls)
  442. return inst._load(f)
  443. def _deserialize_lexer_conf(self, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]], options: LarkOptions) -> LexerConf:
  444. lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
  445. lexer_conf.callbacks = options.lexer_callbacks or {}
  446. lexer_conf.re_module = regex if options.regex else re
  447. lexer_conf.use_bytes = options.use_bytes
  448. lexer_conf.g_regex_flags = options.g_regex_flags
  449. lexer_conf.skip_validation = True
  450. lexer_conf.postlex = options.postlex
  451. return lexer_conf
  452. def _load(self: _T, f: Any, **kwargs) -> _T:
  453. if isinstance(f, dict):
  454. d = f
  455. else:
  456. d = pickle.load(f)
  457. memo_json = d['memo']
  458. data = d['data']
  459. assert memo_json
  460. memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
  461. if 'grammar' in data:
  462. self.grammar = Grammar.deserialize(data['grammar'], memo)
  463. options = dict(data['options'])
  464. if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
  465. raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
  466. .format(set(kwargs) - _LOAD_ALLOWED_OPTIONS))
  467. options.update(kwargs)
  468. self.options = LarkOptions.deserialize(options, memo)
  469. self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
  470. self.source_path = '<deserialized>'
  471. _validate_frontend_args(self.options.parser, self.options.lexer)
  472. self.lexer_conf = self._deserialize_lexer_conf(data['parser'], memo, self.options)
  473. self.terminals = self.lexer_conf.terminals
  474. self._prepare_callbacks()
  475. self._terminals_dict = {t.name: t for t in self.terminals}
  476. self.parser = _deserialize_parsing_frontend(
  477. data['parser'],
  478. memo,
  479. self.lexer_conf,
  480. self._callbacks,
  481. self.options, # Not all, but multiple attributes are used
  482. )
  483. return self
  484. @classmethod
  485. def _load_from_dict(cls, data, memo, **kwargs):
  486. inst = cls.__new__(cls)
  487. return inst._load({'data': data, 'memo': memo}, **kwargs)
  488. @classmethod
  489. def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str]=None, **options) -> _T:
  490. """Create an instance of Lark with the grammar given by its filename
  491. If ``rel_to`` is provided, the function will find the grammar filename in relation to it.
  492. Example:
  493. >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
  494. Lark(...)
  495. """
  496. if rel_to:
  497. basepath = os.path.dirname(rel_to)
  498. grammar_filename = os.path.join(basepath, grammar_filename)
  499. with open(grammar_filename, encoding='utf8') as f:
  500. return cls(f, **options)
  501. @classmethod
  502. def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: 'Sequence[str]'=[""], **options) -> _T:
  503. """Create an instance of Lark with the grammar loaded from within the package `package`.
  504. This allows grammar loading from zipapps.
  505. Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`
  506. Example:
  507. Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
  508. """
  509. package_loader = FromPackageLoader(package, search_paths)
  510. full_path, text = package_loader(None, grammar_path)
  511. options.setdefault('source_path', full_path)
  512. options.setdefault('import_paths', [])
  513. options['import_paths'].append(package_loader)
  514. return cls(text, **options)
  515. def __repr__(self):
  516. return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
  517. def lex(self, text: TextOrSlice, dont_ignore: bool=False) -> Iterator[Token]:
  518. """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
  519. When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
  520. :raises UnexpectedCharacters: In case the lexer cannot find a suitable match.
  521. """
  522. lexer: Lexer
  523. if not hasattr(self, 'lexer') or dont_ignore:
  524. lexer = self._build_lexer(dont_ignore)
  525. else:
  526. lexer = self.lexer
  527. lexer_thread = LexerThread.from_text(lexer, text)
  528. stream = lexer_thread.lex(None)
  529. if self.options.postlex:
  530. return self.options.postlex.process(stream)
  531. return stream
  532. def get_terminal(self, name: str) -> TerminalDef:
  533. """Get information about a terminal"""
  534. return self._terminals_dict[name]
  535. def parse_interactive(self, text: Optional[LarkInput]=None, start: Optional[str]=None) -> 'InteractiveParser':
  536. """Start an interactive parsing session. Only works when parser='lalr'.
  537. Parameters:
  538. text (LarkInput, optional): Text to be parsed. Required for ``resume_parse()``.
  539. start (str, optional): Start symbol
  540. Returns:
  541. A new InteractiveParser instance.
  542. See Also: ``Lark.parse()``
  543. """
  544. return self.parser.parse_interactive(text, start=start)
  545. def parse(self, text: LarkInput, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
  546. """Parse the given text, according to the options provided.
  547. Parameters:
  548. text (LarkInput): Text to be parsed, as `str` or `bytes`.
  549. TextSlice may also be used, but only when lexer='basic' or 'contextual'.
  550. If Lark was created with a custom lexer, this may be an object of any type.
  551. start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
  552. on_error (function, optional): if provided, will be called on UnexpectedInput error,
  553. with the exception as its argument. Return true to resume parsing, or false to raise the exception.
  554. LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
  555. Returns:
  556. If a transformer is supplied to ``__init__``, returns whatever is the
  557. result of the transformation. Otherwise, returns a Tree instance.
  558. :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise:
  559. ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``.
  560. For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``.
  561. """
  562. if on_error is not None and self.options.parser != 'lalr':
  563. raise NotImplementedError("The on_error option is only implemented for the LALR(1) parser.")
  564. return self.parser.parse(text, start=start, on_error=on_error)
  565. ###}