earley.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. """This module implements an Earley parser.
  2. The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
  3. https://www.sciencedirect.com/science/article/pii/S1571066108001497
  4. That is probably the best reference for understanding the algorithm here.
  5. The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
  6. is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
  7. """
  8. from typing import TYPE_CHECKING, Callable, Optional, List, Any
  9. from collections import deque
  10. from ..lexer import Token
  11. from ..tree import Tree
  12. from ..exceptions import UnexpectedEOF, UnexpectedToken
  13. from ..utils import logger, OrderedSet, dedup_list
  14. from .grammar_analysis import GrammarAnalyzer
  15. from ..grammar import NonTerminal
  16. from .earley_common import Item
  17. from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree
  18. if TYPE_CHECKING:
  19. from ..common import LexerConf, ParserConf
  20. class Parser:
  21. lexer_conf: 'LexerConf'
  22. parser_conf: 'ParserConf'
  23. debug: bool
  24. def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
  25. resolve_ambiguity: bool=True, debug: bool=False,
  26. tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
  27. analysis = GrammarAnalyzer(parser_conf)
  28. self.lexer_conf = lexer_conf
  29. self.parser_conf = parser_conf
  30. self.resolve_ambiguity = resolve_ambiguity
  31. self.debug = debug
  32. self.Tree = tree_class
  33. self.Set = OrderedSet if ordered_sets else set
  34. self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode
  35. self.FIRST = analysis.FIRST
  36. self.NULLABLE = analysis.NULLABLE
  37. self.callbacks = parser_conf.callbacks
  38. # TODO add typing info
  39. self.predictions = {} # type: ignore[var-annotated]
  40. ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
  41. # the slow 'isupper' in is_terminal.
  42. self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
  43. self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
  44. self.forest_sum_visitor = None
  45. for rule in parser_conf.rules:
  46. if rule.origin not in self.predictions:
  47. self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
  48. ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
  49. # the priorities will be stripped from all rules/terminals before they reach us, allowing us to
  50. # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
  51. # on any rules/terminals.
  52. if self.forest_sum_visitor is None and rule.options.priority is not None:
  53. self.forest_sum_visitor = ForestSumVisitor
  54. # Check terminals for priorities
  55. # Ignore terminal priorities if the basic lexer is used
  56. if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
  57. for term in self.lexer_conf.terminals:
  58. if term.priority:
  59. self.forest_sum_visitor = ForestSumVisitor
  60. break
  61. self.term_matcher = term_matcher
  62. def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
  63. """The core Earley Predictor and Completer.
  64. At each stage of the input, we handling any completed items (things
  65. that matched on the last cycle) and use those to predict what should
  66. come next in the input stream. The completions and any predicted
  67. non-terminals are recursively processed until we reach a set of,
  68. which can be added to the scan list for the next scanner cycle."""
  69. # Held Completions (H in E.Scotts paper).
  70. held_completions = {}
  71. column = columns[i]
  72. # R (items) = Ei (column.items)
  73. items = deque(column)
  74. while items:
  75. item = items.pop() # remove an element, A say, from R
  76. ### The Earley completer
  77. if item.is_complete: ### (item.s == string)
  78. if item.node is None:
  79. label = (item.s, item.start, i)
  80. item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
  81. item.node.add_family(item.s, item.rule, item.start, None, None)
  82. # create_leo_transitives(item.rule.origin, item.start)
  83. ###R Joop Leo right recursion Completer
  84. if item.rule.origin in transitives[item.start]:
  85. transitive = transitives[item.start][item.s]
  86. if transitive.previous in transitives[transitive.column]:
  87. root_transitive = transitives[transitive.column][transitive.previous]
  88. else:
  89. root_transitive = transitive
  90. new_item = Item(transitive.rule, transitive.ptr, transitive.start)
  91. label = (root_transitive.s, root_transitive.start, i)
  92. new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
  93. new_item.node.add_path(root_transitive, item.node)
  94. if new_item.expect in self.TERMINALS:
  95. # Add (B :: aC.B, h, y) to Q
  96. to_scan.add(new_item)
  97. elif new_item not in column:
  98. # Add (B :: aC.B, h, y) to Ei and R
  99. column.add(new_item)
  100. items.append(new_item)
  101. ###R Regular Earley completer
  102. else:
  103. # Empty has 0 length. If we complete an empty symbol in a particular
  104. # parse step, we need to be able to use that same empty symbol to complete
  105. # any predictions that result, that themselves require empty. Avoids
  106. # infinite recursion on empty symbols.
  107. # held_completions is 'H' in E.Scott's paper.
  108. is_empty_item = item.start == i
  109. if is_empty_item:
  110. held_completions[item.rule.origin] = item.node
  111. originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
  112. for originator in originators:
  113. new_item = originator.advance()
  114. label = (new_item.s, originator.start, i)
  115. new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
  116. new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
  117. if new_item.expect in self.TERMINALS:
  118. # Add (B :: aC.B, h, y) to Q
  119. to_scan.add(new_item)
  120. elif new_item not in column:
  121. # Add (B :: aC.B, h, y) to Ei and R
  122. column.add(new_item)
  123. items.append(new_item)
  124. ### The Earley predictor
  125. elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
  126. new_items = []
  127. for rule in self.predictions[item.expect]:
  128. new_item = Item(rule, 0, i)
  129. new_items.append(new_item)
  130. # Process any held completions (H).
  131. if item.expect in held_completions:
  132. new_item = item.advance()
  133. label = (new_item.s, item.start, i)
  134. new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
  135. new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
  136. new_items.append(new_item)
  137. for new_item in new_items:
  138. if new_item.expect in self.TERMINALS:
  139. to_scan.add(new_item)
  140. elif new_item not in column:
  141. column.add(new_item)
  142. items.append(new_item)
  143. def _parse(self, lexer, columns, to_scan, start_symbol=None):
  144. def is_quasi_complete(item):
  145. if item.is_complete:
  146. return True
  147. quasi = item.advance()
  148. while not quasi.is_complete:
  149. if quasi.expect not in self.NULLABLE:
  150. return False
  151. if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
  152. return False
  153. quasi = quasi.advance()
  154. return True
  155. # def create_leo_transitives(origin, start):
  156. # ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
  157. def scan(i, token, to_scan):
  158. """The core Earley Scanner.
  159. This is a custom implementation of the scanner that uses the
  160. Lark lexer to match tokens. The scan list is built by the
  161. Earley predictor, based on the previously completed tokens.
  162. This ensures that at each phase of the parse we have a custom
  163. lexer context, allowing for more complex ambiguities."""
  164. next_to_scan = self.Set()
  165. next_set = self.Set()
  166. columns.append(next_set)
  167. transitives.append({})
  168. node_cache = {}
  169. for item in self.Set(to_scan):
  170. if match(item.expect, token):
  171. new_item = item.advance()
  172. label = (new_item.s, new_item.start, i + 1)
  173. # 'terminals' may not contain token.type when using %declare
  174. # Additionally, token is not always a Token
  175. # For example, it can be a Tree when using TreeMatcher
  176. term = terminals.get(token.type) if isinstance(token, Token) else None
  177. # Set the priority of the token node to 0 so that the
  178. # terminal priorities do not affect the Tree chosen by
  179. # ForestSumVisitor after the basic lexer has already
  180. # "used up" the terminal priorities
  181. token_node = TokenNode(token, term, priority=0)
  182. new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
  183. new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
  184. if new_item.expect in self.TERMINALS:
  185. # add (B ::= Aai+1.B, h, y) to Q'
  186. next_to_scan.add(new_item)
  187. else:
  188. # add (B ::= Aa+1.B, h, y) to Ei+1
  189. next_set.add(new_item)
  190. if not next_set and not next_to_scan:
  191. expect = {i.expect.name for i in to_scan}
  192. raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))
  193. return next_to_scan, node_cache
  194. # Define parser functions
  195. match = self.term_matcher
  196. terminals = self.lexer_conf.terminals_by_name
  197. # Cache for nodes & tokens created in a particular parse step.
  198. transitives = [{}]
  199. ## The main Earley loop.
  200. # Run the Prediction/Completion cycle for any Items in the current Earley set.
  201. # Completions will be added to the SPPF tree, and predictions will be recursively
  202. # processed down to terminals/empty nodes to be added to the scanner for the next
  203. # step.
  204. expects = {i.expect for i in to_scan}
  205. i = 0
  206. node_cache = {}
  207. for token in lexer.lex(expects):
  208. self.predict_and_complete(i, to_scan, columns, transitives, node_cache)
  209. to_scan, node_cache = scan(i, token, to_scan)
  210. i += 1
  211. expects.clear()
  212. expects |= {i.expect for i in to_scan}
  213. self.predict_and_complete(i, to_scan, columns, transitives, node_cache)
  214. ## Column is now the final column in the parse.
  215. assert i == len(columns)-1
  216. return to_scan
  217. def parse(self, lexer, start):
  218. assert start, start
  219. start_symbol = NonTerminal(start)
  220. columns = [self.Set()]
  221. to_scan = self.Set() # The scan buffer. 'Q' in E.Scott's paper.
  222. ## Predict for the start_symbol.
  223. # Add predicted items to the first Earley set (for the predictor) if they
  224. # result in a non-terminal, or the scanner if they result in a terminal.
  225. for rule in self.predictions[start_symbol]:
  226. item = Item(rule, 0, 0)
  227. if item.expect in self.TERMINALS:
  228. to_scan.add(item)
  229. else:
  230. columns[0].add(item)
  231. to_scan = self._parse(lexer, columns, to_scan, start_symbol)
  232. # If the parse was successful, the start
  233. # symbol should have been completed in the last step of the Earley cycle, and will be in
  234. # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
  235. solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
  236. if not solutions:
  237. expected_terminals = [t.expect.name for t in to_scan]
  238. raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))
  239. if len(solutions) > 1:
  240. raise RuntimeError('Earley should not generate multiple start symbol items! Please report this bug.')
  241. solution ,= solutions
  242. if self.debug:
  243. from .earley_forest import ForestToPyDotVisitor
  244. try:
  245. debug_walker = ForestToPyDotVisitor()
  246. except ImportError:
  247. logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
  248. else:
  249. debug_walker.visit(solution, "sppf.png")
  250. if self.Tree is not None:
  251. # Perform our SPPF -> AST conversion
  252. # Disable the ForestToParseTree cache when ambiguity='resolve'
  253. # to prevent a tree construction bug. See issue #1283
  254. use_cache = not self.resolve_ambiguity
  255. transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
  256. return transformer.transform(solution)
  257. # return the root of the SPPF
  258. return solution