lalr_analysis.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. """This module builds a LALR(1) transition-table for lalr_parser.py
  2. For now, shift/reduce conflicts are automatically resolved as shifts.
  3. """
  4. # Author: Erez Shinan (2017)
  5. # Email : erezshin@gmail.com
  6. from typing import Dict, Set, Iterator, Tuple, List, TypeVar, Generic
  7. from collections import defaultdict
  8. from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
  9. from ..exceptions import GrammarError
  10. from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet, RulePtr, State
  11. from ..grammar import Rule, Symbol
  12. from ..common import ParserConf
  13. ###{standalone
  14. class Action:
  15. def __init__(self, name):
  16. self.name = name
  17. def __str__(self):
  18. return self.name
  19. def __repr__(self):
  20. return str(self)
  21. Shift = Action('Shift')
  22. Reduce = Action('Reduce')
  23. StateT = TypeVar("StateT")
  24. class ParseTableBase(Generic[StateT]):
  25. states: Dict[StateT, Dict[str, Tuple]]
  26. start_states: Dict[str, StateT]
  27. end_states: Dict[str, StateT]
  28. def __init__(self, states, start_states, end_states):
  29. self.states = states
  30. self.start_states = start_states
  31. self.end_states = end_states
  32. def serialize(self, memo):
  33. tokens = Enumerator()
  34. states = {
  35. state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
  36. for token, (action, arg) in actions.items()}
  37. for state, actions in self.states.items()
  38. }
  39. return {
  40. 'tokens': tokens.reversed(),
  41. 'states': states,
  42. 'start_states': self.start_states,
  43. 'end_states': self.end_states,
  44. }
  45. @classmethod
  46. def deserialize(cls, data, memo):
  47. tokens = data['tokens']
  48. states = {
  49. state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
  50. for token, (action, arg) in actions.items()}
  51. for state, actions in data['states'].items()
  52. }
  53. return cls(states, data['start_states'], data['end_states'])
  54. class ParseTable(ParseTableBase['State']):
  55. """Parse-table whose key is State, i.e. set[RulePtr]
  56. Slower than IntParseTable, but useful for debugging
  57. """
  58. pass
  59. class IntParseTable(ParseTableBase[int]):
  60. """Parse-table whose key is int. Best for performance."""
  61. @classmethod
  62. def from_ParseTable(cls, parse_table: ParseTable):
  63. enum = list(parse_table.states)
  64. state_to_idx: Dict['State', int] = {s:i for i,s in enumerate(enum)}
  65. int_states = {}
  66. for s, la in parse_table.states.items():
  67. la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
  68. for k,v in la.items()}
  69. int_states[ state_to_idx[s] ] = la
  70. start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
  71. end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
  72. return cls(int_states, start_states, end_states)
  73. ###}
  74. # digraph and traverse, see The Theory and Practice of Compiler Writing
  75. # computes F(x) = G(x) union (union { G(y) | x R y })
  76. # X: nodes
  77. # R: relation (function mapping node -> list of nodes that satisfy the relation)
  78. # G: set valued function
  79. def digraph(X, R, G):
  80. F = {}
  81. S = []
  82. N = dict.fromkeys(X, 0)
  83. for x in X:
  84. # this is always true for the first iteration, but N[x] may be updated in traverse below
  85. if N[x] == 0:
  86. traverse(x, S, N, X, R, G, F)
  87. return F
  88. # x: single node
  89. # S: stack
  90. # N: weights
  91. # X: nodes
  92. # R: relation (see above)
  93. # G: set valued function
  94. # F: set valued function we are computing (map of input -> output)
  95. def traverse(x, S, N, X, R, G, F):
  96. S.append(x)
  97. d = len(S)
  98. N[x] = d
  99. F[x] = G[x]
  100. for y in R[x]:
  101. if N[y] == 0:
  102. traverse(y, S, N, X, R, G, F)
  103. n_x = N[x]
  104. assert(n_x > 0)
  105. n_y = N[y]
  106. assert(n_y != 0)
  107. if (n_y > 0) and (n_y < n_x):
  108. N[x] = n_y
  109. F[x].update(F[y])
  110. if N[x] == d:
  111. f_x = F[x]
  112. while True:
  113. z = S.pop()
  114. N[z] = -1
  115. F[z] = f_x
  116. if z == x:
  117. break
  118. class LALR_Analyzer(GrammarAnalyzer):
  119. lr0_itemsets: Set[LR0ItemSet]
  120. nonterminal_transitions: List[Tuple[LR0ItemSet, Symbol]]
  121. lookback: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Rule]]]
  122. includes: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
  123. reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
  124. directly_reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Symbol]]
  125. def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
  126. GrammarAnalyzer.__init__(self, parser_conf, debug, strict)
  127. self.nonterminal_transitions = []
  128. self.directly_reads = defaultdict(set)
  129. self.reads = defaultdict(set)
  130. self.includes = defaultdict(set)
  131. self.lookback = defaultdict(set)
  132. def compute_lr0_states(self) -> None:
  133. self.lr0_itemsets = set()
  134. # map of kernels to LR0ItemSets
  135. cache: Dict['State', LR0ItemSet] = {}
  136. def step(state: LR0ItemSet) -> Iterator[LR0ItemSet]:
  137. _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)
  138. d = classify(unsat, lambda rp: rp.next)
  139. for sym, rps in d.items():
  140. kernel = fzset({rp.advance(sym) for rp in rps})
  141. new_state = cache.get(kernel, None)
  142. if new_state is None:
  143. closure = set(kernel)
  144. for rp in kernel:
  145. if not rp.is_satisfied and not rp.next.is_term:
  146. closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
  147. new_state = LR0ItemSet(kernel, closure)
  148. cache[kernel] = new_state
  149. state.transitions[sym] = new_state
  150. yield new_state
  151. self.lr0_itemsets.add(state)
  152. for _ in bfs(self.lr0_start_states.values(), step):
  153. pass
  154. def compute_reads_relations(self):
  155. # handle start state
  156. for root in self.lr0_start_states.values():
  157. assert(len(root.kernel) == 1)
  158. for rp in root.kernel:
  159. assert(rp.index == 0)
  160. self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
  161. for state in self.lr0_itemsets:
  162. seen = set()
  163. for rp in state.closure:
  164. if rp.is_satisfied:
  165. continue
  166. s = rp.next
  167. # if s is a not a nonterminal
  168. if s not in self.lr0_rules_by_origin:
  169. continue
  170. if s in seen:
  171. continue
  172. seen.add(s)
  173. nt = (state, s)
  174. self.nonterminal_transitions.append(nt)
  175. dr = self.directly_reads[nt]
  176. r = self.reads[nt]
  177. next_state = state.transitions[s]
  178. for rp2 in next_state.closure:
  179. if rp2.is_satisfied:
  180. continue
  181. s2 = rp2.next
  182. # if s2 is a terminal
  183. if s2 not in self.lr0_rules_by_origin:
  184. dr.add(s2)
  185. if s2 in self.NULLABLE:
  186. r.add((next_state, s2))
  187. def compute_includes_lookback(self):
  188. for nt in self.nonterminal_transitions:
  189. state, nonterminal = nt
  190. includes = []
  191. lookback = self.lookback[nt]
  192. for rp in state.closure:
  193. if rp.rule.origin != nonterminal:
  194. continue
  195. # traverse the states for rp(.rule)
  196. state2 = state
  197. for i in range(rp.index, len(rp.rule.expansion)):
  198. s = rp.rule.expansion[i]
  199. nt2 = (state2, s)
  200. state2 = state2.transitions[s]
  201. if nt2 not in self.reads:
  202. continue
  203. for j in range(i + 1, len(rp.rule.expansion)):
  204. if rp.rule.expansion[j] not in self.NULLABLE:
  205. break
  206. else:
  207. includes.append(nt2)
  208. # state2 is at the final state for rp.rule
  209. if rp.index == 0:
  210. for rp2 in state2.closure:
  211. if (rp2.rule == rp.rule) and rp2.is_satisfied:
  212. lookback.add((state2, rp2.rule))
  213. for nt2 in includes:
  214. self.includes[nt2].add(nt)
  215. def compute_lookaheads(self):
  216. read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads)
  217. follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets)
  218. for nt, lookbacks in self.lookback.items():
  219. for state, rule in lookbacks:
  220. for s in follow_sets[nt]:
  221. state.lookaheads[s].add(rule)
  222. def compute_lalr1_states(self) -> None:
  223. m: Dict[LR0ItemSet, Dict[str, Tuple]] = {}
  224. reduce_reduce = []
  225. for itemset in self.lr0_itemsets:
  226. actions: Dict[Symbol, Tuple] = {la: (Shift, next_state.closure)
  227. for la, next_state in itemset.transitions.items()}
  228. for la, rules in itemset.lookaheads.items():
  229. if len(rules) > 1:
  230. # Try to resolve conflict based on priority
  231. p = [(r.options.priority or 0, r) for r in rules]
  232. p.sort(key=lambda r: r[0], reverse=True)
  233. best, second_best = p[:2]
  234. if best[0] > second_best[0]:
  235. rules = {best[1]}
  236. else:
  237. reduce_reduce.append((itemset, la, rules))
  238. continue
  239. rule ,= rules
  240. if la in actions:
  241. if self.strict:
  242. msg = f'Shift/Reduce conflict for terminal {la.name}. [strict-mode]\n' \
  243. f' * {rule}\n'
  244. raise GrammarError(msg)
  245. elif self.debug:
  246. logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
  247. logger.warning(' * %s', rule)
  248. else:
  249. logger.debug('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
  250. logger.debug(' * %s', rule)
  251. else:
  252. actions[la] = (Reduce, rule)
  253. m[itemset] = { k.name: v for k, v in actions.items() }
  254. if reduce_reduce:
  255. msgs = []
  256. for itemset, la, rules in reduce_reduce:
  257. msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ]))
  258. if self.debug:
  259. msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in itemset.closure])
  260. msgs.append(msg)
  261. raise GrammarError('\n\n'.join(msgs))
  262. states = { k.closure: v for k, v in m.items() }
  263. # compute end states
  264. end_states: Dict[str, 'State'] = {}
  265. for state in states:
  266. for rp in state:
  267. for start in self.lr0_start_states:
  268. if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
  269. assert start not in end_states
  270. end_states[start] = state
  271. start_states = { start: state.closure for start, state in self.lr0_start_states.items() }
  272. _parse_table = ParseTable(states, start_states, end_states)
  273. if self.debug:
  274. self.parse_table = _parse_table
  275. else:
  276. self.parse_table = IntParseTable.from_ParseTable(_parse_table)
  277. def compute_lalr(self):
  278. self.compute_lr0_states()
  279. self.compute_reads_relations()
  280. self.compute_includes_lookback()
  281. self.compute_lookaheads()
  282. self.compute_lalr1_states()