grammar_analysis.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. "Provides for superficial grammar analysis."
  2. from collections import Counter, defaultdict
  3. from typing import List, Dict, Iterator, FrozenSet, Set
  4. from ..utils import bfs, fzset, classify, OrderedSet
  5. from ..exceptions import GrammarError
  6. from ..grammar import Rule, Terminal, NonTerminal, Symbol
  7. from ..common import ParserConf
  8. class RulePtr:
  9. __slots__ = ('rule', 'index')
  10. rule: Rule
  11. index: int
  12. def __init__(self, rule: Rule, index: int):
  13. assert isinstance(rule, Rule)
  14. assert index <= len(rule.expansion)
  15. self.rule = rule
  16. self.index = index
  17. def __repr__(self):
  18. before = [x.name for x in self.rule.expansion[:self.index]]
  19. after = [x.name for x in self.rule.expansion[self.index:]]
  20. return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after))
  21. @property
  22. def next(self) -> Symbol:
  23. return self.rule.expansion[self.index]
  24. def advance(self, sym: Symbol) -> 'RulePtr':
  25. assert self.next == sym
  26. return RulePtr(self.rule, self.index+1)
  27. @property
  28. def is_satisfied(self) -> bool:
  29. return self.index == len(self.rule.expansion)
  30. def __eq__(self, other) -> bool:
  31. if not isinstance(other, RulePtr):
  32. return NotImplemented
  33. return self.rule == other.rule and self.index == other.index
  34. def __hash__(self) -> int:
  35. return hash((self.rule, self.index))
  36. State = FrozenSet[RulePtr]
  37. # state generation ensures no duplicate LR0ItemSets
  38. class LR0ItemSet:
  39. __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')
  40. kernel: State
  41. closure: State
  42. transitions: Dict[Symbol, 'LR0ItemSet']
  43. lookaheads: Dict[Symbol, Set[Rule]]
  44. def __init__(self, kernel, closure):
  45. self.kernel = fzset(kernel)
  46. self.closure = fzset(closure)
  47. self.transitions = {}
  48. self.lookaheads = defaultdict(set)
  49. def __repr__(self):
  50. return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
  51. def update_set(set1, set2):
  52. if not set2 or set1 > set2:
  53. return False
  54. copy = set(set1)
  55. set1 |= set2
  56. return set1 != copy
  57. def calculate_sets(rules):
  58. """Calculate FOLLOW sets.
  59. Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
  60. symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
  61. # foreach grammar rule X ::= Y(1) ... Y(k)
  62. # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
  63. # NULLABLE = NULLABLE union {X}
  64. # for i = 1 to k
  65. # if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
  66. # FIRST(X) = FIRST(X) union FIRST(Y(i))
  67. # for j = i+1 to k
  68. # if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
  69. # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
  70. # if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
  71. # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
  72. # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
  73. NULLABLE = set()
  74. FIRST = {}
  75. FOLLOW = {}
  76. for sym in symbols:
  77. FIRST[sym]={sym} if sym.is_term else set()
  78. FOLLOW[sym]=set()
  79. # Calculate NULLABLE and FIRST
  80. changed = True
  81. while changed:
  82. changed = False
  83. for rule in rules:
  84. if set(rule.expansion) <= NULLABLE:
  85. if update_set(NULLABLE, {rule.origin}):
  86. changed = True
  87. for i, sym in enumerate(rule.expansion):
  88. if set(rule.expansion[:i]) <= NULLABLE:
  89. if update_set(FIRST[rule.origin], FIRST[sym]):
  90. changed = True
  91. else:
  92. break
  93. # Calculate FOLLOW
  94. changed = True
  95. while changed:
  96. changed = False
  97. for rule in rules:
  98. for i, sym in enumerate(rule.expansion):
  99. if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
  100. if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
  101. changed = True
  102. for j in range(i+1, len(rule.expansion)):
  103. if set(rule.expansion[i+1:j]) <= NULLABLE:
  104. if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
  105. changed = True
  106. return FIRST, FOLLOW, NULLABLE
  107. class GrammarAnalyzer:
  108. def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
  109. self.debug = debug
  110. self.strict = strict
  111. root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
  112. for start in parser_conf.start}
  113. rules = parser_conf.rules + list(root_rules.values())
  114. self.rules_by_origin: Dict[NonTerminal, List[Rule]] = classify(rules, lambda r: r.origin)
  115. if len(rules) != len(set(rules)):
  116. duplicates = [item for item, count in Counter(rules).items() if count > 1]
  117. raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
  118. for r in rules:
  119. for sym in r.expansion:
  120. if not (sym.is_term or sym in self.rules_by_origin):
  121. raise GrammarError("Using an undefined rule: %s" % sym)
  122. self.start_states = {start: self.expand_rule(root_rule.origin)
  123. for start, root_rule in root_rules.items()}
  124. self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
  125. for start, root_rule in root_rules.items()}
  126. lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
  127. for start in parser_conf.start}
  128. lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
  129. assert(len(lr0_rules) == len(set(lr0_rules)))
  130. self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
  131. # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
  132. self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
  133. for start, root_rule in lr0_root_rules.items()}
  134. self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
  135. def expand_rule(self, source_rule: NonTerminal, rules_by_origin=None) -> OrderedSet[RulePtr]:
  136. "Returns all init_ptrs accessible by rule (recursive)"
  137. if rules_by_origin is None:
  138. rules_by_origin = self.rules_by_origin
  139. init_ptrs = OrderedSet[RulePtr]()
  140. def _expand_rule(rule: NonTerminal) -> Iterator[NonTerminal]:
  141. assert not rule.is_term, rule
  142. for r in rules_by_origin[rule]:
  143. init_ptr = RulePtr(r, 0)
  144. init_ptrs.add(init_ptr)
  145. if r.expansion: # if not empty rule
  146. new_r = init_ptr.next
  147. if not new_r.is_term:
  148. assert isinstance(new_r, NonTerminal)
  149. yield new_r
  150. for _ in bfs([source_rule], _expand_rule):
  151. pass
  152. return init_ptrs