grammar_parser.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
  2. # Licensed to PSF under a Contributor Agreement.
  3. # Modifications:
  4. # Copyright David Halter and Contributors
  5. # Modifications are dual-licensed: MIT and PSF.
  6. from typing import Optional, Iterator, Tuple, List
  7. from parso.python.tokenize import tokenize
  8. from parso.utils import parse_version_string
  9. from parso.python.token import PythonTokenTypes
  10. class NFAArc:
  11. def __init__(self, next_: 'NFAState', nonterminal_or_string: Optional[str]):
  12. self.next: NFAState = next_
  13. self.nonterminal_or_string: Optional[str] = nonterminal_or_string
  14. def __repr__(self):
  15. return '<%s: %s>' % (self.__class__.__name__, self.nonterminal_or_string)
  16. class NFAState:
  17. def __init__(self, from_rule: str):
  18. self.from_rule: str = from_rule
  19. self.arcs: List[NFAArc] = []
  20. def add_arc(self, next_, nonterminal_or_string=None):
  21. assert nonterminal_or_string is None or isinstance(nonterminal_or_string, str)
  22. assert isinstance(next_, NFAState)
  23. self.arcs.append(NFAArc(next_, nonterminal_or_string))
  24. def __repr__(self):
  25. return '<%s: from %s>' % (self.__class__.__name__, self.from_rule)
  26. class GrammarParser:
  27. """
  28. The parser for Python grammar files.
  29. """
  30. def __init__(self, bnf_grammar: str):
  31. self._bnf_grammar = bnf_grammar
  32. self.generator = tokenize(
  33. bnf_grammar,
  34. version_info=parse_version_string('3.9')
  35. )
  36. self._gettoken() # Initialize lookahead
  37. def parse(self) -> Iterator[Tuple[NFAState, NFAState]]:
  38. # grammar: (NEWLINE | rule)* ENDMARKER
  39. while self.type != PythonTokenTypes.ENDMARKER:
  40. while self.type == PythonTokenTypes.NEWLINE:
  41. self._gettoken()
  42. # rule: NAME ':' rhs NEWLINE
  43. self._current_rule_name = self._expect(PythonTokenTypes.NAME)
  44. self._expect(PythonTokenTypes.OP, ':')
  45. a, z = self._parse_rhs()
  46. self._expect(PythonTokenTypes.NEWLINE)
  47. yield a, z
  48. def _parse_rhs(self):
  49. # rhs: items ('|' items)*
  50. a, z = self._parse_items()
  51. if self.value != "|":
  52. return a, z
  53. else:
  54. aa = NFAState(self._current_rule_name)
  55. zz = NFAState(self._current_rule_name)
  56. while True:
  57. # Add the possibility to go into the state of a and come back
  58. # to finish.
  59. aa.add_arc(a)
  60. z.add_arc(zz)
  61. if self.value != "|":
  62. break
  63. self._gettoken()
  64. a, z = self._parse_items()
  65. return aa, zz
  66. def _parse_items(self):
  67. # items: item+
  68. a, b = self._parse_item()
  69. while self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING) \
  70. or self.value in ('(', '['):
  71. c, d = self._parse_item()
  72. # Need to end on the next item.
  73. b.add_arc(c)
  74. b = d
  75. return a, b
  76. def _parse_item(self):
  77. # item: '[' rhs ']' | atom ['+' | '*']
  78. if self.value == "[":
  79. self._gettoken()
  80. a, z = self._parse_rhs()
  81. self._expect(PythonTokenTypes.OP, ']')
  82. # Make it also possible that there is no token and change the
  83. # state.
  84. a.add_arc(z)
  85. return a, z
  86. else:
  87. a, z = self._parse_atom()
  88. value = self.value
  89. if value not in ("+", "*"):
  90. return a, z
  91. self._gettoken()
  92. # Make it clear that we can go back to the old state and repeat.
  93. z.add_arc(a)
  94. if value == "+":
  95. return a, z
  96. else:
  97. # The end state is the same as the beginning, nothing must
  98. # change.
  99. return a, a
  100. def _parse_atom(self):
  101. # atom: '(' rhs ')' | NAME | STRING
  102. if self.value == "(":
  103. self._gettoken()
  104. a, z = self._parse_rhs()
  105. self._expect(PythonTokenTypes.OP, ')')
  106. return a, z
  107. elif self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING):
  108. a = NFAState(self._current_rule_name)
  109. z = NFAState(self._current_rule_name)
  110. # Make it clear that the state transition requires that value.
  111. a.add_arc(z, self.value)
  112. self._gettoken()
  113. return a, z
  114. else:
  115. self._raise_error("expected (...) or NAME or STRING, got %s/%s",
  116. self.type, self.value)
  117. def _expect(self, type_, value=None):
  118. if self.type != type_:
  119. self._raise_error("expected %s, got %s [%s]",
  120. type_, self.type, self.value)
  121. if value is not None and self.value != value:
  122. self._raise_error("expected %s, got %s", value, self.value)
  123. value = self.value
  124. self._gettoken()
  125. return value
  126. def _gettoken(self):
  127. tup = next(self.generator)
  128. self.type, self.value, self.begin, prefix = tup
  129. def _raise_error(self, msg, *args):
  130. if args:
  131. try:
  132. msg = msg % args
  133. except:
  134. msg = " ".join([msg] + list(map(str, args)))
  135. line = self._bnf_grammar.splitlines()[self.begin[0] - 1]
  136. raise SyntaxError(msg, ('<grammar>', self.begin[0],
  137. self.begin[1], line))