parser.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
  2. # Licensed to PSF under a Contributor Agreement.
  3. # Modifications:
  4. # Copyright David Halter and Contributors
  5. # Modifications are dual-licensed: MIT and PSF.
  6. # 99% of the code is different from pgen2, now.
  7. """
  8. The ``Parser`` tries to convert the available Python code in an easy to read
  9. format, something like an abstract syntax tree. The classes who represent this
  10. tree, are sitting in the :mod:`parso.tree` module.
  11. The Python module ``tokenize`` is a very important part in the ``Parser``,
  12. because it splits the code into different words (tokens). Sometimes it looks a
  13. bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
  14. module for this? Well, ``ast`` does a very good job understanding proper Python
  15. code, but fails to work as soon as there's a single line of broken code.
  16. There's one important optimization that needs to be known: Statements are not
  17. being parsed completely. ``Statement`` is just a representation of the tokens
  18. within the statement. This lowers memory usage and cpu time and reduces the
  19. complexity of the ``Parser`` (there's another parser sitting inside
  20. ``Statement``, which produces ``Array`` and ``Call``).
  21. """
  22. from typing import Dict, Type
  23. from parso import tree
  24. from parso.pgen2.generator import ReservedString
  25. class ParserSyntaxError(Exception):
  26. """
  27. Contains error information about the parser tree.
  28. May be raised as an exception.
  29. """
  30. def __init__(self, message, error_leaf):
  31. self.message = message
  32. self.error_leaf = error_leaf
  33. class InternalParseError(Exception):
  34. """
  35. Exception to signal the parser is stuck and error recovery didn't help.
  36. Basically this shouldn't happen. It's a sign that something is really
  37. wrong.
  38. """
  39. def __init__(self, msg, type_, value, start_pos):
  40. Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
  41. (msg, type_.name, value, start_pos))
  42. self.msg = msg
  43. self.type = type
  44. self.value = value
  45. self.start_pos = start_pos
  46. class Stack(list):
  47. def _allowed_transition_names_and_token_types(self):
  48. def iterate():
  49. # An API just for Jedi.
  50. for stack_node in reversed(self):
  51. for transition in stack_node.dfa.transitions:
  52. if isinstance(transition, ReservedString):
  53. yield transition.value
  54. else:
  55. yield transition # A token type
  56. if not stack_node.dfa.is_final:
  57. break
  58. return list(iterate())
  59. class StackNode:
  60. def __init__(self, dfa):
  61. self.dfa = dfa
  62. self.nodes = []
  63. @property
  64. def nonterminal(self):
  65. return self.dfa.from_rule
  66. def __repr__(self):
  67. return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
  68. def _token_to_transition(grammar, type_, value):
  69. # Map from token to label
  70. if type_.value.contains_syntax:
  71. # Check for reserved words (keywords)
  72. try:
  73. return grammar.reserved_syntax_strings[value]
  74. except KeyError:
  75. pass
  76. return type_
  77. class BaseParser:
  78. """Parser engine.
  79. A Parser instance contains state pertaining to the current token
  80. sequence, and should not be used concurrently by different threads
  81. to parse separate token sequences.
  82. See python/tokenize.py for how to get input tokens by a string.
  83. When a syntax error occurs, error_recovery() is called.
  84. """
  85. node_map: Dict[str, Type[tree.BaseNode]] = {}
  86. default_node = tree.Node
  87. leaf_map: Dict[str, Type[tree.Leaf]] = {}
  88. default_leaf = tree.Leaf
  89. def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
  90. self._pgen_grammar = pgen_grammar
  91. self._start_nonterminal = start_nonterminal
  92. self._error_recovery = error_recovery
  93. def parse(self, tokens):
  94. first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
  95. self.stack = Stack([StackNode(first_dfa)])
  96. for token in tokens:
  97. self._add_token(token)
  98. while True:
  99. tos = self.stack[-1]
  100. if not tos.dfa.is_final:
  101. # We never broke out -- EOF is too soon -- Unfinished statement.
  102. # However, the error recovery might have added the token again, if
  103. # the stack is empty, we're fine.
  104. raise InternalParseError(
  105. "incomplete input", token.type, token.string, token.start_pos
  106. )
  107. if len(self.stack) > 1:
  108. self._pop()
  109. else:
  110. return self.convert_node(tos.nonterminal, tos.nodes)
  111. def error_recovery(self, token):
  112. if self._error_recovery:
  113. raise NotImplementedError("Error Recovery is not implemented")
  114. else:
  115. type_, value, start_pos, prefix = token
  116. error_leaf = tree.ErrorLeaf(type_, value, start_pos, prefix)
  117. raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
  118. def convert_node(self, nonterminal, children):
  119. try:
  120. node = self.node_map[nonterminal](children)
  121. except KeyError:
  122. node = self.default_node(nonterminal, children)
  123. return node
  124. def convert_leaf(self, type_, value, prefix, start_pos):
  125. try:
  126. return self.leaf_map[type_](value, start_pos, prefix)
  127. except KeyError:
  128. return self.default_leaf(value, start_pos, prefix)
  129. def _add_token(self, token):
  130. """
  131. This is the only core function for parsing. Here happens basically
  132. everything. Everything is well prepared by the parser generator and we
  133. only apply the necessary steps here.
  134. """
  135. grammar = self._pgen_grammar
  136. stack = self.stack
  137. type_, value, start_pos, prefix = token
  138. transition = _token_to_transition(grammar, type_, value)
  139. while True:
  140. try:
  141. plan = stack[-1].dfa.transitions[transition]
  142. break
  143. except KeyError:
  144. if stack[-1].dfa.is_final:
  145. self._pop()
  146. else:
  147. self.error_recovery(token)
  148. return
  149. except IndexError:
  150. raise InternalParseError("too much input", type_, value, start_pos)
  151. stack[-1].dfa = plan.next_dfa
  152. for push in plan.dfa_pushes:
  153. stack.append(StackNode(push))
  154. leaf = self.convert_leaf(type_, value, prefix, start_pos)
  155. stack[-1].nodes.append(leaf)
  156. def _pop(self):
  157. tos = self.stack.pop()
  158. # If there's exactly one child, return that child instead of
  159. # creating a new node. We still create expr_stmt and
  160. # file_input though, because a lot of Jedi depends on its
  161. # logic.
  162. if len(tos.nodes) == 1:
  163. new_node = tos.nodes[0]
  164. else:
  165. new_node = self.convert_node(tos.dfa.from_rule, tos.nodes)
  166. self.stack[-1].nodes.append(new_node)