indenter.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. "Provides a post-lexer for implementing Python-style indentation."
  2. from abc import ABC, abstractmethod
  3. from typing import List, Iterator
  4. from .exceptions import LarkError
  5. from .lark import PostLex
  6. from .lexer import Token
  7. ###{standalone
  8. class DedentError(LarkError):
  9. pass
  10. class Indenter(PostLex, ABC):
  11. """This is a postlexer that "injects" indent/dedent tokens based on indentation.
  12. It keeps track of the current indentation, as well as the current level of parentheses.
  13. Inside parentheses, the indentation is ignored, and no indent/dedent tokens get generated.
  14. Note: This is an abstract class. To use it, inherit and implement all its abstract methods:
  15. - tab_len
  16. - NL_type
  17. - OPEN_PAREN_types, CLOSE_PAREN_types
  18. - INDENT_type, DEDENT_type
  19. See also: the ``postlex`` option in `Lark`.
  20. """
  21. paren_level: int
  22. indent_level: List[int]
  23. def __init__(self) -> None:
  24. self.paren_level = 0
  25. self.indent_level = [0]
  26. assert self.tab_len > 0
  27. def handle_NL(self, token: Token) -> Iterator[Token]:
  28. if self.paren_level > 0:
  29. return
  30. yield token
  31. indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
  32. indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
  33. if indent > self.indent_level[-1]:
  34. self.indent_level.append(indent)
  35. yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
  36. else:
  37. while indent < self.indent_level[-1]:
  38. self.indent_level.pop()
  39. yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
  40. if indent != self.indent_level[-1]:
  41. raise DedentError('Unexpected dedent to column %s. Expected dedent to %s' % (indent, self.indent_level[-1]))
  42. def _process(self, stream):
  43. token = None
  44. for token in stream:
  45. if token.type == self.NL_type:
  46. yield from self.handle_NL(token)
  47. else:
  48. yield token
  49. if token.type in self.OPEN_PAREN_types:
  50. self.paren_level += 1
  51. elif token.type in self.CLOSE_PAREN_types:
  52. self.paren_level -= 1
  53. assert self.paren_level >= 0
  54. while len(self.indent_level) > 1:
  55. self.indent_level.pop()
  56. yield Token.new_borrow_pos(self.DEDENT_type, '', token) if token else Token(self.DEDENT_type, '', 0, 0, 0, 0, 0, 0)
  57. assert self.indent_level == [0], self.indent_level
  58. def process(self, stream):
  59. self.paren_level = 0
  60. self.indent_level = [0]
  61. return self._process(stream)
  62. # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
  63. @property
  64. def always_accept(self):
  65. return (self.NL_type,)
  66. @property
  67. @abstractmethod
  68. def NL_type(self) -> str:
  69. "The name of the newline token"
  70. raise NotImplementedError()
  71. @property
  72. @abstractmethod
  73. def OPEN_PAREN_types(self) -> List[str]:
  74. "The names of the tokens that open a parenthesis"
  75. raise NotImplementedError()
  76. @property
  77. @abstractmethod
  78. def CLOSE_PAREN_types(self) -> List[str]:
  79. """The names of the tokens that close a parenthesis
  80. """
  81. raise NotImplementedError()
  82. @property
  83. @abstractmethod
  84. def INDENT_type(self) -> str:
  85. """The name of the token that starts an indentation in the grammar.
  86. See also: %declare
  87. """
  88. raise NotImplementedError()
  89. @property
  90. @abstractmethod
  91. def DEDENT_type(self) -> str:
  92. """The name of the token that end an indentation in the grammar.
  93. See also: %declare
  94. """
  95. raise NotImplementedError()
  96. @property
  97. @abstractmethod
  98. def tab_len(self) -> int:
  99. """How many spaces does a tab equal"""
  100. raise NotImplementedError()
  101. class PythonIndenter(Indenter):
  102. """A postlexer that "injects" _INDENT/_DEDENT tokens based on indentation, according to the Python syntax.
  103. See also: the ``postlex`` option in `Lark`.
  104. """
  105. NL_type = '_NEWLINE'
  106. OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
  107. CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
  108. INDENT_type = '_INDENT'
  109. DEDENT_type = '_DEDENT'
  110. tab_len = 8
  111. ###}