Lexer.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. # Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
  2. # Use of this file is governed by the BSD 3-clause license that
  3. # can be found in the LICENSE.txt file in the project root.
  4. #/
  5. # A lexer is a recognizer that draws input symbols from a character stream.
  6. # Lexer grammars result in a subclass of this class. A Lexer object
  7. # uses simplified match() and error recovery mechanisms in the interest
  8. # of speed.
  9. #/
  10. from io import StringIO
  11. import sys
  12. if sys.version_info[1] > 5:
  13. from typing import TextIO
  14. else:
  15. from typing.io import TextIO
  16. from antlr4.CommonTokenFactory import CommonTokenFactory
  17. from antlr4.atn.LexerATNSimulator import LexerATNSimulator
  18. from antlr4.InputStream import InputStream
  19. from antlr4.Recognizer import Recognizer
  20. from antlr4.Token import Token
  21. from antlr4.error.Errors import IllegalStateException, LexerNoViableAltException, RecognitionException
  22. class TokenSource(object):
  23. pass
  24. class Lexer(Recognizer, TokenSource):
  25. __slots__ = (
  26. '_input', '_output', '_factory', '_tokenFactorySourcePair', '_token',
  27. '_tokenStartCharIndex', '_tokenStartLine', '_tokenStartColumn',
  28. '_hitEOF', '_channel', '_type', '_modeStack', '_mode', '_text'
  29. )
  30. DEFAULT_MODE = 0
  31. MORE = -2
  32. SKIP = -3
  33. DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL
  34. HIDDEN = Token.HIDDEN_CHANNEL
  35. MIN_CHAR_VALUE = 0x0000
  36. MAX_CHAR_VALUE = 0x10FFFF
  37. def __init__(self, input:InputStream, output:TextIO = sys.stdout):
  38. super().__init__()
  39. self._input = input
  40. self._output = output
  41. self._factory = CommonTokenFactory.DEFAULT
  42. self._tokenFactorySourcePair = (self, input)
  43. self._interp = None # child classes must populate this
  44. # The goal of all lexer rules/methods is to create a token object.
  45. # self is an instance variable as multiple rules may collaborate to
  46. # create a single token. nextToken will return self object after
  47. # matching lexer rule(s). If you subclass to allow multiple token
  48. # emissions, then set self to the last token to be matched or
  49. # something nonnull so that the auto token emit mechanism will not
  50. # emit another token.
  51. self._token = None
  52. # What character index in the stream did the current token start at?
  53. # Needed, for example, to get the text for current token. Set at
  54. # the start of nextToken.
  55. self._tokenStartCharIndex = -1
  56. # The line on which the first character of the token resides#/
  57. self._tokenStartLine = -1
  58. # The character position of first character within the line#/
  59. self._tokenStartColumn = -1
  60. # Once we see EOF on char stream, next token will be EOF.
  61. # If you have DONE : EOF ; then you see DONE EOF.
  62. self._hitEOF = False
  63. # The channel number for the current token#/
  64. self._channel = Token.DEFAULT_CHANNEL
  65. # The token type for the current token#/
  66. self._type = Token.INVALID_TYPE
  67. self._modeStack = []
  68. self._mode = self.DEFAULT_MODE
  69. # You can set the text for the current token to override what is in
  70. # the input char buffer. Use setText() or can set self instance var.
  71. #/
  72. self._text = None
  73. def reset(self):
  74. # wack Lexer state variables
  75. if self._input is not None:
  76. self._input.seek(0) # rewind the input
  77. self._token = None
  78. self._type = Token.INVALID_TYPE
  79. self._channel = Token.DEFAULT_CHANNEL
  80. self._tokenStartCharIndex = -1
  81. self._tokenStartColumn = -1
  82. self._tokenStartLine = -1
  83. self._text = None
  84. self._hitEOF = False
  85. self._mode = Lexer.DEFAULT_MODE
  86. self._modeStack = []
  87. self._interp.reset()
  88. # Return a token from self source; i.e., match a token on the char
  89. # stream.
  90. def nextToken(self):
  91. if self._input is None:
  92. raise IllegalStateException("nextToken requires a non-null input stream.")
  93. # Mark start location in char stream so unbuffered streams are
  94. # guaranteed at least have text of current token
  95. tokenStartMarker = self._input.mark()
  96. try:
  97. while True:
  98. if self._hitEOF:
  99. self.emitEOF()
  100. return self._token
  101. self._token = None
  102. self._channel = Token.DEFAULT_CHANNEL
  103. self._tokenStartCharIndex = self._input.index
  104. self._tokenStartColumn = self._interp.column
  105. self._tokenStartLine = self._interp.line
  106. self._text = None
  107. continueOuter = False
  108. while True:
  109. self._type = Token.INVALID_TYPE
  110. ttype = self.SKIP
  111. try:
  112. ttype = self._interp.match(self._input, self._mode)
  113. except LexerNoViableAltException as e:
  114. self.notifyListeners(e) # report error
  115. self.recover(e)
  116. if self._input.LA(1)==Token.EOF:
  117. self._hitEOF = True
  118. if self._type == Token.INVALID_TYPE:
  119. self._type = ttype
  120. if self._type == self.SKIP:
  121. continueOuter = True
  122. break
  123. if self._type!=self.MORE:
  124. break
  125. if continueOuter:
  126. continue
  127. if self._token is None:
  128. self.emit()
  129. return self._token
  130. finally:
  131. # make sure we release marker after match or
  132. # unbuffered char stream will keep buffering
  133. self._input.release(tokenStartMarker)
  134. # Instruct the lexer to skip creating a token for current lexer rule
  135. # and look for another token. nextToken() knows to keep looking when
  136. # a lexer rule finishes with token set to SKIP_TOKEN. Recall that
  137. # if token==null at end of any token rule, it creates one for you
  138. # and emits it.
  139. #/
  140. def skip(self):
  141. self._type = self.SKIP
  142. def more(self):
  143. self._type = self.MORE
  144. def mode(self, m:int):
  145. self._mode = m
  146. def pushMode(self, m:int):
  147. if self._interp.debug:
  148. print("pushMode " + str(m), file=self._output)
  149. self._modeStack.append(self._mode)
  150. self.mode(m)
  151. def popMode(self):
  152. if len(self._modeStack)==0:
  153. raise Exception("Empty Stack")
  154. if self._interp.debug:
  155. print("popMode back to "+ self._modeStack[:-1], file=self._output)
  156. self.mode( self._modeStack.pop() )
  157. return self._mode
  158. # Set the char stream and reset the lexer#/
  159. @property
  160. def inputStream(self):
  161. return self._input
  162. @inputStream.setter
  163. def inputStream(self, input:InputStream):
  164. self._input = None
  165. self._tokenFactorySourcePair = (self, self._input)
  166. self.reset()
  167. self._input = input
  168. self._tokenFactorySourcePair = (self, self._input)
  169. @property
  170. def sourceName(self):
  171. return self._input.sourceName
  172. # By default does not support multiple emits per nextToken invocation
  173. # for efficiency reasons. Subclass and override self method, nextToken,
  174. # and getToken (to push tokens into a list and pull from that list
  175. # rather than a single variable as self implementation does).
  176. #/
  177. def emitToken(self, token:Token):
  178. self._token = token
  179. # The standard method called to automatically emit a token at the
  180. # outermost lexical rule. The token object should point into the
  181. # char buffer start..stop. If there is a text override in 'text',
  182. # use that to set the token's text. Override self method to emit
  183. # custom Token objects or provide a new factory.
  184. #/
  185. def emit(self):
  186. t = self._factory.create(self._tokenFactorySourcePair, self._type, self._text, self._channel, self._tokenStartCharIndex,
  187. self.getCharIndex()-1, self._tokenStartLine, self._tokenStartColumn)
  188. self.emitToken(t)
  189. return t
  190. def emitEOF(self):
  191. cpos = self.column
  192. lpos = self.line
  193. eof = self._factory.create(self._tokenFactorySourcePair, Token.EOF, None, Token.DEFAULT_CHANNEL, self._input.index,
  194. self._input.index-1, lpos, cpos)
  195. self.emitToken(eof)
  196. return eof
  197. @property
  198. def type(self):
  199. return self._type
  200. @type.setter
  201. def type(self, type:int):
  202. self._type = type
  203. @property
  204. def line(self):
  205. return self._interp.line
  206. @line.setter
  207. def line(self, line:int):
  208. self._interp.line = line
  209. @property
  210. def column(self):
  211. return self._interp.column
  212. @column.setter
  213. def column(self, column:int):
  214. self._interp.column = column
  215. # What is the index of the current character of lookahead?#/
  216. def getCharIndex(self):
  217. return self._input.index
  218. # Return the text matched so far for the current token or any
  219. # text override.
  220. @property
  221. def text(self):
  222. if self._text is not None:
  223. return self._text
  224. else:
  225. return self._interp.getText(self._input)
  226. # Set the complete text of self token; it wipes any previous
  227. # changes to the text.
  228. @text.setter
  229. def text(self, txt:str):
  230. self._text = txt
  231. # Return a list of all Token objects in input char stream.
  232. # Forces load of all tokens. Does not include EOF token.
  233. #/
  234. def getAllTokens(self):
  235. tokens = []
  236. t = self.nextToken()
  237. while t.type!=Token.EOF:
  238. tokens.append(t)
  239. t = self.nextToken()
  240. return tokens
  241. def notifyListeners(self, e:LexerNoViableAltException):
  242. start = self._tokenStartCharIndex
  243. stop = self._input.index
  244. text = self._input.getText(start, stop)
  245. msg = "token recognition error at: '" + self.getErrorDisplay(text) + "'"
  246. listener = self.getErrorListenerDispatch()
  247. listener.syntaxError(self, None, self._tokenStartLine, self._tokenStartColumn, msg, e)
  248. def getErrorDisplay(self, s:str):
  249. with StringIO() as buf:
  250. for c in s:
  251. buf.write(self.getErrorDisplayForChar(c))
  252. return buf.getvalue()
  253. def getErrorDisplayForChar(self, c:str):
  254. if ord(c[0])==Token.EOF:
  255. return "<EOF>"
  256. elif c=='\n':
  257. return "\\n"
  258. elif c=='\t':
  259. return "\\t"
  260. elif c=='\r':
  261. return "\\r"
  262. else:
  263. return c
  264. def getCharErrorDisplay(self, c:str):
  265. return "'" + self.getErrorDisplayForChar(c) + "'"
  266. # Lexers can normally match any char in it's vocabulary after matching
  267. # a token, so do the easy thing and just kill a character and hope
  268. # it all works out. You can instead use the rule invocation stack
  269. # to do sophisticated error recovery if you are in a fragment rule.
  270. #/
  271. def recover(self, re:RecognitionException):
  272. if self._input.LA(1) != Token.EOF:
  273. if isinstance(re, LexerNoViableAltException):
  274. # skip a char and try again
  275. self._interp.consume(self._input)
  276. else:
  277. # TODO: Do we lose character or line position information?
  278. self._input.consume()