py2_scan.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. """
  2. Simple SPARK-style scanner
  3. Copyright (c) 2016 Rocky Bernstein
  4. """
  5. import re
  6. from spark_parser.example.python2.py2_token import PythonToken
  7. # from __future__ import print_function
  8. from spark_parser.scanner import GenericScanner
  9. RESERVED_WORDS = re.split(
  10. r"\s+",
  11. """and as assert break class continue def del eval exec else elif for from global
  12. if in import lambda or pass print return while with yield None""",
  13. )
  14. BRACKET2NAME = {
  15. "(": "LPAREN",
  16. ")": "RPAREN",
  17. "{": "LBRACE",
  18. "}": "RBRACE",
  19. "[": "LBRACKET",
  20. "]": "RBRACKET",
  21. }
  22. SYMBOL2NAME = {
  23. "@": "AT",
  24. "`": "BACKTICK",
  25. ":": "COLON",
  26. ",": "COMMA",
  27. ".": "DOT",
  28. }
  29. ENDMARKER = r"" # ctrl-d
  30. class Python2Scanner(GenericScanner):
  31. def error(self, s, pos):
  32. """Show text and a carot under that. For example:
  33. x = 2y + z
  34. ^"""
  35. print("Lexical error:")
  36. print("%s" % s[: pos + 10]) # + 10 for trailing context
  37. print("%s^" % (" " * (pos - 1)))
  38. for t in self.rv:
  39. print(t)
  40. raise SystemExit
  41. def __init__(self):
  42. self.is_newline = True
  43. self.indents = [0]
  44. self.lineno = 1
  45. self.column = 0
  46. GenericScanner.__init__(self)
  47. def tokenize(self, string):
  48. self.rv = []
  49. GenericScanner.tokenize(self, string)
  50. return self.rv
  51. def add_token(self, name, s, is_newline=False):
  52. self.column += len(s)
  53. t = PythonToken(name, s, self.lineno, self.column)
  54. if is_newline:
  55. self.lineno += 1
  56. self.column = 0
  57. if self.is_newline and name not in ["DEDENT", "INDENT"]:
  58. while 0 < self.indents[-1]:
  59. self.indents = self.indents[0:-1]
  60. self.rv.append(PythonToken("DEDENT", "", self.lineno, self.column))
  61. pass
  62. self.is_newline = is_newline
  63. self.rv.append(t)
  64. # The function names below begin with 't_'.
  65. # This indicates to GenericScanner that these routines
  66. # form the tokens. GenericScanner introspects on the
  67. # method names of this class and the docstrings to come
  68. # up with both the names of the tokens and the regular expressions
  69. # that make up those tokens
  70. def t_paren(self, s):
  71. r"[(){}[\]]"
  72. self.add_token(BRACKET2NAME[s], s)
  73. def t_symbol(self, s):
  74. r"[@:,.`]"
  75. self.add_token(SYMBOL2NAME[s], s)
  76. def t_endmarker(self, s):
  77. """"""
  78. self.add_token("ENDMARKER", s)
  79. # These can a appear as unary operators. Some are also binary operators
  80. UNOP2NAME = {"+": "PLUS", "-": "MINUS", "~": "TILDE"}
  81. def t_op(self, s):
  82. r"\+=|-=|\*=|/=|%=|&=|\|=|^=|<<=|>>=|\*\*=|//=|//|==|<=|>=|<<|>>|[<>%^&+/=~-]"
  83. # Operators need to be further classified since the grammar requires this
  84. if s in ("<", ">", "==", ">=", "<=", "<>", "!="):
  85. self.add_token("COMP_OP", s)
  86. elif s in (
  87. "+=",
  88. "-=",
  89. "*=",
  90. "/=",
  91. "%=",
  92. "&=",
  93. "|=",
  94. "^=",
  95. "<<=",
  96. ">>=",
  97. "**=",
  98. "//=",
  99. ):
  100. self.add_token("AUGASSIGN", s)
  101. elif s in self.UNOP2NAME.keys():
  102. self.add_token(self.UNOP2NAME[s], s)
  103. elif s in ("|", "^", "&", "<<", ">>", "**", "/", "%", "//"):
  104. # These are *ONLY* binary operators. Operators which are exclusively or
  105. # can be unary operators were handled previously
  106. self.add_token("BINOP", s)
  107. elif s == "=":
  108. self.add_token("EQUAL", s)
  109. else:
  110. print("Internal error: Unknown operator %s" % s)
  111. raise SystemExit
  112. def t_linesep(self, s):
  113. r";"
  114. self.add_token("SEMICOLON", s)
  115. def t_nl(self, s):
  116. r"\n"
  117. self.add_token("NEWLINE", s, is_newline=True)
  118. def t_name(self, s):
  119. r"[A-Za-z_][A-Za-z_0-9]*"
  120. if s in RESERVED_WORDS:
  121. self.add_token(s.upper(), s)
  122. else:
  123. self.add_token("NAME", s)
  124. # A way to handle the problem of having to match two different
  125. # tokens with a single regular expression.
  126. # We can't have two separate defs because then it would be indeterminate
  127. # whether we get two single stars or one double star.
  128. def t_star_star(self, s):
  129. r"\*\*?"
  130. token_name = "STARSTAR" if len(s) == 2 else "STAR"
  131. self.add_token(token_name, s)
  132. # CONSTANTS
  133. # ---------
  134. def t_string(self, s):
  135. r"([\"]{3}(.|[\n])*[\"]{3})|('{3}(.|[\n])*'{3})|('[^']*')|(\"[^\"]*\")"
  136. self.add_token("STRING", s)
  137. # numbers; int, float, and complex
  138. # Note we have to put longer matches earlier. Specifically radix notation and
  139. # fixed-point notation
  140. def t_number(self, s):
  141. r"(0x[0-9a-f]+|0b[01]+|0o[0-7]+|\d+\.\d|\d+)j?"
  142. self.add_token("NUMBER", s)
  143. # Ugh. Handle Python's indent/dedent mess.
  144. def handle_indent_dedent(self, s):
  145. indent = len(s)
  146. if indent > self.indents[-1]:
  147. self.add_token("INDENT", s)
  148. self.indents.append(indent)
  149. if indent == self.indents[-1]:
  150. self.is_newline = False
  151. pass
  152. else:
  153. # May need several levels of dedent
  154. while indent < self.indents[-1]:
  155. self.indents = self.indents[0:-1]
  156. self.add_token("DEDENT", s)
  157. pass
  158. pass
  159. return
  160. # Combine comment and whitespace because we want to
  161. # capture the space before a comment.
  162. def t_whitespace_or_comment(self, s):
  163. r"([ \t]*[#].*[^\x04][\n]?)|([ \t]+)"
  164. if "#" in s:
  165. # We have a comment
  166. matches = re.match(r"(\s+)(.*[\n]?)", s)
  167. if matches and self.is_newline:
  168. self.handle_indent_dedent(matches.group(1))
  169. s = matches.group(2)
  170. if s.endswith("\n"):
  171. self.add_token("COMMENT", s[:-1])
  172. self.add_token("NEWLINE", "\n")
  173. else:
  174. self.add_token("COMMENT", s)
  175. elif self.is_newline:
  176. self.handle_indent_dedent(s)
  177. pass
  178. return
  179. if __name__ == "__main__":
  180. scan = Python2Scanner()
  181. def showit(expr):
  182. print(expr)
  183. tokens = scan.tokenize(expr + ENDMARKER)
  184. for t in tokens:
  185. print(t)
  186. print("-" * 30)
  187. return
  188. # showit("1 # hi")
  189. showit(
  190. """def foo():
  191. # comment
  192. return
  193. """
  194. )
  195. # showit("(10.5 + 2 / 30) // 3 >> 1")
  196. # showit("1 + 2")
  197. # showit("""
  198. # () { } + - 'abc' \"abc\" 10 10j 0x10 # foo
  199. # # bar
  200. # """)
  201. # showit("""
  202. # for i in range(x):
  203. # if True:
  204. # pass
  205. # pass
  206. # pass""")
  207. # showit("""
  208. # for i in range(x):
  209. # while True:
  210. # break
  211. # """)