scanner.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. """
  2. Simple SPARK-style scanner
  3. Copyright (c) 2017 Rocky Bernstein
  4. """
  5. from __future__ import print_function
  6. import re
  7. from spark_parser.scanner import GenericScanner
  8. from gdbloc.tok import Token
  9. class ScannerError(Exception):
  10. def __init__(self, text, text_cursor):
  11. self.text = text
  12. self.text_cursor = text_cursor
  13. def __str__(self):
  14. return self.text + "\n" + self.text_cursor
  15. class LocationScanner(GenericScanner):
  16. def error(self, s):
  17. """Show text and a caret under that. For example:
  18. x = 2y + z
  19. ^
  20. """
  21. # print("Lexical error:")
  22. # print("%s" % s[:self.pos+10]) # + 10 for trailing context
  23. # print("%s^" % (" "*(self.pos-1)))
  24. # for t in self.rv: print(t)
  25. raise ScannerError( ("%s" % s),
  26. ("%s^" % (" "*(self.pos-1))) )
  27. def tokenize(self, input):
  28. self.rv = []
  29. GenericScanner.tokenize(self, input)
  30. return self.rv
  31. def add_token(self, name, v):
  32. t = Token(kind=name, value=v, offset=self.pos)
  33. self.pos += len(str(v))
  34. self.rv.append(t)
  35. # The function names below begin with 't_'.
  36. # This indicates to GenericScanner that these routines
  37. # form the tokens. GenericScanner introspects on the
  38. # method names of this class and the docstrings to come
  39. # up with both the names of the tokens and the regular expressions
  40. # that make up those tokens
  41. # Recognize white space, but we don't create a token for it.
  42. # This has the effect of stripping white space between tokens
  43. def t_whitespace(self, s):
  44. r'\s+'
  45. self.add_token('SPACE', s)
  46. pass
  47. def t_file_or_func(self, s):
  48. r'(?:[^*-+,\d\'"\t \n:][^\'"\t \n:,]*)|(?:^""".+""")|(?:\'\'\'.+\'\'\')'
  49. maybe_funcname = True
  50. if s == 'if':
  51. self.add_token('IF', s)
  52. return
  53. if s[0] in frozenset(('"', "'")):
  54. # Pick out text inside of triple-quoted string
  55. if ( (s.startswith("'''") and s.endswith("'''") ) or
  56. (s.startswith('"""') and s.endswith('"""') ) ):
  57. base = s[3:-3]
  58. else:
  59. # Pick out text inside singly-quote string
  60. base = s[1:-1]
  61. maybe_funcname = False
  62. else:
  63. base = s
  64. pos = self.pos
  65. if maybe_funcname and re.match('[a-zA-Z_][[a-zA-Z_.0-9\[\]]+\(\)', s):
  66. self.add_token('FUNCNAME', base)
  67. else:
  68. self.add_token('FILENAME', base)
  69. self.pos = pos + len(s)
  70. def t_single_quote_file(self, s):
  71. r"'[^'].+'"
  72. # Pick out text inside of singe-quoted string
  73. base = s[1:-1]
  74. self.add_token('FILENAME', base)
  75. self.pos += len(s)
  76. def t_double_quote_file(self, s):
  77. r'"[^"]+"'
  78. # Pick out text inside of singe-quoted string
  79. base = s[1:-1]
  80. self.add_token('FILENAME', base)
  81. self.pos += len(s)
  82. def t_colon(self, s):
  83. r':'
  84. # Used to separate a filename from a line number
  85. self.add_token('COLON', s)
  86. def t_comma(self, s):
  87. r','
  88. # Used in "list" to separate first from last
  89. self.add_token('COMMA', s)
  90. def t_direction(self, s):
  91. r'^[+-]$'
  92. # Used in the "list" command
  93. self.add_token('DIRECTION', s)
  94. # Recognize integers
  95. def t_number(self, s):
  96. r'\d+'
  97. pos = self.pos
  98. self.add_token('NUMBER', int(s))
  99. self.pos = pos + len(s)
  100. # Recognize list offsets (counts)
  101. def t_offset(self, s):
  102. r'[+]\d+'
  103. pos = self.pos
  104. self.add_token('OFFSET', s)
  105. self.pos = pos + len(s)
  106. # Recognize addresses (bytecode offsets)
  107. def t_address(self, s):
  108. r'[*]\d+'
  109. pos = self.pos
  110. self.add_token('ADDRESS', s)
  111. self.pos = pos + len(s)
  112. if __name__ == "__main__":
  113. for line in (
  114. '/tmp/foo.py:12',
  115. "'''/tmp/foo.py:12'''",
  116. "'/tmp/foo.py:12'",
  117. "6",
  118. "*6",
  119. # "/tmp/foo.py line 12",
  120. # "\"\"\"/tmp/foo.py's line 12\"\"\"",
  121. # "12",
  122. # "../foo.py:5",
  123. # "gcd()",
  124. "foo.py:5 if x > 1",
  125. "5 ,",
  126. "5,",
  127. "5,10",
  128. ",10",
  129. ):
  130. try:
  131. tokens = LocationScanner().tokenize(line.strip())
  132. for t in tokens:
  133. print(t)
  134. pass
  135. print('-' * 30)
  136. except ScannerError as e:
  137. print("Lexical error at or around: ")
  138. print(e.text)
  139. print(e.text_cursor)
  140. pass
  141. pass