| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
- """
- Simple SPARK-style scanner
- Copyright (c) 2017 Rocky Bernstein
- """
- from __future__ import print_function
- import re
- from spark_parser.scanner import GenericScanner
- from gdbloc.tok import Token
class ScannerError(Exception):
    """Lexical error raised while scanning a location string.

    Attributes:
        text: the text that was being scanned.
        text_cursor: a companion line (spaces followed by a caret) meant to
            be shown underneath ``text``.
    """

    def __init__(self, text, text_cursor):
        # Hand the arguments to Exception as well so that ``args`` is
        # populated; without this, copying or pickling the exception cannot
        # rebuild it because __init__ requires two positional arguments.
        super(ScannerError, self).__init__(text, text_cursor)
        self.text = text
        self.text_cursor = text_cursor

    def __str__(self):
        """Return the text and the cursor line separated by a newline."""
        return self.text + "\n" + self.text_cursor
class LocationScanner(GenericScanner):
    """Scanner that turns a location string into a list of ``Token`` objects.

    Tokens are collected in ``self.rv``; each token records the value of
    ``self.pos`` at the time it was added as its ``offset``.
    """

    # Name made of letters, digits, "_", ".", "[" and "]" followed by "()",
    # for example "gcd()". Written as a raw string so that the backslash
    # escapes reach the regex engine untouched.
    # NOTE(review): the "+" requires at least two characters before "()",
    # so a one-letter name such as "f()" is not treated as a function name.
    # Confirm whether that is intended.
    _funcname_re = re.compile(r'[a-zA-Z_][a-zA-Z_.0-9\[\]]+\(\)')

    def error(self, s):
        """Raise ScannerError with the text and a caret line for it.

        The caret line is ``self.pos - 1`` spaces followed by "^", so that
        printing it under the text gives, for example:

            x = 2y + z
                 ^

        Raises:
            ScannerError: always.
        """
        raise ScannerError(("%s" % s),
                           ("%s^" % (" " * (self.pos - 1))))

    def tokenize(self, input):
        """Scan *input* and return the list of tokens that were added."""
        self.rv = []
        GenericScanner.tokenize(self, input)
        return self.rv

    def add_token(self, name, v):
        """Append a token of kind *name* and value *v* at the current offset.

        ``self.pos`` is advanced by ``len(str(v))``. When the token value is
        shorter or longer than the matched text, use ``_add_token_for``.
        """
        t = Token(kind=name, value=v, offset=self.pos)
        self.pos += len(str(v))
        self.rv.append(t)

    def _add_token_for(self, name, v, matched):
        """Add a token, then advance ``self.pos`` by ``len(matched)`` only.

        This keeps the position in step with the input when the stored value
        differs from the matched text (quotes stripped, int conversion).
        """
        start = self.pos
        self.add_token(name, v)
        self.pos = start + len(matched)

    # The function names below begin with 't_'.
    # This indicates to GenericScanner that these routines
    # form the tokens. GenericScanner introspects on the
    # method names of this class and the docstrings to come
    # up with both the names of the tokens and the regular expressions
    # that make up those tokens. The docstrings of the t_ methods are
    # therefore regular expressions and must not carry any other text.

    # Recognize a run of white space and emit it as a SPACE token.
    def t_whitespace(self, s):
        r'\s+'
        self.add_token('SPACE', s)

    # Recognize the keyword "if", a function name such as "gcd()", or a
    # file name, which may be wrapped in triple quotes.
    def t_file_or_func(self, s):
        r'(?:[^*-+,\d\'"\t \n:][^\'"\t \n:,]*)|(?:^""".+""")|(?:\'\'\'.+\'\'\')'
        if s == 'if':
            self.add_token('IF', s)
            return

        maybe_funcname = True
        if s[0] in ('"', "'"):
            if ((s.startswith("'''") and s.endswith("'''")) or
                    (s.startswith('"""') and s.endswith('"""'))):
                # Pick out the text inside of a triple-quoted string
                base = s[3:-3]
            else:
                # Pick out the text inside of a singly-quoted string
                base = s[1:-1]
            # Quoted text is always taken to be a file name.
            maybe_funcname = False
        else:
            base = s

        if maybe_funcname and self._funcname_re.match(s):
            kind = 'FUNCNAME'
        else:
            kind = 'FILENAME'
        self._add_token_for(kind, base, s)

    # NOTE(review): unlike the double-quote pattern below, this one uses
    # ".+" after the first character, so it needs at least two characters
    # between the quotes and runs to the last quote in the input. Confirm
    # that this is intended.
    def t_single_quote_file(self, s):
        r"'[^'].+'"
        # Pick out the text inside of the single-quoted string. The
        # position moves past the whole match, quotes included, exactly once.
        self._add_token_for('FILENAME', s[1:-1], s)

    def t_double_quote_file(self, s):
        r'"[^"]+"'
        # Pick out the text inside of the double-quoted string. The
        # position moves past the whole match, quotes included, exactly once.
        self._add_token_for('FILENAME', s[1:-1], s)

    def t_colon(self, s):
        r':'
        # Used to separate a filename from a line number
        self.add_token('COLON', s)

    def t_comma(self, s):
        r','
        # Used in "list" to separate first from last
        self.add_token('COMMA', s)

    def t_direction(self, s):
        r'^[+-]$'
        # Used in the "list" command
        self.add_token('DIRECTION', s)

    # Recognize integers
    def t_number(self, s):
        r'\d+'
        # The value is stored as an int; advance by the matched text so
        # that leading zeros are still accounted for.
        self._add_token_for('NUMBER', int(s), s)

    # Recognize list offsets (counts)
    def t_offset(self, s):
        r'[+]\d+'
        self._add_token_for('OFFSET', s, s)

    # Recognize addresses (bytecode offsets)
    def t_address(self, s):
        r'[*]\d+'
        self._add_token_for('ADDRESS', s, s)
if __name__ == "__main__":
    # Demo: scan a handful of sample location strings and print the tokens
    # found in each, followed by a separator line.
    #
    # Further sample inputs that are currently disabled:
    #   "/tmp/foo.py line 12"
    #   "\"\"\"/tmp/foo.py's line 12\"\"\""
    #   "12"
    #   "../foo.py:5"
    #   "gcd()"
    samples = (
        '/tmp/foo.py:12',
        "'''/tmp/foo.py:12'''",
        "'/tmp/foo.py:12'",
        "6",
        "*6",
        "foo.py:5 if x > 1",
        "5 ,",
        "5,",
        "5,10",
        ",10",
    )
    for line in samples:
        try:
            for t in LocationScanner().tokenize(line.strip()):
                print(t)
            print('-' * 30)
        except ScannerError as e:
            # Show the offending text with the caret line underneath it.
            print("Lexical error at or around: ")
            print(e.text)
            print(e.text_cursor)
|