# yichael/xhs-note-crawling
"""
Simple SPARK-style scanner
Copyright (c) 2017 Rocky Bernstein
"""

from __future__ import print_function
import re
from spark_parser.scanner import GenericScanner
from gdbloc.tok import Token

class ScannerError(Exception):
    """Lexical error carrying the offending text and a caret-marker line.

    Attributes:
      text:        the input text that was being scanned.
      text_cursor: a line intended to be shown under ``text``, marking
                   where scanning failed.
    """

    def __init__(self, text, text_cursor):
        self.text, self.text_cursor = text, text_cursor

    def __str__(self):
        # Two lines: the text, then the marker line beneath it.
        return "\n".join((self.text, self.text_cursor))

class LocationScanner(GenericScanner):
    """Scanner that turns a location string into a list of Token objects.

    Token kinds produced by the rules below: SPACE, IF, FUNCNAME, FILENAME,
    COLON, COMMA, DIRECTION, NUMBER, OFFSET and ADDRESS.

    ``self.pos`` is the scanner's current position in the input; each token
    records the value of ``self.pos`` at the time it was created as its
    ``offset``.
    """

    # Matches text that looks like a function call with an empty argument
    # list, e.g. "gcd()" or "a.b[0]()".  Only anchored at the start
    # (used with .match()).
    _FUNCNAME_RE = re.compile(r'[a-zA-Z_][\[a-zA-Z_.0-9\]]+\(\)')

    def error(self, s):
        """Raise ScannerError with the text and a caret line under it.

        For example:
            x = 2y + z
                 ^
        """
        # NOTE(review): the caret is placed after (self.pos - 1) spaces, i.e.
        # one column to the left of self.pos (and at column 0 when self.pos
        # is 0 or 1) -- confirm this matches how GenericScanner reports pos.
        raise ScannerError("%s" % s,
                           "%s^" % (" " * (self.pos - 1)))

    def tokenize(self, input):
        """Scan ``input`` and return the list of tokens collected."""
        self.rv = []
        GenericScanner.tokenize(self, input)
        return self.rv

    def add_token(self, name, v):
        """Append a token of kind ``name`` with value ``v`` at the current
        position, then advance the position by the length of ``str(v)``."""
        t = Token(kind=name, value=v, offset=self.pos)
        self.pos += len(str(v))
        self.rv.append(t)

    def _add_token_consuming(self, name, v, matched):
        """Add a token whose value ``v`` may differ in length from the
        ``matched`` input text (stripped quotes, int conversion, ...).

        The position is advanced by the length of the matched text, not by
        the length of the token value.
        """
        start = self.pos
        self.add_token(name, v)
        self.pos = start + len(matched)

    # The function names below begin with 't_'.
    # This indicates to GenericScanner that these routines
    # form the tokens. GenericScanner introspects on the
    # method names of this class and the docstrings to come
    # up with both the names of the tokens and the regular expressions
    # that make up those tokens.
    #
    # Because the docstrings ARE the regular expressions, they must not
    # be edited for documentation purposes; use comments instead.

    # Recognize white space and emit it as a SPACE token.
    def t_whitespace(self, s):
        r'\s+'
        self.add_token('SPACE', s)

    # Recognize the keyword "if", a function name ("name()"), or a file
    # name, optionally triple-quoted.
    #
    # NOTE(review): in the first character class, "*-+" is a *range* from
    # '*' to '+', so it excludes only '*' and '+'; a leading '-' is still
    # accepted here.  If the intent was to exclude '-' as well (so that
    # t_direction can see it), the class needs "\-" -- confirm before
    # changing, since that alters what tokenizes as a FILENAME.
    def t_file_or_func(self, s):
        r'(?:[^*-+,\d\'"\t \n:][^\'"\t \n:,]*)|(?:^""".+""")|(?:\'\'\'.+\'\'\')'
        if s == 'if':
            self.add_token('IF', s)
            return

        maybe_funcname = True
        base = s
        if s.startswith(('"', "'")):
            # Quoted text is always treated as a file name.
            maybe_funcname = False
            if ((s.startswith("'''") and s.endswith("'''")) or
                    (s.startswith('"""') and s.endswith('"""'))):
                # Pick out text inside of triple-quoted string
                base = s[3:-3]
            else:
                # Pick out text inside singly-quoted string
                base = s[1:-1]

        if maybe_funcname and self._FUNCNAME_RE.match(s):
            kind = 'FUNCNAME'
        else:
            kind = 'FILENAME'
        self._add_token_consuming(kind, base, s)

    # Recognize a single-quoted file name.
    #
    # NOTE(review): the pattern needs at least two characters between the
    # quotes and ".+" is greedy, so it runs to the LAST single quote in
    # the input -- confirm that is intended.
    def t_single_quote_file(self, s):
        r"'[^'].+'"
        # Pick out text inside of single-quoted string.  The position must
        # advance by the full matched length (quotes included), exactly once.
        self._add_token_consuming('FILENAME', s[1:-1], s)

    # Recognize a double-quoted file name.
    def t_double_quote_file(self, s):
        r'"[^"]+"'
        # Pick out text inside of double-quoted string.  The position must
        # advance by the full matched length (quotes included), exactly once.
        self._add_token_consuming('FILENAME', s[1:-1], s)

    def t_colon(self, s):
        r':'
        # Used to separate a filename from a line number
        self.add_token('COLON', s)

    def t_comma(self, s):
        r','
        # Used in "list" to separate first from last
        self.add_token('COMMA', s)

    def t_direction(self, s):
        r'^[+-]$'
        # Used in the "list" command
        self.add_token('DIRECTION', s)

    # Recognize integers
    def t_number(self, s):
        r'\d+'
        # The token value is an int, whose string form can be shorter than
        # the matched text (e.g. leading zeros), so advance by len(s).
        self._add_token_consuming('NUMBER', int(s), s)

    # Recognize list offsets (counts)
    def t_offset(self, s):
        r'[+]\d+'
        self._add_token_consuming('OFFSET', s, s)

    # Recognize addresses (bytecode offsets)
    def t_address(self, s):
        r'[*]\d+'
        self._add_token_consuming('ADDRESS', s, s)


if __name__ == "__main__":
    # Demo: tokenize a handful of sample location strings and print the
    # resulting tokens, or the error text and caret line on a lexical error.
    samples = (
        '/tmp/foo.py:12',
        "'''/tmp/foo.py:12'''",
        "'/tmp/foo.py:12'",
        "6",
        "*6",
        # Disabled samples:
        # "/tmp/foo.py line 12",
        # "\"\"\"/tmp/foo.py's line 12\"\"\"",
        # "12",
        # "../foo.py:5",
        # "gcd()",
        "foo.py:5 if x > 1",
        "5 ,",
        "5,",
        "5,10",
        ",10",
    )
    separator = '-' * 30

    for sample in samples:
        try:
            for tok in LocationScanner().tokenize(sample.strip()):
                print(tok)
            print(separator)
        except ScannerError as err:
            print("Lexical error at or around: ")
            print(err.text)
            print(err.text_cursor)