yichael
/
xhs-note-crawling


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
							"""
Simple SPARK-style scanner
Copyright (c) 2016 Rocky Bernstein
"""

import re

from spark_parser.example.python2.py2_token import PythonToken

# from __future__ import print_function
from spark_parser.scanner import GenericScanner

# Words that the scanner's name rule reports under their own token type
# (the upper-cased word) rather than as a generic NAME.
RESERVED_WORDS = """and as assert break class continue def del eval exec else elif for from global
if in import lambda or pass print return while with yield None""".split()

# Each bracket character paired with the token name emitted for it.
BRACKET2NAME = dict(
    [
        ("(", "LPAREN"),
        (")", "RPAREN"),
        ("{", "LBRACE"),
        ("}", "RBRACE"),
        ("[", "LBRACKET"),
        ("]", "RBRACKET"),
    ]
)

# Each single-character punctuation symbol paired with its token name.
SYMBOL2NAME = dict(
    [
        ("@", "AT"),
        ("`", "BACKTICK"),
        (":", "COLON"),
        (",", "COMMA"),
        (".", "DOT"),
    ]
)

# End-of-input sentinel appended to the text handed to the scanner: ctrl-d
# (ASCII EOT). Written as an escape rather than a raw control character so it
# stays visible and survives editors and tools that strip control characters.
ENDMARKER = "\x04"  # ctrl-d


class Python2Scanner(GenericScanner):
    """Scanner that turns Python 2 source text into a list of PythonToken objects.

    Methods whose names begin with ``t_`` define the tokens: GenericScanner
    introspects the method names and their docstrings to obtain the token
    rules, so the docstring of every ``t_`` method is a regular expression
    (i.e. code, not documentation) and is explained in a comment above the
    method instead.

    NOTE(review): the ``t_`` methods are kept in their original definition
    order; presumably the base class tries the patterns in that order --
    confirm before reordering.
    """

    # These can appear as unary operators. Some are also binary operators
    UNOP2NAME = {"+": "PLUS", "-": "MINUS", "~": "TILDE"}

    # Operator spellings grouped by the token type the grammar expects.
    _COMP_OPS = frozenset(("<", ">", "==", ">=", "<=", "<>", "!="))
    _AUGASSIGN_OPS = frozenset(
        ("+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "**=", "//=")
    )
    # These are *ONLY* binary operators. ("**" is listed for completeness; the
    # t_op pattern itself never produces a bare "**".)
    _BINARY_OPS = frozenset(("|", "^", "&", "<<", ">>", "**", "/", "%", "//"))

    def error(self, s, pos):
        """Show text and a caret under that. For example:
        x = 2y + z
             ^

        Also prints the tokens collected so far, then raises SystemExit.
        """
        print("Lexical error:")
        print("%s" % s[: pos + 10])  # + 10 for trailing context
        print("%s^" % (" " * (pos - 1)))
        for t in self.rv:
            print(t)
        raise SystemExit

    def __init__(self):
        self._reset_state()
        GenericScanner.__init__(self)

    def _reset_state(self):
        """Put the line/column/indentation bookkeeping back to start-of-input."""
        # True when the previous token ended a line, i.e. the next text seen
        # is at the start of a line and its leading whitespace is indentation.
        self.is_newline = True
        # Stack of currently open indentation widths; the bottom entry is 0.
        self.indents = [0]
        self.lineno = 1
        self.column = 0

    def tokenize(self, string):
        """Scan *string* and return the list of tokens found.

        The position and indentation state is reset first, so one scanner
        instance can be reused for several inputs without line numbers or
        open indentation levels leaking from one input into the next.
        """
        self._reset_state()
        self.rv = []
        GenericScanner.tokenize(self, string)
        return self.rv

    def add_token(self, name, s, is_newline=False):
        """Append a token of type *name* with text *s* to the result list.

        The column is advanced by ``len(s)`` before the token is built.
        When *is_newline* is true the line counter is advanced and the column
        reset for whatever follows. If the previous token ended a line and
        this token is not itself an INDENT/DEDENT, this token starts at
        column 0, so every open indentation level is closed with a DEDENT
        token first.
        """
        self.column += len(s)
        t = PythonToken(name, s, self.lineno, self.column)
        if is_newline:
            self.lineno += 1
            self.column = 0
        if self.is_newline and name not in ("DEDENT", "INDENT"):
            while self.indents[-1] > 0:
                self.indents.pop()
                self.rv.append(PythonToken("DEDENT", "", self.lineno, self.column))
        self.is_newline = is_newline
        self.rv.append(t)

    # The function names below begin with 't_'.
    # This indicates to GenericScanner that these routines
    # form the tokens. GenericScanner introspects on the
    # method names of this class and the docstrings to come
    # up with both the names of the tokens and the regular expressions
    # that make up those tokens

    # Any single bracket character.
    def t_paren(self, s):
        r"[(){}[\]]"
        self.add_token(BRACKET2NAME[s], s)

    # Single-character punctuation.
    def t_symbol(self, s):
        r"[@:,.`]"
        self.add_token(SYMBOL2NAME[s], s)

    # The ctrl-d (ASCII EOT) end-of-input sentinel. Spelled as an escape so
    # the pattern cannot silently become empty if the control character is
    # stripped from the file; an empty pattern would match at every position.
    def t_endmarker(self, s):
        r"\x04"
        self.add_token("ENDMARKER", s)

    # Operators, longest spellings first. "^" is escaped in "\^=" because a
    # bare "^" is a start-of-string anchor; "!=", "<>" and "|" are included
    # because the classification below expects them.
    def t_op(self, s):
        r"\+=|-=|\*=|/=|%=|&=|\|=|\^=|<<=|>>=|\*\*=|//=|//|==|!=|<>|<=|>=|<<|>>|[<>%^&|+/=~-]"

        # Operators need to be further classified since the grammar requires this
        if s in self._COMP_OPS:
            self.add_token("COMP_OP", s)
        elif s in self._AUGASSIGN_OPS:
            self.add_token("AUGASSIGN", s)
        elif s in self.UNOP2NAME:
            self.add_token(self.UNOP2NAME[s], s)
        elif s in self._BINARY_OPS:
            # Operators which are exclusively or can be unary operators were
            # handled previously
            self.add_token("BINOP", s)
        elif s == "=":
            self.add_token("EQUAL", s)
        else:
            print("Internal error: Unknown operator %s" % s)
            raise SystemExit

    # Statement separator.
    def t_linesep(self, s):
        r";"
        self.add_token("SEMICOLON", s)

    # End of line.
    def t_nl(self, s):
        r"\n"
        self.add_token("NEWLINE", s, is_newline=True)

    # Identifiers; reserved words get their own upper-cased token type.
    def t_name(self, s):
        r"[A-Za-z_][A-Za-z_0-9]*"
        if s in RESERVED_WORDS:
            self.add_token(s.upper(), s)
        else:
            self.add_token("NAME", s)

    # A way to handle the problem of having to match two different
    # tokens with a single regular expression.
    # We can't have two separate defs because then it would be indeterminate
    # whether we get two single stars or one double star.
    def t_star_star(self, s):
        r"\*\*?"
        token_name = "STARSTAR" if len(s) == 2 else "STAR"
        self.add_token(token_name, s)

    # CONSTANTS
    # ---------

    # Triple-quoted forms come first and are non-greedy so that each string
    # ends at its own closing quotes rather than at the last triple quote in
    # the whole input.
    def t_string(self, s):
        r"([\"]{3}(.|[\n])*?[\"]{3})|('{3}(.|[\n])*?'{3})|('[^']*')|(\"[^\"]*\")"
        self.add_token("STRING", s)

    # numbers; int, float, and complex

    # Note we have to put longer matches earlier. Specifically radix notation and
    # fixed-point notation. The fraction takes all of its digits so that
    # "10.55" is one number rather than "10.5" followed by "5".
    def t_number(self, s):
        r"(0x[0-9a-f]+|0b[01]+|0o[0-7]+|\d+\.\d+|\d+)j?"
        self.add_token("NUMBER", s)

    # Ugh. Handle Python's indent/dedent mess.
    def handle_indent_dedent(self, s):
        """Compare leading whitespace *s* with the current indentation level.

        Emits one INDENT when *s* is longer than the innermost open level,
        one DEDENT per level closed when it is shorter, and nothing when it
        is equal. The width is ``len(s)``, so a tab counts as one column.
        In every case the scanner is no longer at the start of a line
        afterwards.
        """
        indent = len(s)
        if indent > self.indents[-1]:
            self.add_token("INDENT", s)
            self.indents.append(indent)
        else:
            # May need several levels of dedent
            while indent < self.indents[-1]:
                self.indents.pop()
                self.add_token("DEDENT", s)
        self.is_newline = False

    # Combine comment and whitespace because we want to
    # capture the space before a comment.
    # A comment runs to the end of its line: it takes at most one newline and
    # never the ctrl-d end marker.
    def t_whitespace_or_comment(self, s):
        r"([ \t]*[#][^\n\x04]*[\n]?)|([ \t]+)"
        if "#" not in s:
            # Plain whitespace only matters as indentation at a line start.
            if self.is_newline:
                self.handle_indent_dedent(s)
            return

        # We have a comment
        matches = re.match(r"(\s+)(.*[\n]?)", s)
        if matches and self.is_newline:
            self.handle_indent_dedent(matches.group(1))
            s = matches.group(2)
        if s.endswith("\n"):
            self.add_token("COMMENT", s[:-1])
            # This newline ends the line just like one matched by t_nl, so it
            # must advance the line counter and re-arm indentation handling.
            self.add_token("NEWLINE", "\n", is_newline=True)
        else:
            self.add_token("COMMENT", s)


if __name__ == "__main__":
    # Small demo: scan a sample program and dump the resulting tokens.
    scan = Python2Scanner()

    def showit(expr):
        """Echo *expr*, print every token scanned from it, then a separator."""
        print(expr)
        for token in scan.tokenize(expr + ENDMARKER):
            print(token)
        print("-" * 30)

    # showit("1 # hi")
    showit("def foo():\n    # comment\n    return\n")
#    showit("(10.5 + 2 / 30) // 3 >> 1")
#    showit("1 + 2")
#     showit("""
# () { } + - 'abc' \"abc\" 10 10j 0x10 # foo
# # bar
# """)
#     showit("""
# for i in range(x):
#   if True:
#      pass
#   pass
# pass""")
#     showit("""
# for i in range(x):
#     while True:
#        break
# """)