# yichael/xhs-note-crawling
#
#  Copyright (c) 2022-2024 Rocky Bernstein
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""Here we have the top-level parse grammar types with their rules and the start symbols
for them.

Specific Python versions such as for Python 3.10 subclass these and
add in grammar rules that are custom to them.

However at the top-level they are all the same and share the same start symbol
and start-symbol grammar rule.

"""
# The below adds a special "start" rule for the kind of thing that we want to
# decompile

from typing import Union

from spark_parser import GenericASTBuilder

from decompyle3.parsers.treenode import SyntaxTree


def nop_func(self, args):
    """Do-nothing semantic action: ignore both arguments and yield ``None``.

    It is the action registered for grammar rules that are added on the fly,
    where no tree transformation is wanted.
    """
    return


class ParserError(Exception):
    """Raised when parsing cannot continue at some instruction.

    Attributes:
        token: the instruction/token at or near the failure; ``None`` when no
            instruction could be identified.
        offset: the offset reported for that instruction (``-1`` when unknown).
        debug: the debug flag supplied by whoever raised the error.
    """

    def __init__(self, token, offset: int, debug: bool):
        # Pass the arguments on to Exception so that ``self.args`` is
        # populated. copy/pickle rebuild an exception by calling the class
        # with ``args``; with an empty ``args`` that call would fail because
        # all three parameters are required.
        super().__init__(token, offset, debug)
        self.token = token
        self.offset = offset
        self.debug = debug

    def __str__(self) -> str:
        """Return the one-line, newline-terminated description of the error."""
        return "Parse error at or near `%r' instruction at offset %s\n" % (
            self.token,
            self.offset,
        )


class PythonBaseParser(GenericASTBuilder):
    """Common base for the Python grammar parsers defined in this module.

    On top of ``GenericASTBuilder`` (building ``SyntaxTree`` nodes) it keeps
    the bookkeeping shared by all parser variants: which nonterminals are
    flattened into lists, which singleton reductions are elided, which
    grammar rules were already added dynamically, plus helpers for debug
    output and parse-error reporting.
    """

    def __init__(self, debug_parser, start_symbol, is_lambda=False):
        """Initialise the tree builder and the reduction bookkeeping.

        debug_parser: debug settings, handed to GenericASTBuilder unchanged.
        start_symbol: grammar start symbol to parse for.
        is_lambda: True when parsing the body of a lambda expression.
        """

        # Note: order of debug_parser, and start_symbol is reverse from above.
        # This is because (at least at one time), start_symbol can be defaulted
        # in the setup, while debug_parser could have been but wasn't.
        GenericASTBuilder.__init__(self, SyntaxTree, start_symbol, debug_parser)

        # FIXME: customize per python parser version

        # These are the non-terminals we should collect into a list.
        # For example instead of:
        #   stmts -> stmts stmt -> stmts stmt stmt ...
        # collect as stmts -> stmt stmt ...
        nt_list = [
            "and_parts",
            "attributes",
            "add_consts",
            "dicts_unmap",
            "doms_end",
            "exprs",
            "kvlist",
            "kwargs",
            "lists",
            "or_parts",
            "stmts",
        ]
        self.collect = frozenset(nt_list)

        # For these items we need to keep the 1st epsilon reduction since
        # the nonterminal name is used in a semantic action.
        self.keep_epsilon = frozenset(("kvlist_n", "kvlist"))

        # ??? Do we need a debug option to skip eliding singleton reductions?
        # Time will tell if it is useful in debugging

        # FIXME: optional_nt is a misnomer. It's really about there being a
        # singleton reduction that we can simplify. It also happens to be optional
        # in its other derivation
        self.optional_nt |= frozenset(("suite_stmts", "c_stmts_opt", "stmt", "sstmt"))

        # Reduce singleton reductions in these nonterminals:
        # FIXME: would love to do sstmts, stmts and
        # so on but that would require major changes to the
        # semantic actions
        self.singleton = frozenset(("str", "store", "inplace_op"))
        # Instructions filled in from scanner
        self.insts = []

        # True if we are parsing inside a lambda expression.
        # Because a lambda expression is written on a single line, certain
        # line-oriented statements behave differently.
        self.is_lambda = is_lambda

        self.start_symbol = start_symbol
        # Rules already added through add_unique_rule().
        self.new_rules = set()

        # Placeholder for Python version tuple
        self.version = (None, None)

    def ast_first_offset(self, ast) -> Union[int, str]:
        """Return the ``offset`` of the first node that has one, descending
        through the first child of ``ast`` until such a node is found."""
        return ast.offset if hasattr(ast, "offset") else self.ast_first_offset(ast[0])

    def add_unique_rule(
        self, rule, opname: str, arg_count: int, customize: dict
    ) -> None:
        """Add ``rule`` to the grammar, but only if it hasn't been added
        previously.

        When the rule is new it is registered with a no-op semantic action
        and ``customize[opname]`` is set to ``arg_count``.  ``arg_count`` is
        used for custom opcodes like MAKE_FUNCTION to indicate how many
        arguments the opcode has. Often it is not used.
        """
        if rule not in self.new_rules:
            # print("XXX ", rule) # debug
            self.new_rules.add(rule)
            self.addRule(rule, nop_func)
            customize[opname] = arg_count

    def add_unique_rules(self, rules: list, customize: dict) -> None:
        """Add rules (a list of strings) to the grammar, skipping empty
        entries.

        The text before ``::=`` is used as the ``customize`` key and the
        argument count is always 0, so the rules must not be those that set
        arg_count in the custom dictionary.
        """
        for rule in rules:
            if not rule:
                continue
            opname = rule.split("::=")[0].strip()
            self.add_unique_rule(rule, opname, 0, customize)

    def add_unique_doc_rules(self, rules_str: str, customize: dict) -> None:
        """Add rules (a docstring-like, newline-separated list of rules) to
        the grammar.

        Note that the rules must not be those that set arg_count in the
        custom dictionary.
        """
        # print(rules_str)
        rules = [r.strip() for r in rules_str.split("\n")]
        self.add_unique_rules(rules, customize)

    def cleanup(self):
        """
        Remove recursive references to allow garbage
        collector to collect this object.
        """
        for table in (self.rule2func, self.rules, self.rule2name):
            for key in list(table.keys()):
                table[key] = None
        # Only instance attributes can hold references owned by this object.
        # Walking dir(self) instead would also visit ``__class__``, and
        # assigning None to that raises TypeError.
        for name in list(vars(self)):
            setattr(self, name, None)

    def debug_reduce(self, rule, tokens, parent, last_token_pos):
        """Customized format and print for our kind of tokens
        which gets called in debugging grammar reduce rules
        """

        def fix(c):
            # Keep only the text before the first "_" of the offset, if any.
            return str(c).partition("_")[0]

        prefix = ""
        if parent and tokens:
            p_token = tokens[parent]
            if hasattr(p_token, "linestart") and p_token.linestart:
                prefix = "L.%3d: " % p_token.linestart
            else:
                prefix = "       "
            if hasattr(p_token, "offset"):
                prefix += "%3s" % fix(p_token.offset)
                if len(rule[1]) > 1:
                    # Multi-symbol right-hand side: show the offset range.
                    prefix += "-%-3s " % fix(tokens[last_token_pos - 1].offset)
                else:
                    prefix += "     "
        else:
            prefix = "               "

        print("%s%s ::= %s (%d)" % (prefix, rule[0], " ".join(rule[1]), last_token_pos))

    def error(self, instructions, index):
        """Print the instructions of the source line containing
        ``instructions[index]`` (marking the failing one with ``->``) and
        raise ParserError.

        Raises:
            ParserError: always.  It carries the failing instruction and its
            offset, or ``None`` and ``-1`` when ``index`` is negative.
        """
        # Walk back to the instruction that starts the line holding the
        # failing instruction.  If none is flagged, start ends up at 0.
        start = -1
        for start in range(index, -1, -1):
            if instructions[start].linestart:
                break

        # Walk forward to the first instruction of the next line.  When there
        # is no later line start, the context runs to the end of the list so
        # that the failing instruction itself is never dropped.
        finish = len(instructions)
        for pos in range(index + 1, len(instructions)):
            if instructions[pos].linestart:
                finish = pos
                break

        if start >= 0:
            err_token = instructions[index]
            print("Instruction context:")
            for i in range(start, finish):
                indent = "-> " if i == index else "   "
                print("%s%s" % (indent, instructions[i]))
            raise ParserError(err_token, err_token.offset, self.debug["reduce"])
        else:
            raise ParserError(None, -1, self.debug["reduce"])

    def get_pos_kw(self, token):
        """Return the number of positional parameters and the number of
        keyword parameters encoded in the ``attr`` field of ``token``."""
        # Low byte indicates number of positional parameters,
        # high byte number of keyword parameters
        args_pos = token.attr & 0xFF
        args_kw = (token.attr >> 8) & 0xFF
        return args_pos, args_kw

    def nonterminal(self, nt, args):
        """Build the tree node for a reduction to ``nt`` with children
        ``args``, flattening list-like nonterminals and eliding selected
        singleton reductions."""
        n = len(args)

        # # Use this to find lots of singleton rule
        # if n == 1 and nt not in self.singleton:
        #     print("XXX", nt)

        if nt in self.collect and n > 1:
            #
            #  Collect iterated thingies together. That is rather than
            #  stmts -> stmts stmt -> stmts stmt -> ...
            #  stmts -> stmt stmt ...
            #
            if not hasattr(args[0], "append"):
                # Was in self.optional_nt as a single item, but we find we have
                # more than one now...
                rv = GenericASTBuilder.nonterminal(self, nt, [args[0]])
            else:
                rv = args[0]
            # In a list-like entity where the first item goes to epsilon,
            # drop that and save the 2nd item as the first one
            if len(rv) == 0 and nt not in self.keep_epsilon:
                rv = args[1]
            else:
                rv.append(args[1])
        elif n == 1 and args[0] in self.singleton:
            # NOTE(review): membership is tested on the child (args[0]), not
            # on nt, and the child itself is passed on as the argument list.
            rv = GenericASTBuilder.nonterminal(self, nt, args[0])
            del args[0]  # save memory
        elif n == 1 and nt in self.optional_nt:
            rv = args[0]
        else:
            rv = GenericASTBuilder.nonterminal(self, nt, args)
        return rv

    def off2inst(self, token):
        """
        Return the corresponding instruction for this token
        """
        # self.offset2inst_index is not set in __init__; presumably it is
        # filled in together with self.insts -- verify against the caller.
        offset = token.off2int(prefer_last=False)
        return self.insts[self.offset2inst_index[offset]]

    def __ambiguity(self, children):
        # only for debugging! to be removed hG/2000-10-15
        print(children)
        return GenericASTBuilder.ambiguity(self, children)

    def resolve(self, list):
        """Choose among ambiguous nonterminal names: prefer "function_def"
        over "assign" and "expr" over "grammar"; otherwise defer to
        GenericASTBuilder."""
        if len(list) == 2 and "function_def" in list and "assign" in list:
            return "function_def"
        if "grammar" in list and "expr" in list:
            return "expr"
        return GenericASTBuilder.resolve(self, list)


class PythonParserExpr(PythonBaseParser):
    """Parser whose start symbol is a single grammar expression, "expr".

    An expression is a smaller unit than a statement list, so this parser is
    something to fall back on when larger pieces of code can't decompile.
    """

    def __init__(self, debug_parser, start_symbol="expr_start"):
        # Everything is handled by the base class; this class only supplies
        # the default start symbol and the start rule below.
        super().__init__(debug_parser=debug_parser, start_symbol=start_symbol)

    # NOTE: the docstring below is grammar text, not documentation.
    def p_start_rule_expr(self, args):
        """
        expr_start       ::= expr return_value_opt
        return_value_opt ::= RETURN_VALUE?
        """


# Alias: the "eval" parser is the expression parser under another name.
PythonParserEval = PythonParserExpr


class PythonParserExec(PythonBaseParser):
    """
    Parser for compile-mode "exec" of the `compile()` builtin, or of the
    exec() builtin function.  Its default start symbol is "stmts".
    """

    # def p_exec(self, args):
    #     """
    #     stmts ::= stmt+
    #     """

    def __init__(self, debug_parser, start_symbol="stmts"):
        # No start rule of its own is defined here; the base class does all
        # of the setup with the chosen start symbol.
        super().__init__(debug_parser=debug_parser, start_symbol=start_symbol)


class PythonParserLambda(PythonBaseParser):
    """
    Parser for Python lambda definitions.
    """

    # lambda_start is the highest level nonterminal. However
    # we can pass in other nonterminals like "expr" for a different
    # parse.
    def __init__(self, debug_parser, start_symbol="lambda_start"):
        super().__init__(start_symbol=start_symbol, debug_parser=debug_parser)

    # NOTE: the docstring below is grammar text, not documentation.
    def p_start_rule_lambda(self, args):
        """
        lambda_start ::= return_expr_lambda
        """


class PythonParserSingle(PythonBaseParser):
    # Parser for "single" (interactive) compilation; the default start
    # symbol is "single_start".

    def __init__(self, debug_parser, start_symbol="single_start"):
        super().__init__(start_symbol=start_symbol, debug_parser=debug_parser)

    # NOTE: the docstring below is grammar text, not documentation.
    def p_start_rule_single(self, args):
        """
        # Single-mode interactive compilation
        single_start ::= expr PRINT_EXPR
        single_start ::= stmt
        """


class PythonParser(PythonBaseParser):
    """Parser whose start symbol is chosen by a ``compile()``-style mode
    string instead of by picking one of the mode-specific classes.

    NOTE(review): the start rules (``p_start_rule_*``) are defined on the
    mode-specific parser classes, which this class does not inherit from;
    presumably they have to be supplied by a subclass or added separately --
    confirm before relying on this class.
    """

    def __init__(self, compile_mode, debug_parser):
        """Initialise the parser for ``compile_mode``.

        compile_mode: one of "exec", "single", "lambda", "eval" or
            "eval_expr".
        debug_parser: debug settings, handed to the base class unchanged.

        Raises:
            BaseException: if ``compile_mode`` is not one of the above.
        """
        # Start symbol per mode; these are the default start symbols of the
        # corresponding mode-specific parser classes.
        start_symbols = {
            "single": "single_start",
            "lambda": "lambda_start",
            "eval": "expr_start",
            "exec": "stmts",
            "eval_expr": "expr_start",
        }
        if compile_mode not in start_symbols:
            raise BaseException(
                f'compile_mode should be either "exec", "single", "lambda", "eval", or "eval_expr"; got {compile_mode}'
            )

        # Initialise the real base class directly.  Going through the
        # __init__ of an unrelated sibling class cannot work: its
        # super(Sibling, self) call raises TypeError because ``self`` is not
        # an instance of that sibling.
        PythonBaseParser.__init__(
            self,
            debug_parser=debug_parser,
            start_symbol=start_symbols[compile_mode],
        )

        # FIXME: customize per python parser version

        # These are the non-terminals we should collect into a list.
        # For example instead of:
        #   stmts -> stmts stmt -> stmts stmt stmt ...
        # collect as stmts -> stmt stmt ...
        nt_list = [
            "_stmts",
            "and_parts",
            "attributes",
            "except_stmts",
            "exprlist",
            "importlist",
            "kvlist",
            "kwargs",
            "or_parts",
            # FIXME:
            # If we add c_stmts, we can miss adding a c_stmt,
            # test_float.py test_set_format() is an example.
            # Investigate
            # "c_stmts",
            "stmts",
            # Python 3.7+
            "importlist37",
        ]
        self.collect = frozenset(nt_list)

        # For these items we need to keep the 1st epsilon reduction since
        # the nonterminal name is used in a semantic action.
        self.keep_epsilon = frozenset(("kvlist_n", "kvlist"))

        # ??? Do we need a debug option to skip eliding singleton reductions?
        # Time will tell if it is useful in debugging

        # FIXME: optional_nt is a misnomer. It's really about there being a
        # singleton reduction that we can simplify. It also happens to be optional
        # in its other derivation
        self.optional_nt |= frozenset(("suite_stmts", "c_stmts_opt", "stmt", "sstmt"))

        # Reduce singleton reductions in these nonterminals:
        # FIXME: would love to do expr, sstmts, stmts and
        # so on but that would require major changes to the
        # semantic actions
        self.singleton = frozenset(
            ("str", "store", "_stmts", "suite_stmts_opt", "inplace_op")
        )
        # Instructions filled in from scanner
        self.insts = []

        # True if we are parsing inside a lambda expression.
        # Because a lambda expression is written on a single line, certain
        # line-oriented statements behave differently.
        self.is_lambda = False