parse_heads.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. # Copyright (c) 2022-2024 Rocky Bernstein
  2. #
  3. # This program is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. """Here we have the top-level parse grammar types with their rules and the start symbols
  16. for them.
  17. Specific Python versions such as for Python 3.10 subclass these and
  18. add in grammar rules that are custom to them.
  19. However at the top-level they are all the same and share the same start symbol
  20. and start-symbol grammar rule.
  21. """
  22. # The below adds a special "start" rule for the kind of thing that we want to
  23. # decompile
  24. from typing import Union
  25. from spark_parser import GenericASTBuilder
  26. from decompyle3.parsers.treenode import SyntaxTree
  27. def nop_func(self, args):
  28. return None
  29. class ParserError(Exception):
  30. def __init__(self, token, offset: int, debug: bool):
  31. self.token = token
  32. self.offset = offset
  33. self.debug = debug
  34. def __str__(self) -> str:
  35. return "Parse error at or near `%r' instruction at offset %s\n" % (
  36. self.token,
  37. self.offset,
  38. )
  39. class PythonBaseParser(GenericASTBuilder):
  40. def __init__(self, debug_parser, start_symbol, is_lambda=False):
  41. # Note: order of debug_parser, and start_symbol is reverse from above.
  42. # This is because (at least at one time), start_symbol can be defaulted
  43. # in the setup, while debug_parser could have been but wasn't.
  44. GenericASTBuilder.__init__(self, SyntaxTree, start_symbol, debug_parser)
  45. # FIXME: customize per python parser version
  46. # These are the non-terminals we should collect into a list.
  47. # For example instead of:
  48. # stmts -> stmts stmt -> stmts stmt stmt ...
  49. # collect as stmts -> stmt stmt ...
  50. nt_list = [
  51. "and_parts",
  52. "attributes",
  53. "add_consts",
  54. "dicts_unmap",
  55. "doms_end",
  56. "exprs",
  57. "kvlist",
  58. "kwargs",
  59. "lists",
  60. "or_parts",
  61. "stmts",
  62. ]
  63. self.collect = frozenset(nt_list)
  64. # For these items we need to keep the 1st epslion reduction since
  65. # the nonterminal name is used in a semantic action.
  66. self.keep_epsilon = frozenset(("kvlist_n", "kvlist"))
  67. # ??? Do we need a debug option to skip eliding singleton reductions?
  68. # Time will tell if it if useful in debugging
  69. # FIXME: optional_nt is a misnomer. It's really about there being a
  70. # singleton reduction that we can simplify. It also happens to be optional
  71. # in its other derivation
  72. self.optional_nt |= frozenset(("suite_stmts", "c_stmts_opt", "stmt", "sstmt"))
  73. # Reduce singleton reductions in these nonterminals:
  74. # FIXME: would love to do sstmts, stmts and
  75. # so on but that would require major changes to the
  76. # semantic actions
  77. self.singleton = frozenset(("str", "store", "inplace_op"))
  78. # Instructions filled in from scanner
  79. self.insts = []
  80. # True if we are parsing inside a lambda expression.
  81. # because a lambda expression are written on a single line, certain line-oriented
  82. # statements behave differently
  83. self.is_lambda = is_lambda
  84. self.start_symbol = start_symbol
  85. self.new_rules = set()
  86. # Placeholder for Python version tuple
  87. self.version = (None, None)
  88. def ast_first_offset(self, ast) -> Union[int, str]:
  89. return ast.offset if hasattr(ast, "offset") else self.ast_first_offset(ast[0])
  90. def add_unique_rule(
  91. self, rule, opname: str, arg_count: int, customize: dict
  92. ) -> None:
  93. """Add rule to grammar, but only if it hasn't been added previously
  94. opname and stack_count are used in the customize() semantic
  95. the actions to add the semantic action rule. Stack_count is
  96. used in custom opcodes like MAKE_FUNCTION to indicate how
  97. many arguments it has. Often it is not used.
  98. """
  99. if rule not in self.new_rules:
  100. # print("XXX ", rule) # debug
  101. self.new_rules.add(rule)
  102. self.addRule(rule, nop_func)
  103. customize[opname] = arg_count
  104. pass
  105. return
  106. def add_unique_rules(self, rules: list, customize: dict) -> None:
  107. """Add rules (a list of string) to grammar. Note that
  108. the rules must not be those that set arg_count in the
  109. custom dictionary.
  110. """
  111. for rule in rules:
  112. if len(rule) == 0:
  113. continue
  114. opname = rule.split("::=")[0].strip()
  115. self.add_unique_rule(rule, opname, 0, customize)
  116. return
  117. def add_unique_doc_rules(self, rules_str: str, customize: dict) -> None:
  118. """Add rules (a docstring-like list of rules) to grammar.
  119. Note that the rules must not be those that set arg_count in the
  120. custom dictionary.
  121. """
  122. # print(rules_str)
  123. rules = [r.strip() for r in rules_str.split("\n")]
  124. self.add_unique_rules(rules, customize)
  125. return
  126. def cleanup(self):
  127. """
  128. Remove recursive references to allow garbage
  129. collector to collect this object.
  130. """
  131. for dict in (self.rule2func, self.rules, self.rule2name):
  132. for i in list(dict.keys()):
  133. dict[i] = None
  134. for i in dir(self):
  135. setattr(self, i, None)
  136. def debug_reduce(self, rule, tokens, parent, last_token_pos):
  137. """Customized format and print for our kind of tokens
  138. which gets called in debugging grammar reduce rules
  139. """
  140. def fix(c):
  141. s = str(c)
  142. last_token_pos = s.find("_")
  143. if last_token_pos == -1:
  144. return s
  145. else:
  146. return s[:last_token_pos]
  147. prefix = ""
  148. if parent and tokens:
  149. p_token = tokens[parent]
  150. if hasattr(p_token, "linestart") and p_token.linestart:
  151. prefix = "L.%3d: " % p_token.linestart
  152. else:
  153. prefix = " "
  154. if hasattr(p_token, "offset"):
  155. prefix += "%3s" % fix(p_token.offset)
  156. if len(rule[1]) > 1:
  157. prefix += "-%-3s " % fix(tokens[last_token_pos - 1].offset)
  158. else:
  159. prefix += " "
  160. else:
  161. prefix = " "
  162. print("%s%s ::= %s (%d)" % (prefix, rule[0], " ".join(rule[1]), last_token_pos))
  163. def error(self, instructions, index):
  164. # Find the last line boundary
  165. start, finish = -1, -1
  166. for start in range(index, -1, -1):
  167. if instructions[start].linestart:
  168. break
  169. pass
  170. for finish in range(index + 1, len(instructions)):
  171. if instructions[finish].linestart:
  172. break
  173. pass
  174. if start >= 0:
  175. err_token = instructions[index]
  176. print("Instruction context:")
  177. for i in range(start, finish):
  178. if i != index:
  179. indent = " "
  180. else:
  181. indent = "-> "
  182. print("%s%s" % (indent, instructions[i]))
  183. raise ParserError(err_token, err_token.offset, self.debug["reduce"])
  184. else:
  185. raise ParserError(None, -1, self.debug["reduce"])
  186. def get_pos_kw(self, token):
  187. """Return then the number of positional parameters and
  188. represented by the attr field of token"""
  189. # Low byte indicates number of positional parameters,
  190. # high byte number of keyword parameters
  191. args_pos = token.attr & 0xFF
  192. args_kw = (token.attr >> 8) & 0xFF
  193. return args_pos, args_kw
  194. def nonterminal(self, nt, args):
  195. n = len(args)
  196. # # Use this to find lots of singleton rule
  197. # if n == 1 and nt not in self.singleton:
  198. # print("XXX", nt)
  199. if nt in self.collect and n > 1:
  200. #
  201. # Collect iterated thingies together. That is rather than
  202. # stmts -> stmts stmt -> stmts stmt -> ...
  203. # stmms -> stmt stmt ...
  204. #
  205. if not hasattr(args[0], "append"):
  206. # Was in self.optional_nt as a single item, but we find we have
  207. # more than one now...
  208. rv = GenericASTBuilder.nonterminal(self, nt, [args[0]])
  209. else:
  210. rv = args[0]
  211. pass
  212. # In a list-like entity where the first item goes to epsilon,
  213. # drop that and save the 2nd item as the first one
  214. if len(rv) == 0 and nt not in self.keep_epsilon:
  215. rv = args[1]
  216. else:
  217. rv.append(args[1])
  218. elif n == 1 and args[0] in self.singleton:
  219. rv = GenericASTBuilder.nonterminal(self, nt, args[0])
  220. del args[0] # save memory
  221. elif n == 1 and nt in self.optional_nt:
  222. rv = args[0]
  223. else:
  224. rv = GenericASTBuilder.nonterminal(self, nt, args)
  225. return rv
  226. def off2inst(self, token):
  227. """
  228. Return the corresponding instruction for this token
  229. """
  230. offset = token.off2int(prefer_last=False)
  231. return self.insts[self.offset2inst_index[offset]]
  232. def __ambiguity(self, children):
  233. # only for debugging! to be removed hG/2000-10-15
  234. print(children)
  235. return GenericASTBuilder.ambiguity(self, children)
  236. def resolve(self, list):
  237. if len(list) == 2 and "function_def" in list and "assign" in list:
  238. return "function_def"
  239. if "grammar" in list and "expr" in list:
  240. return "expr"
  241. return GenericASTBuilder.resolve(self, list)
  242. class PythonParserExpr(PythonBaseParser):
  243. """This corresponds to a single grammar expression: "expr". It matches smaller
  244. units, so it is something to parse for that might be used when larger
  245. pieces of code can't decompile.
  246. """
  247. def p_start_rule_expr(self, args):
  248. """
  249. expr_start ::= expr return_value_opt
  250. return_value_opt ::= RETURN_VALUE?
  251. """
  252. def __init__(self, debug_parser, start_symbol="expr_start"):
  253. super(PythonParserExpr, self).__init__(
  254. debug_parser=debug_parser, start_symbol=start_symbol
  255. )
  256. PythonParserEval = PythonParserExpr
  257. class PythonParserExec(PythonBaseParser):
  258. """
  259. This corresponds to the compile-mode == "exec" of the `compile()` builtin
  260. or exec() builtin function
  261. """
  262. # def p_exec(self, args):
  263. # """
  264. # stmts ::= stmt+
  265. # """
  266. def __init__(self, debug_parser, start_symbol="stmts"):
  267. super(PythonParserExec, self).__init__(
  268. debug_parser=debug_parser, start_symbol=start_symbol
  269. )
  270. class PythonParserLambda(PythonBaseParser):
  271. """
  272. This corresponds to the Python lambda definitions
  273. """
  274. def p_start_rule_lambda(self, args):
  275. """
  276. lambda_start ::= return_expr_lambda
  277. """
  278. # lambda_start is the highest level nonterminal. However
  279. # we can pass in other nonterminals like "expr" for a different
  280. # parse.
  281. def __init__(self, debug_parser, start_symbol="lambda_start"):
  282. super(PythonParserLambda, self).__init__(
  283. start_symbol=start_symbol, debug_parser=debug_parser
  284. )
  285. class PythonParserSingle(PythonBaseParser):
  286. def p_start_rule_single(self, args):
  287. """
  288. # Single-mode interactive compilation
  289. single_start ::= expr PRINT_EXPR
  290. single_start ::= stmt
  291. """
  292. def __init__(self, debug_parser, start_symbol="single_start"):
  293. super(PythonParserSingle, self).__init__(
  294. start_symbol=start_symbol, debug_parser=debug_parser
  295. )
  296. class PythonParser(PythonBaseParser):
  297. def __init__(self, compile_mode, debug_parser):
  298. # FIXME: go over.
  299. if compile_mode == "single":
  300. PythonParserSingle.__init__(self, debug_parser=debug_parser)
  301. elif compile_mode == "lambda":
  302. PythonParserLambda.__init__(self, debug_parser=debug_parser)
  303. elif compile_mode == "eval":
  304. PythonParserEval.__init__(self, debug_parser=debug_parser)
  305. elif compile_mode == "exec":
  306. PythonParserExec.__init__(self, debug_parser=debug_parser)
  307. elif compile_mode == "eval_expr":
  308. PythonParserEval.__init__(self, debug_parser=debug_parser)
  309. else:
  310. raise BaseException(
  311. f'compile_mode should be either "exec", "single", "lambda", or "eval_expr"; got {compile_mode}'
  312. )
  313. # FIXME: customize per python parser version
  314. # These are the non-terminals we should collect into a list.
  315. # For example instead of:
  316. # stmts -> stmts stmt -> stmts stmt stmt ...
  317. # collect as stmts -> stmt stmt ...
  318. nt_list = [
  319. "_stmts",
  320. "and_parts",
  321. "attributes",
  322. "except_stmts",
  323. "exprlist",
  324. "importlist",
  325. "kvlist",
  326. "kwargs",
  327. "or_parts",
  328. # FIXME:
  329. # If we add c_stmts, we can miss adding a c_stmt,
  330. # test_float.py test_set_format() is an example.
  331. # Investigate
  332. # "c_stmts",
  333. "stmts",
  334. # Python 3.7+
  335. "importlist37",
  336. ]
  337. self.collect = frozenset(nt_list)
  338. # For these items we need to keep the 1st epslion reduction since
  339. # the nonterminal name is used in a semantic action.
  340. self.keep_epsilon = frozenset(("kvlist_n", "kvlist"))
  341. # ??? Do we need a debug option to skip eliding singleton reductions?
  342. # Time will tell if it if useful in debugging
  343. # FIXME: optional_nt is a misnomer. It's really about there being a
  344. # singleton reduction that we can simplify. It also happens to be optional
  345. # in its other derivation
  346. self.optional_nt |= frozenset(("suite_stmts", "c_stmts_opt", "stmt", "sstmt"))
  347. # Reduce singleton reductions in these nonterminals:
  348. # FIXME: would love to do expr, sstmts, stmts and
  349. # so on but that would require major changes to the
  350. # semantic actions
  351. self.singleton = frozenset(
  352. ("str", "store", "_stmts", "suite_stmts_opt", "inplace_op")
  353. )
  354. # Instructions filled in from scanner
  355. self.insts = []
  356. # true if we are parsing inside a lambda expression.
  357. # because a lambda expression are written on a single line, certain line-oriented
  358. # statements behave differently
  359. self.is_lambda = False