scanner26.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. # Copyright (c) 2015-2017, 2021-2022, 2024 by Rocky Bernstein
  2. # Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
  3. # Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
  4. #
  5. # This program is free software: you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation, either version 3 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. """
  18. Python 2.6 bytecode scanner
  19. This overlaps Python 2.6's dis module, but it can be run from Python 3 and
  20. other versions of Python. Also, we save token information for later
  21. use in deparsing.
  22. """
  23. import sys
  24. # bytecode verification, verify(), uses JUMP_OPs from here
  25. from xdis import iscode
  26. from xdis.bytecode import _get_const_info
  27. from xdis.opcodes import opcode_26
  28. import uncompyle6.scanners.scanner2 as scan
  29. from uncompyle6.scanner import Token
# Module-level alias so the scanner code below can call intern(...) directly
# when it rewrites an already-created token's kind to "CONTINUE".
intern = sys.intern

# Jump opcode set taken from xdis' opcode_26 table. It is re-exported from
# this module because bytecode verification, verify(), uses JUMP_OPS from here.
JUMP_OPS = opcode_26.JUMP_OPS
  32. class Scanner26(scan.Scanner2):
  33. def __init__(self, show_asm=False):
  34. super(Scanner26, self).__init__((2, 6), show_asm)
  35. # "setup" opcodes
  36. self.setup_ops = frozenset(
  37. [
  38. self.opc.SETUP_EXCEPT,
  39. self.opc.SETUP_FINALLY,
  40. ]
  41. )
  42. return
  43. def ingest(self, co, classname=None, code_objects={}, show_asm=None):
  44. """Create "tokens" the bytecode of an Python code object. Largely these
  45. are the opcode name, but in some cases that has been modified to make parsing
  46. easier.
  47. returning a list of uncompyle6 Token's.
  48. Some transformations are made to assist the deparsing grammar:
  49. - various types of LOAD_CONST's are categorized in terms of what they load
  50. - COME_FROM instructions are added to assist parsing control structures
  51. - operands with stack argument counts or flag masks are appended to the
  52. opcode name, e.g.:
  53. * BUILD_LIST, BUILD_SET
  54. * MAKE_FUNCTION and FUNCTION_CALLS append the number of positional
  55. arguments
  56. - EXTENDED_ARGS instructions are removed
  57. Also, when we encounter certain tokens, we add them to a set
  58. which will cause custom grammar rules. Specifically, variable
  59. arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific
  60. rules for the specific number of arguments they take.
  61. """
  62. if not show_asm:
  63. show_asm = self.show_asm
  64. bytecode = self.build_instructions(co)
  65. # show_asm = 'after'
  66. if show_asm in ("both", "before"):
  67. print("\n# ---- disassembly:")
  68. bytecode.disassemble_bytes(
  69. co.co_code,
  70. varnames=co.co_varnames,
  71. names=co.co_names,
  72. constants=co.co_consts,
  73. cells=bytecode._cell_names,
  74. line_starts=bytecode._linestarts,
  75. asm_format="extended",
  76. )
  77. # Container for tokens
  78. tokens = []
  79. customize = {}
  80. if self.is_pypy:
  81. customize["PyPy"] = 0
  82. codelen = len(self.code)
  83. free, names, varnames = self.unmangle_code_names(co, classname)
  84. self.names = names
  85. # Scan for assertions. Later we will
  86. # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
  87. # 'LOAD_ASSERT' is used in assert statements.
  88. self.load_asserts = set()
  89. for i in self.op_range(0, codelen):
  90. # We need to detect the difference between:
  91. # raise AssertionError
  92. # and
  93. # assert ...
  94. if (
  95. self.code[i] == self.opc.JUMP_IF_TRUE
  96. and i + 4 < codelen
  97. and self.code[i + 3] == self.opc.POP_TOP
  98. and self.code[i + 4] == self.opc.LOAD_GLOBAL
  99. ):
  100. if names[self.get_argument(i + 4)] == "AssertionError":
  101. self.load_asserts.add(i + 4)
  102. jump_targets = self.find_jump_targets(show_asm)
  103. # contains (code, [addrRefToCode])
  104. last_stmt = self.next_stmt[0]
  105. i = self.next_stmt[last_stmt]
  106. replace = {}
  107. while i < codelen - 1:
  108. if self.lines and self.lines[last_stmt].next > i:
  109. # Distinguish "print ..." from "print ...,"
  110. if self.code[last_stmt] == self.opc.PRINT_ITEM:
  111. if self.code[i] == self.opc.PRINT_ITEM:
  112. replace[i] = "PRINT_ITEM_CONT"
  113. elif self.code[i] == self.opc.PRINT_NEWLINE:
  114. replace[i] = "PRINT_NEWLINE_CONT"
  115. last_stmt = i
  116. i = self.next_stmt[i]
  117. extended_arg = 0
  118. i = -1
  119. for offset in self.op_range(0, codelen):
  120. i += 1
  121. op = self.code[offset]
  122. op_name = self.opname[op]
  123. oparg = None
  124. pattr = None
  125. if offset in jump_targets:
  126. jump_idx = 0
  127. # We want to process COME_FROMs to the same offset to be in *descending*
  128. # offset order so we have the larger range or biggest instruction interval
  129. # last. (I think they are sorted in increasing order, but for safety
  130. # we sort them). That way, specific COME_FROM tags will match up
  131. # properly. For example, a "loop" with an "if" nested in it should have the
  132. # "loop" tag last so the grammar rule matches that properly.
  133. last_jump_offset = -1
  134. for jump_offset in sorted(jump_targets[offset], reverse=True):
  135. if jump_offset != last_jump_offset:
  136. tokens.append(
  137. Token(
  138. "COME_FROM",
  139. jump_offset,
  140. repr(jump_offset),
  141. offset="%s_%d" % (offset, jump_idx),
  142. has_arg=True,
  143. )
  144. )
  145. jump_idx += 1
  146. last_jump_offset = jump_offset
  147. elif offset in self.thens:
  148. tokens.append(
  149. Token(
  150. "THEN",
  151. None,
  152. self.thens[offset],
  153. offset="%s_0" % offset,
  154. has_arg=True,
  155. )
  156. )
  157. has_arg = op >= self.opc.HAVE_ARGUMENT
  158. if has_arg:
  159. oparg = self.get_argument(offset) + extended_arg
  160. extended_arg = 0
  161. if op == self.opc.EXTENDED_ARG:
  162. extended_arg += self.extended_arg_val(oparg)
  163. continue
  164. # Note: name used to match on rather than op since
  165. # BUILD_SET isn't in earlier Pythons.
  166. if op_name in (
  167. "BUILD_LIST",
  168. "BUILD_SET",
  169. ):
  170. t = Token(
  171. op_name,
  172. oparg,
  173. pattr,
  174. offset,
  175. self.linestarts.get(offset, None),
  176. op,
  177. has_arg,
  178. self.opc,
  179. )
  180. collection_type = op_name.split("_")[1]
  181. next_tokens = self.bound_collection_from_tokens(
  182. tokens, t, len(tokens), "CONST_%s" % collection_type
  183. )
  184. if next_tokens is not None:
  185. tokens = next_tokens
  186. continue
  187. if op in self.opc.CONST_OPS:
  188. const = co.co_consts[oparg]
  189. if iscode(const):
  190. oparg = const
  191. if const.co_name == "<lambda>":
  192. assert op_name == "LOAD_CONST"
  193. op_name = "LOAD_LAMBDA"
  194. elif const.co_name == self.genexpr_name:
  195. op_name = "LOAD_GENEXPR"
  196. elif const.co_name == "<dictcomp>":
  197. op_name = "LOAD_DICTCOMP"
  198. elif const.co_name == "<setcomp>":
  199. op_name = "LOAD_SETCOMP"
  200. else:
  201. op_name = "LOAD_CODE"
  202. # verify() uses 'pattr' for comparison, since 'attr'
  203. # now holds Code(const) and thus can not be used
  204. # for comparison (todo: think about changing this)
  205. # pattr = 'code_object @ 0x%x %s->%s' %\
  206. # (id(const), const.co_filename, const.co_name)
  207. pattr = "<code_object " + const.co_name + ">"
  208. else:
  209. if oparg < len(co.co_consts):
  210. argval, _ = _get_const_info(oparg, co.co_consts)
  211. # Why don't we use _ above for "pattr" rather than "const"?
  212. # This *is* a little hoaky, but we have to coordinate with
  213. # other parts like n_LOAD_CONST in pysource.py for example.
  214. pattr = const
  215. pass
  216. elif op in self.opc.NAME_OPS:
  217. pattr = names[oparg]
  218. elif op in self.opc.JREL_OPS:
  219. pattr = repr(offset + 3 + oparg)
  220. if op == self.opc.JUMP_FORWARD:
  221. target = self.get_target(offset)
  222. # FIXME: this is a hack to catch stuff like:
  223. # if x: continue
  224. # the "continue" is not on a new line.
  225. if len(tokens) and tokens[-1].kind == "JUMP_BACK":
  226. tokens[-1].kind = intern("CONTINUE")
  227. elif op in self.opc.JABS_OPS:
  228. pattr = repr(oparg)
  229. elif op in self.opc.LOCAL_OPS:
  230. if self.version < (1, 5):
  231. pattr = names[oparg]
  232. else:
  233. pattr = varnames[oparg]
  234. elif op in self.opc.COMPARE_OPS:
  235. pattr = self.opc.cmp_op[oparg]
  236. elif op in self.opc.FREE_OPS:
  237. pattr = free[oparg]
  238. if op in self.varargs_ops:
  239. # CE - Hack for >= 2.5
  240. # Now all values loaded via LOAD_CLOSURE are packed into
  241. # a tuple before calling MAKE_CLOSURE.
  242. if (
  243. self.version >= (2, 5)
  244. and op == self.opc.BUILD_TUPLE
  245. and self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE
  246. ):
  247. continue
  248. else:
  249. op_name = "%s_%d" % (op_name, oparg)
  250. customize[op_name] = oparg
  251. elif self.version > (2, 0) and op == self.opc.CONTINUE_LOOP:
  252. customize[op_name] = 0
  253. elif (
  254. op_name
  255. in """
  256. CONTINUE_LOOP EXEC_STMT LOAD_LISTCOMP LOAD_SETCOMP
  257. """.split()
  258. ):
  259. customize[op_name] = 0
  260. elif op == self.opc.JUMP_ABSOLUTE:
  261. # Further classify JUMP_ABSOLUTE into backward jumps
  262. # which are used in loops, and "CONTINUE" jumps which
  263. # may appear in a "continue" statement. The loop-type
  264. # and continue-type jumps will help us classify loop
  265. # boundaries The continue-type jumps help us get
  266. # "continue" statements with would otherwise be turned
  267. # into a "pass" statement because JUMPs are sometimes
  268. # ignored in rules as just boundary overhead. In
  269. # comprehensions we might sometimes classify JUMP_BACK
  270. # as CONTINUE, but that's okay since we add a grammar
  271. # rule for that.
  272. target = self.get_target(offset)
  273. if target <= offset:
  274. op_name = "JUMP_BACK"
  275. if offset in self.stmts and self.code[offset + 3] not in (
  276. self.opc.END_FINALLY,
  277. self.opc.POP_BLOCK,
  278. ):
  279. if (
  280. offset in self.linestarts and tokens[-1].kind == "JUMP_BACK"
  281. ) or offset not in self.not_continue:
  282. op_name = "CONTINUE"
  283. else:
  284. # FIXME: this is a hack to catch stuff like:
  285. # if x: continue
  286. # the "continue" is not on a new line.
  287. if tokens[-1].kind == "JUMP_BACK":
  288. # We need 'intern' since we have
  289. # already have processed the previous
  290. # token.
  291. tokens[-1].kind = intern("CONTINUE")
  292. elif op == self.opc.LOAD_GLOBAL:
  293. if offset in self.load_asserts:
  294. op_name = "LOAD_ASSERT"
  295. elif op == self.opc.RETURN_VALUE:
  296. if offset in self.return_end_ifs:
  297. op_name = "RETURN_END_IF"
  298. linestart = self.linestarts.get(offset, None)
  299. if offset not in replace:
  300. tokens.append(
  301. Token(
  302. op_name, oparg, pattr, offset, linestart, op, has_arg, self.opc
  303. )
  304. )
  305. else:
  306. tokens.append(
  307. Token(
  308. replace[offset],
  309. oparg,
  310. pattr,
  311. offset,
  312. linestart,
  313. op,
  314. has_arg,
  315. self.opc,
  316. )
  317. )
  318. pass
  319. pass
  320. if show_asm in ("both", "after"):
  321. print("\n# ---- tokenization:")
  322. # FIXME: t.format() is changing tokens!
  323. for t in tokens.copy():
  324. print(t.format(line_prefix=""))
  325. print()
  326. return tokens, customize
  327. if __name__ == "__main__":
  328. from xdis.version_info import PYTHON_VERSION_TRIPLE, version_tuple_to_str
  329. if PYTHON_VERSION_TRIPLE[:2] == (2, 6):
  330. import inspect
  331. co = inspect.currentframe().f_code # type: ignore
  332. tokens, customize = Scanner26().ingest(co)
  333. for t in tokens:
  334. print(t.format())
  335. pass
  336. else:
  337. print("Need to be Python 2.6 to demo; I am version %s" % version_tuple_to_str())