scanner37base.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105
  1. # Copyright (c) 2015-2020, 2022-2024 by Rocky Bernstein
  2. # Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
  3. # Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
  4. #
  5. # This program is free software: you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation, either version 3 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. """
  18. Python 3.7 bytecode scanner/deparser base.
  19. Also, we *modify* the instruction sequence to assist deparsing code.
  20. For example:
  21. - we add "COME_FROM" instructions to help in figuring out
  22. conditional branching and looping.
  23. - LOAD_CONSTs are classified further into the type of thing
  24. they load:
  25. lambda's, genexpr's, {dict,set,list} comprehension's,
  26. - PARAMETER counts appended {CALL,MAKE}_FUNCTION, BUILD_{TUPLE,SET,SLICE}
  27. Finally we save token information.
  28. """
  29. import sys
  30. from typing import Any, Dict, List, Set, Tuple
  31. import xdis
  32. # Get all the opcodes into globals
  33. import xdis.opcodes.opcode_37 as op3
  34. from xdis import Instruction, instruction_size, iscode
  35. from xdis.bytecode import _get_const_info
  36. from decompyle3.scanner import Scanner, Token
  # Copy every entry of the 3.7 opcode map (op3.opmap) into this module's
  # global namespace.
  globals().update(op3.opmap)

  # Kinds of constant collection recognized by
  # Scanner37Base.bound_collection_from_tokens(); a kind's position in this
  # tuple is used as the "attr" of the COLLECTION_START token it emits.
  CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")
  39. class Scanner37Base(Scanner):
  def __init__(
      self, version: Tuple[int, int], show_asm=None, debug="", is_pypy=False
  ):
      """Initialize the scanner and build its opcode classification sets.

      version:  (major, minor) bytecode version; it selects between the
                pre-3.8 and the 3.8+ collection of SETUP_ opcodes.
      show_asm: passed on to the base-class initializer.
      debug:    saved as ``self.debug``.
      is_pypy:  passed on to the base-class initializer and saved as
                ``self.is_pypy``.
      """
      # Note: the base-class initialization is what sets up self.opc.
      super().__init__(version, show_asm, is_pypy)
      opc = self.opc

      self.debug = debug
      self.is_pypy = is_pypy  # True if the code is from PyPy
      self.insts = []  # Bytecode converted into instructions
      self.offset2tok_index = None

      # --- Opcode classification sets ---

      # Ops that start with SETUP_ ...; we will COME_FROM with these names.
      # Some blocks and END_ statements. And they can start a new statement.
      if self.version < (3, 8):
          block_setups = [opc.SETUP_LOOP, opc.SETUP_EXCEPT, opc.SETUP_FINALLY]
          self.setup_ops_no_loop = frozenset(block_setups) - frozenset(
              [opc.SETUP_LOOP]
          )
      else:
          block_setups = [opc.SETUP_FINALLY]
          self.setup_ops_no_loop = frozenset(block_setups)

          # Add back these opcodes, which help us detect "break" and
          # "continue" statements via parsing.
          opc.BREAK_LOOP = 80
          opc.CONTINUE_LOOP = 119

      # SETUP_WITH belongs to setup_ops only, not to setup_ops_no_loop.
      self.setup_ops = frozenset(block_setups + [opc.SETUP_WITH])

      self.pop_jump_tf = frozenset([opc.PJIF, opc.PJIT])
      self.not_continue_follow = ("END_FINALLY", "POP_BLOCK")

      # Opcodes that can start a statement.
      self.statement_opcodes = self.setup_ops_no_loop | frozenset(
          (
              opc.POP_BLOCK,
              opc.STORE_FAST,
              opc.DELETE_FAST,
              opc.STORE_DEREF,
              opc.STORE_GLOBAL,
              opc.DELETE_GLOBAL,
              opc.STORE_NAME,
              opc.DELETE_NAME,
              opc.STORE_ATTR,
              opc.DELETE_ATTR,
              opc.STORE_SUBSCR,
              opc.POP_TOP,
              opc.DELETE_SUBSCR,
              opc.END_FINALLY,
              opc.RETURN_VALUE,
              opc.RAISE_VARARGS,
              opc.PRINT_EXPR,
              opc.JUMP_ABSOLUTE,
              # These are phony for 3.8+
              opc.BREAK_LOOP,
              opc.CONTINUE_LOOP,
          )
      )

      # Opcodes that can start a "store" non-terminal.
      # FIXME: JUMP_ABSOLUTE is weird. What's up with that?
      self.designator_ops = frozenset(
          (
              opc.STORE_FAST,
              opc.STORE_NAME,
              opc.STORE_GLOBAL,
              opc.STORE_DEREF,
              opc.STORE_ATTR,
              opc.STORE_SUBSCR,
              opc.UNPACK_SEQUENCE,
              opc.JUMP_ABSOLUTE,
              opc.UNPACK_EX,
          )
      )

      self.jump_if_pop = frozenset(
          (opc.JUMP_IF_FALSE_OR_POP, opc.JUMP_IF_TRUE_OR_POP)
      )
      # The two "or pop" jumps plus the two "pop and jump" conditionals.
      self.pop_jump_if_pop = self.jump_if_pop | frozenset(
          (opc.POP_JUMP_IF_TRUE, opc.POP_JUMP_IF_FALSE)
      )

      # Not really a set, but still classification-like: each conditional
      # jump immediately followed by each unconditional jump.
      self.statement_opcode_sequences = [
          (conditional_jump, plain_jump)
          for conditional_jump in (opc.POP_JUMP_IF_FALSE, opc.POP_JUMP_IF_TRUE)
          for plain_jump in (opc.JUMP_FORWARD, opc.JUMP_ABSOLUTE)
      ]

      # Below is in bit order: "default" = bit 0, "closure" = bit 3.
      self.MAKE_FUNCTION_FLAGS = (
          "default",
          "keyword-only",
          "annotation",
          "closure",
      )

      # Opcodes that take a variable number of arguments (expr's).
      # FIXME: remove this in favor of info from xdis:
      #    self.varargs_ops = frozenset(self.opc.hasvargs)
      self.varargs_ops = frozenset(
          (
              opc.BUILD_LIST,
              opc.BUILD_TUPLE,
              opc.BUILD_SET,
              opc.BUILD_SLICE,
              opc.BUILD_MAP,
              opc.UNPACK_SEQUENCE,
              opc.RAISE_VARARGS,
              opc.CALL_METHOD,
              opc.BUILD_SET_UNPACK,
              opc.BUILD_MAP_UNPACK,
              opc.BUILD_LIST_UNPACK,
              opc.BUILD_TUPLE_UNPACK,
              opc.BUILD_CONST_KEY_MAP,
          )
      )
  def bound_collection_from_tokens(
      self, tokens: list, next_tokens: list, t: Token, i: int, collection_type: str
  ):
      """Try to rewrite the operands of a collection-building token as one
      delimited block: COLLECTION_START, an ADD_VALUE per operand, and a
      final BUILD_<collection_type> token.

      tokens:          the token list being examined; ``tokens[i]`` is ``t``.
      next_tokens:     the output list built so far. Its last ``count``
                       entries are dropped and replaced when the rewrite
                       happens (presumably they mirror the operand tokens
                       just before ``t`` -- verify against the caller).
      t:               the collection-building token; ``t.attr`` is its
                       operand count.
      i:               index of ``t`` in ``tokens``.
      collection_type: one of the names in CONST_COLLECTIONS.

      Returns a new token list. When the collection has fewer than 5
      operands, when an operand is not a simple load, or when there are not
      enough tokens before ``t``, the result is ``next_tokens + [t]``.
      """
      count = t.attr
      assert isinstance(count, int)

      assert count <= i

      if collection_type == "CONST_DICT":
          # constant dictionaries work via BUILD_CONST_KEY_MAP and
          # handle the values() like sets and lists.
          # However, the keys() are a LOAD_CONST of the keys.
          # Adjust the count to account for this.
          count += 1

      # For small lists don't bother
      if count < 5:
          return next_tokens + [t]

      collection_start = i - count
      if collection_start < 0:
          # The CONST_DICT adjustment above happens after the "count <= i"
          # check, so the start index can become negative. A negative start
          # would make the scan below index from the *end* of "tokens";
          # leave the token stream as it is instead.
          return next_tokens + [t]

      simple_loads = (
          "LOAD_CODE",
          "LOAD_CONST",
          "LOAD_FAST",
          "LOAD_GLOBAL",
          "LOAD_NAME",
          "LOAD_STR",
      )
      if any(
          tokens[j].kind not in simple_loads for j in range(collection_start, i)
      ):
          return next_tokens + [t]

      collection_enum = CONST_COLLECTIONS.index(collection_type)

      # If we get here, every operand token before tokens[i] is one of the
      # simple loads above, so we can add a boundary marker and turn each
      # load into an ADD_VALUE.
      new_tokens = next_tokens[:-count]
      start_offset = tokens[collection_start].offset
      new_tokens.append(
          Token(
              opname="COLLECTION_START",
              attr=collection_enum,
              pattr=collection_type,
              offset=f"{start_offset}_0",
              has_arg=True,
              opc=self.opc,
              has_extended_arg=False,
              optype=None,
          )
      )
      for j in range(collection_start, i):
          new_tokens.append(
              Token(
                  opname="ADD_VALUE",
                  attr=tokens[j].attr,
                  pattr=tokens[j].pattr,
                  offset=tokens[j].offset,
                  has_arg=True,
                  linestart=tokens[j].linestart,
                  opc=self.opc,
                  has_extended_arg=False,
                  optype=tokens[j].optype,
              )
          )
      new_tokens.append(
          Token(
              opname=f"BUILD_{collection_type}",
              attr=t.attr,
              pattr=t.pattr,
              offset=t.offset,
              has_arg=t.has_arg,
              linestart=t.linestart,
              opc=t.opc,
              has_extended_arg=False,
          )
      )
      return new_tokens
  def ingest(self, co, classname=None, code_objects=None, show_asm=None):
      """Create "tokens" from the bytecode of a Python code object. Largely
      these are the opcode name, but in some cases that has been modified to
      make parsing easier.

      Returns a pair: the list of decompyle3 Token's and a "customize"
      dictionary.

      co:           the code object to tokenize.
      classname:    not used by this method.
      code_objects: not used by this method.
      show_asm:     "before", "after" or "both" to print the disassembly
                    and/or the tokenization; a false value falls back to
                    self.show_asm.

      Some transformations are made to assist the deparsing grammar:
         -  various types of LOAD_CONST's are categorized in terms of what
            they load
         -  COME_FROM instructions are added to assist parsing control
            structures
         -  operands with stack argument counts or flag masks are appended
            to the opcode name, e.g.:
              * BUILD_LIST, BUILD_SET
              * MAKE_FUNCTION and FUNCTION_CALLS append the number of
                positional arguments
         -  EXTENDED_ARGS instructions are removed

      Also, when we encounter certain tokens, we add them to a set
      which will cause custom grammar rules. Specifically, variable
      arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific
      rules for the specific number of arguments they take.
      """

      def tokens_append(j, token):
          # Append "token", record its index under its offset, and return
          # the next free index.
          tokens.append(token)
          self.offset2tok_index[token.offset] = j
          j += 1
          assert j == len(tokens)
          return j

      if not show_asm:
          show_asm = self.show_asm

      bytecode = self.build_instructions(co)

      if show_asm in ("both", "before"):
          print("\n# ---- disassembly:")
          bytecode.disassemble_bytes(
              co.co_code,
              varnames=co.co_varnames,
              names=co.co_names,
              constants=co.co_consts,
              cells=bytecode._cell_names,
              linestarts=bytecode._linestarts,
              asm_format="extended",
              filename=co.co_filename,
              show_source=True,
              first_line_number=co.co_firstlineno,
          )

      # "customize" is in the process of going away here
      customize = {}

      if self.is_pypy:
          customize["PyPy"] = 0

      # Scan for assertions. Later we will
      # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
      # 'LOAD_ASSERT' is used in assert statements.
      self.load_asserts = set()

      # list of tokens/instructions
      tokens = []
      self.offset2tok_index = {}

      n = len(self.insts)
      for i, inst in enumerate(self.insts):
          # We need to detect the difference between:
          #   raise AssertionError
          #  and
          #   assert ...
          # If we have:
          #   POP_JUMP_IF_...
          #   LOAD_GLOBAL AssertionError
          # and the instruction just before the jump target is a
          #   RAISE_VARARGS
          # then we have an "assert" statement and note the LOAD_GLOBAL;
          # otherwise it is left as an ordinary "raise".
          assert_can_follow = inst.opname.startswith("POP_JUMP_IF_") and i + 2 < n
          if assert_can_follow:
              load_global_inst = self.insts[i + 1]
              if (
                  load_global_inst.opname == "LOAD_GLOBAL"
                  and load_global_inst.argval == "AssertionError"
                  and inst.argval is not None
              ):
                  raise_inst = self.get_inst(self.prev_op[inst.argval])
                  if raise_inst.opname.startswith("RAISE_VARARGS"):
                      self.load_asserts.add(load_global_inst.offset)

      # Operand values in Python wordcode are small. As a result,
      # there are these EXTENDED_ARG instructions - way more than
      # before 3.6. These make parsing a lot of pain.

      last_continue = None

      # To simplify things we want to untangle this. We also
      # do this loop before we compute jump targets.
      for i, inst in enumerate(self.insts):
          # One artifact of the "too-small" operand problem, is that
          # some backward jumps, are turned into forward jumps to another
          # "extended arg" backward jump to the same location.
          if inst.opname == "JUMP_FORWARD":
              jump_inst = self.get_inst(inst.argval)
              if jump_inst.has_extended_arg and jump_inst.opname.startswith("JUMP"):
                  # Create a combination of the jump-to instruction and
                  # this one. Keep the position information of this instruction,
                  # but the operator and operand properties come from the other
                  # instruction
                  self.insts[i] = Instruction(
                      is_jump_target=inst.is_jump_target,
                      starts_line=inst.starts_line,
                      offset=inst.offset,
                      opname=jump_inst.opname,
                      opcode=jump_inst.opcode,
                      has_arg=jump_inst.has_arg,
                      arg=jump_inst.arg,
                      argval=jump_inst.argval,
                      argrepr=jump_inst.argrepr,
                      tos_str=None,
                      positions=None,
                      optype=jump_inst.optype,
                      inst_size=jump_inst.inst_size,
                      has_extended_arg=inst.has_extended_arg,
                      start_offset=None,
                  )

      # Get jump targets
      # Format: {target offset: [jump offsets]}
      jump_targets = self.find_jump_targets(show_asm)
      # print("XXX2", jump_targets)

      last_op_was_break = False

      # "j" is the index the next appended token will get.
      j = 0
      for i, inst in enumerate(self.insts):
          argval = inst.argval
          op = inst.opcode

          if inst.opname == "EXTENDED_ARG":
              # FIXME: The EXTENDED_ARG is used to signal annotation
              # parameters
              if i + 1 < n and self.insts[i + 1].opcode != self.opc.MAKE_FUNCTION:
                  continue

          if inst.offset in jump_targets:
              jump_idx = 0
              # We want to process COME_FROMs to the same offset to be in
              # *descending* offset order, so we have the larger range or
              # biggest instruction interval last. (I think they are sorted
              # in increasing order, but for safety we sort them). That way,
              # specific COME_FROM tags will match up properly. For example,
              # a "loop" with an "if" nested in it should have the "loop" tag
              # last so the grammar rule matches that properly.
              for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                  come_from_name = "COME_FROM"
                  opname = self.opname_for_offset(jump_offset)
                  if opname == "EXTENDED_ARG":
                      k = xdis.next_offset(op, self.opc, jump_offset)
                      opname = self.opname_for_offset(k)

                  if opname.startswith("SETUP_"):
                      come_from_type = opname[len("SETUP_") :]
                      come_from_name = "COME_FROM_%s" % come_from_type
                  elif inst.offset in self.except_targets:
                      come_from_name = "COME_FROM_EXCEPT_CLAUSE"
                  j = tokens_append(
                      j,
                      Token(
                          opname=come_from_name,
                          attr=jump_offset,
                          pattr=repr(jump_offset),
                          offset="%s_%s" % (inst.offset, jump_idx),
                          has_arg=True,
                          opc=self.opc,
                          has_extended_arg=False,
                          optype=inst.optype,
                      ),
                  )
                  jump_idx += 1

          pattr = inst.argrepr
          opname = inst.opname

          if op in self.opc.CONST_OPS:
              const = argval
              if iscode(const):
                  if const.co_name == "<lambda>":
                      assert opname == "LOAD_CONST"
                      opname = "LOAD_LAMBDA"
                  elif const.co_name == "<genexpr>":
                      opname = "LOAD_GENEXPR"
                  elif const.co_name == "<dictcomp>":
                      opname = "LOAD_DICTCOMP"
                  elif const.co_name == "<setcomp>":
                      opname = "LOAD_SETCOMP"
                  elif const.co_name == "<listcomp>":
                      opname = "LOAD_LISTCOMP"
                  else:
                      opname = "LOAD_CODE"
                  # verify() uses 'pattr' for comparison, since 'attr'
                  # now holds Code(const) and thus can not be used
                  # for comparison (todo: think about changing this)
                  # pattr = 'code_object @ 0x%x %s->%s' %\
                  # (id(const), const.co_filename, const.co_name)
                  pattr = "<code_object " + const.co_name + ">"
              elif isinstance(const, str):
                  opname = "LOAD_STR"
              else:
                  if isinstance(inst.arg, int) and inst.arg < len(co.co_consts):
                      argval, _ = _get_const_info(inst.arg, co.co_consts)
                  # Why don't we use _ above for "pattr" rather than "const"?
                  # This *is* a little hoaky, but we have to coordinate with
                  # other parts like n_LOAD_CONST in pysource.py for example.
                  pattr = const
          elif opname == "IMPORT_NAME":
              if "." in inst.argval:
                  opname = "IMPORT_NAME_ATTR"
          elif opname == "LOAD_FAST" and argval == ".0":
              # Used as the parameter of a list expression
              opname = "LOAD_ARG"
          elif opname in ("MAKE_FUNCTION", "MAKE_CLOSURE"):
              flags = argval
              # FIXME: generalize this
              if flags == 8:
                  opname = "MAKE_FUNCTION_CLOSURE"
              elif flags == 9:
                  opname = "MAKE_FUNCTION_CLOSURE_POS"
              else:
                  opname = f"MAKE_FUNCTION_{flags}"
              # Split the flag mask into one bit per MAKE_FUNCTION_FLAGS entry,
              # lowest bit first.
              attr = []
              for flag in self.MAKE_FUNCTION_FLAGS:
                  bit = flags & 1
                  attr.append(bit)
                  flags >>= 1
              attr = attr[:4]  # keep at most the first four flag bits
              j = tokens_append(
                  j,
                  Token(
                      opname=opname,
                      attr=attr,
                      pattr=pattr,
                      offset=inst.offset,
                      linestart=inst.starts_line,
                      op=op,
                      has_arg=inst.has_arg,
                      opc=self.opc,
                      has_extended_arg=inst.has_extended_arg,
                      optype=inst.optype,
                  ),
              )
              continue
          elif op in self.varargs_ops:
              pos_args = argval
              if self.is_pypy and not pos_args and opname == "BUILD_MAP":
                  opname = "BUILD_MAP_n"
              else:
                  opname = "%s_%d" % (opname, pos_args)

          elif self.is_pypy and opname == "JUMP_IF_NOT_DEBUG":
              # The value in the dict is in special cases in semantic actions, such
              # as JUMP_IF_NOT_DEBUG. The value is not used in these cases, so we put
              # in arbitrary value 0.
              customize[opname] = 0
          elif opname == "UNPACK_EX":
              # FIXME: try with scanner and parser by
              # changing argval
              before_args = argval & 0xFF
              after_args = (argval >> 8) & 0xFF
              pattr = "%d before vararg, %d after" % (before_args, after_args)
              argval = (before_args, after_args)
              opname = "%s_%d+%d" % (opname, before_args, after_args)

          elif op == self.opc.JUMP_ABSOLUTE:
              # Refine JUMP_ABSOLUTE further into:
              #
              # * "JUMP_LOOP"    - which are used in loops. This is sometimes
              #                   found at the end of a looping construct
              # * "BREAK_LOOP"  - which are used to break loops.
              # * "CONTINUE"    - jumps which may appear in a "continue" statement.
              #                   It is okay to confuse this with JUMP_LOOP. The
              #                   grammar should tolerate this.
              # * "JUMP_FORWARD - forward jumps that are not BREAK_LOOP jumps.
              #
              # The loop-type and continue-type jumps will help us
              # classify loop boundaries. The continue-type jumps
              # help us get "continue" statements which would
              # otherwise be turned into a "pass" statement because
              # JUMPs are sometimes ignored in rules as just
              # boundary overhead. Again, in comprehensions we might
              # sometimes classify JUMP_LOOP as CONTINUE, but that's
              # okay since grammar rules should tolerate that.
              pattr = argval
              target = inst.argval
              if target <= inst.offset:
                  next_opname = self.insts[i + 1].opname

                  # 'Continue's include jumps to loops that are not
                  # at the end of a block which follow with
                  # POP_BLOCK and COME_FROM_LOOP.  If the
                  # JUMP_ABSOLUTE is to a FOR_ITER, and it is
                  # followed by another JUMP_FORWARD then we'll take
                  # it as a "continue".
                  next_inst = self.insts[i + 1]
                  is_continue = self.insts[
                      self.offset2inst_index[target]
                  ].opname == "FOR_ITER" and next_inst.opname in (
                      "JUMP_FORWARD",
                      "JUMP_ABSOLUTE",
                  )

                  if self.version < (3, 8) and (
                      is_continue
                      or (
                          inst.offset in self.stmts
                          and (
                              inst.starts_line
                              and next_opname not in self.not_continue_follow
                          )
                      )
                  ):
                      opname = "CONTINUE"
                  else:
                      # "continue" versus "break_loop" detection is more complicated
                      # because "continue" to an outer loop is really a "break loop"
                      opname = "JUMP_LOOP"

                      # FIXME: this is a hack to catch stuff like:
                      #   if x: continue
                      # the "continue" is not on a new line.
                      #
                      # Another situation is where we have
                      #   for method in methods:
                      #      for B in method:
                      #         if c:
                      #            return
                      #      break  # A "continue" but not the innermost one
                      if tokens[-1].kind == "JUMP_LOOP" and tokens[-1].attr <= argval:
                          if tokens[-2].kind == "BREAK_LOOP":
                              del tokens[-1]
                              j -= 1
                          else:
                              # "intern" is used because we are
                              # changing the *previous* token.  A
                              # POP_TOP suggests a "break" rather
                              # than a "continue"?
                              if tokens[-2] == "POP_TOP" and (
                                  is_continue and next_inst.argval != tokens[-1].attr
                              ):
                                  tokens[-1].kind = sys.intern("BREAK_LOOP")
                              else:
                                  tokens[-1].kind = sys.intern("CONTINUE")
                                  last_continue = tokens[-1]
                      elif (
                          last_continue is not None
                          and tokens[-1].kind == "JUMP_LOOP"
                          and last_continue.attr <= tokens[-1].attr
                          and last_continue.offset > tokens[-1].attr
                      ):
                          # Handle mis-characterized "CONTINUE"
                          # We have a situation like:
                          #   loop ... (for or while)
                          #      loop
                          #         if ...:  # code below starts here
                          #            break  # not continue
                          #
                          # POP_JUMP_IF_FALSE_LOOP   # to outer loop
                          # JUMP_LOOP                # to inner loop
                          # ...
                          # JUMP_LOOP                # to outer loop
                          tokens[-2].kind = sys.intern("BREAK_LOOP")

                  if last_op_was_break and opname == "CONTINUE":
                      # Drop a CONTINUE that directly follows a BREAK_LOOP.
                      last_op_was_break = False
                      continue
              else:
                  opname = "JUMP_FORWARD"

          elif opname.startswith("POP_JUMP_IF_") and not inst.jumps_forward():
              opname += "_LOOP"
          elif inst.offset in self.load_asserts:
              opname = "LOAD_ASSERT"

          last_op_was_break = opname == "BREAK_LOOP"
          j = tokens_append(
              j,
              Token(
                  opname=opname,
                  attr=argval,
                  pattr=pattr,
                  offset=inst.offset,
                  linestart=inst.starts_line,
                  op=op,
                  has_arg=inst.has_arg,
                  opc=self.opc,
                  has_extended_arg=inst.has_extended_arg,
                  tos_str=inst.tos_str,
                  start_offset=inst.start_offset,
              ),
          )
          if opname == "CONTINUE":
              last_continue = tokens[-1]

      if show_asm in ("both", "after") and self.version < (3, 8):
          print("\n# ---- tokenization:")
          # FIXME: t.format() is changing tokens!
          for t in tokens.copy():
              print(t.format(line_prefix=""))
          print()
      return tokens, customize
  def find_jump_targets(self, debug: str) -> dict:
      """
      Detect all offsets in a byte code which are jump targets
      where we might insert a COME_FROM instruction.

      Returns a dictionary that maps each target offset to the list of
      offsets that jump to it; an instruction can be jumped to from
      multiple instructions.

      The per-code-object bookkeeping attributes (structs, loops,
      fixed_jumps, except_targets, ignore_if, not_continue, return_end_ifs,
      setup_loop_targets, setup_loops) are reset here before the scan.
      When "debug" is "both" or "after", self.structs is pretty-printed.
      """
      bytecode_len = len(self.code)
      self.structs = [{"type": "root", "start": 0, "end": bytecode_len - 1}]

      # All loop entry points
      self.loops: List[int] = []

      # Map fixed jumps to their real destination
      self.fixed_jumps: Dict[int, int] = {}
      self.except_targets = {}
      self.ignore_if: Set[int] = set()
      self.build_statement_indices()

      # Containers filled by detect_control_flow()
      self.not_continue: Set[int] = set()
      self.return_end_ifs: Set[int] = set()
      self.setup_loop_targets = {}  # target given setup_loop offset
      self.setup_loops = {}  # setup_loop offset given target

      come_froms = {}
      for inst_index, inst in enumerate(self.insts):
          offset = inst.offset
          opcode = inst.opcode

          # FIXME: this code is going to get removed.
          # Determine structures and fix jumps in Python versions
          # since 2.3
          self.detect_control_flow(offset, inst_index)

          if inst.has_arg:
              # FIXME: fix grammar so we don't have to exclude FOR_ITER
              if inst.is_jump() and opcode != self.opc.FOR_ITER:
                  label = inst.argval
              else:
                  label = self.fixed_jumps.get(offset)
              if label is None or label == -1:
                  continue
          elif opcode == self.opc.END_FINALLY and offset in self.fixed_jumps:
              label = self.fixed_jumps[offset]
          else:
              continue

          # Record this instruction as one of the jumps that reach "label".
          come_froms.setdefault(label, []).append(offset)

      # DEBUG:
      if debug in ("both", "after"):
          from pprint import pprint

          pprint(self.structs)

      return come_froms
  def build_statement_indices(self):
      """Work out which bytecode offsets start a statement.

      Sets two attributes:
        self.stmts     - set of offsets taken to start a statement
        self.next_stmt - list indexed by offset; each entry is the offset of
                         a statement start at or after that offset, or
                         len(self.code) past the last statement.
      """
      code = self.code
      start = 0
      end = codelen = len(code)

      # Compose preliminary list of indices with statements,
      # using plain statement opcodes
      prelim = self.inst_matches(start, end, self.statement_opcodes)

      # Initialize final container with statements with
      # preliminary data
      stmts = self.stmts = set(prelim)

      # Same for opcode sequences
      pass_stmts = set()
      for sequence in self.statement_opcode_sequences:
          for i in self.op_range(start, end - (len(sequence) + 1)):
              match = True
              # Walk the bytecode forward from i, comparing one opcode per
              # element of the sequence.
              for elem in sequence:
                  if elem != code[i]:
                      match = False
                      break
                  i += instruction_size(code[i], self.opc)

              if match is True:
                  # i is now just past the matched sequence; step back to
                  # its last instruction and record that as a statement.
                  i = self.prev_op[i]
                  stmts.add(i)
                  pass_stmts.add(i)

      # Initialize statement list with the full data we've gathered so far
      if pass_stmts:
          stmt_offset_list = list(stmts)
          stmt_offset_list.sort()
      else:
          stmt_offset_list = prelim
      # 'List-map' which contains offset of start of
      # next statement, when op offset is passed as index
      self.next_stmt = slist = []
      last_stmt_offset = -1
      i = 0
      # Go through all statement offsets
      for stmt_offset in stmt_offset_list:
          # Process absolute jumps, but do not remove 'pass' statements
          # from the set
          if (
              code[stmt_offset] == self.opc.JUMP_ABSOLUTE
              and stmt_offset not in pass_stmts
          ):
              # If absolute jump occurs in forward direction or it takes off from the
              # same line as previous statement, this is not a statement
              # FIXME: 0 isn't always correct
              # NOTE(review): until a statement has been kept,
              # last_stmt_offset is -1, so self.lines[-1] is consulted here
              # -- confirm that is intended.
              target = self.get_target(stmt_offset)
              if (
                  target > stmt_offset
                  or self.lines[last_stmt_offset].l_no == self.lines[stmt_offset].l_no
              ):
                  stmts.remove(stmt_offset)
                  continue
              # Scan back bytecode ops till we encounter non-JUMP_ABSOLUTE op
              j = self.prev_op[stmt_offset]
              while code[j] == self.opc.JUMP_ABSOLUTE and j > 0:
                  j = self.prev_op[j]
              # If we got here, then it's list comprehension which
              # is not a statement too
              if code[j] == self.opc.LIST_APPEND:
                  stmts.remove(stmt_offset)
                  continue
          # Exclude ROT_TWO + POP_TOP
          elif (
              code[stmt_offset] == self.opc.POP_TOP
              and code[self.prev_op[stmt_offset]] == self.opc.ROT_TWO
          ):
              stmts.remove(stmt_offset)
              continue
          # Exclude FOR_ITER + designators
          elif code[stmt_offset] in self.designator_ops:
              # Skip back over any run of designator ops to see whether the
              # run is preceded by a FOR_ITER.
              j = self.prev_op[stmt_offset]
              while code[j] in self.designator_ops:
                  j = self.prev_op[j]
              if code[j] == self.opc.FOR_ITER:
                  stmts.remove(stmt_offset)
                  continue
          # Add to list another list with offset of current statement,
          # equal to length of previous statement
          slist += [stmt_offset] * (stmt_offset - i)
          last_stmt_offset = stmt_offset
          i = stmt_offset
      # Finish filling the list for last statement
      slist += [codelen] * (codelen - len(slist))
  def detect_control_flow(self, offset: int, inst_index: int) -> None:
      """
      Detect type of block structures and their boundaries to fix optimized jumps
      in python2.3+

      Depending on the opcode of ``self.insts[inst_index]``, this updates
      scanner state on ``self``: ``fixed_jumps``, ``structs``, ``loops``,
      ``setup_loops``, ``ignore_if``, ``not_continue``, ``except_targets``
      and ``return_end_ifs``.  Nothing is returned.

      :param offset: bytecode offset used to index ``self.code``/``self.lines``
                     and to look up jump targets (presumably the offset of
                     the instruction at ``inst_index`` -- confirm with caller).
      :param inst_index: index into ``self.insts`` of the instruction to
                         classify.
      """
      code = self.code
      inst = self.insts[inst_index]
      op = inst.opcode
      # Detect parent structure
      # Start from the first recorded struct and narrow from there.
      parent: Dict[str, Any] = self.structs[0]
      start: int = parent["start"]
      end: int = parent["end"]
      # Pick innermost parent for our offset
      for struct in self.structs:
          current_start = struct["start"]
          current_end = struct["end"]
          # A candidate must contain ``offset`` and lie within the best
          # range found so far.
          if (current_start <= offset < current_end) and (
              current_start >= start and current_end <= end
          ):
              start = current_start
              end = current_end
              parent = struct
      if self.version < (3, 8) and op == self.opc.SETUP_LOOP:
          # We categorize loop types: 'for', 'while', 'while 1' with
          # possibly suffixes '-loop' and '-else'
          # Try to find the jump_back instruction of the loop.
          # It could be a return instruction.
          start += inst.inst_size
          target = self.get_target(offset)
          end = self.restrict_to_parent(target, parent)
          # Remember which SETUP_LOOP jumps to this target.
          self.setup_loops[target] = offset
          if target != end:
              self.fixed_jumps[offset] = end
          # Only the second element of the pair is used below (presumably
          # the offset at which the next source line begins).
          (line_no, next_line_byte) = self.lines[offset]
          jump_back = self.last_instr(
              start, end, self.opc.JUMP_ABSOLUTE, next_line_byte, False
          )
          if jump_back:
              jump_forward_offset = xdis.next_offset(
                  code[jump_back], self.opc, jump_back
              )
          else:
              jump_forward_offset = None
          # Two instructions before ``end``.
          return_val_offset1 = self.prev[self.prev[end]]
          if (
              jump_back
              and jump_back != self.prev_op[end]
              and self.is_jump_forward(jump_forward_offset)
          ):
              # The region ends in RETURN_VALUE (optionally followed by
              # POP_BLOCK): discard the JUMP_ABSOLUTE candidate so that the
              # "ends in return" path below is taken instead.
              if code[self.prev_op[end]] == self.opc.RETURN_VALUE or (
                  code[self.prev_op[end]] == self.opc.POP_BLOCK
                  and code[return_val_offset1] == self.opc.RETURN_VALUE
              ):
                  jump_back = None
          if not jump_back:
              # loop suite ends in return
              jump_back = self.last_instr(start, end, self.opc.RETURN_VALUE)
              if not jump_back:
                  # Neither a jump back nor a return was found; give up.
                  return
              jb_inst = self.get_inst(jump_back)
              # Step past the RETURN_VALUE instruction.
              jump_back = self.next_offset(jb_inst.opcode, jump_back)
              if_offset = None
              if code[self.prev_op[next_line_byte]] not in self.pop_jump_tf:
                  if_offset = self.prev[next_line_byte]
              if if_offset:
                  loop_type = "while"
                  self.ignore_if.add(if_offset)
              else:
                  loop_type = "for"
              target = next_line_byte
              end = xdis.next_offset(code[jump_back], self.opc, jump_back)
          else:
              # If the jump found does not go back before next_line_byte,
              # search again using ``start`` as the bound.
              if self.get_target(jump_back) >= next_line_byte:
                  jump_back = self.last_instr(
                      start, end, self.opc.JUMP_ABSOLUTE, start, False
                  )
              jb_inst = self.get_inst(jump_back)
              jb_next_offset = self.next_offset(jb_inst.opcode, jump_back)
              if end > jb_next_offset and self.is_jump_forward(end):
                  if self.is_jump_forward(jb_next_offset):
                      # Both forward jumps share a target: end the loop
                      # right after the jump back.
                      if self.get_target(jb_next_offset) == self.get_target(end):
                          self.fixed_jumps[offset] = jb_next_offset
                          end = jb_next_offset
              elif target < offset:
                  self.fixed_jumps[offset] = jb_next_offset
                  end = jb_next_offset
              # From here on ``target`` is where the loop jumps back to.
              target = self.get_target(jump_back)
              if code[target] in (self.opc.FOR_ITER, self.opc.GET_ITER):
                  loop_type = "for"
              else:
                  loop_type = "while"
                  test = self.prev_op[next_line_byte]
                  if test == offset:
                      # Nothing lies between SETUP_LOOP and the loop body.
                      loop_type = "while 1"
                  elif self.code[test] in self.opc.JUMP_OPs:
                      self.ignore_if.add(test)
                      test_target = self.get_target(test)
                      # NOTE(review): the hard-coded 3 looks like a
                      # pre-wordcode instruction size -- confirm it is
                      # still intended for 3.7/3.8 bytecode.
                      if test_target > (jump_back + 3):
                          jump_back = test_target
              self.not_continue.add(jump_back)
          self.loops.append(target)
          self.structs.append(
              {"type": loop_type + "-loop", "start": target, "end": jump_back}
          )
          after_jump_offset = xdis.next_offset(code[jump_back], self.opc, jump_back)
          # Anything between the jump back and ``end`` is recorded as the
          # loop's "-else" part.
          if after_jump_offset != end:
              self.structs.append(
                  {
                      "type": loop_type + "-else",
                      "start": after_jump_offset,
                      "end": end,
                  }
              )
      elif op in self.pop_jump_tf:
          # Conditional jump: record its target unchanged.
          target = inst.argval
          self.fixed_jumps[offset] = target
      # FIXME: consider removing the test on 3.8.
      elif self.version >= (3, 8) and inst.is_jump():
          self.fixed_jumps[offset] = inst.argval
      elif self.version < (3, 8) and op == self.opc.SETUP_EXCEPT:
          # Clamp the jump target using the enclosing structure.
          target = self.get_target(offset)
          end = self.restrict_to_parent(target, parent)
          self.fixed_jumps[offset] = end
      elif self.version < (3, 8) and op == self.opc.POP_EXCEPT:
          next_offset = xdis.next_offset(op, self.opc, offset)
          target = self.get_target(next_offset)
          if target > next_offset:
              next_op = code[next_offset]
              # Only a JUMP_ABSOLUTE that is not immediately followed by
              # END_FINALLY is recorded.
              if (
                  self.opc.JUMP_ABSOLUTE == next_op
                  and self.opc.END_FINALLY
                  != code[xdis.next_offset(next_op, self.opc, next_offset)]
              ):
                  self.fixed_jumps[next_offset] = target
                  self.except_targets[target] = next_offset
      elif op == self.opc.SETUP_FINALLY:
          # Handled the same way as SETUP_EXCEPT, but on every version.
          target = self.get_target(offset)
          end = self.restrict_to_parent(target, parent)
          self.fixed_jumps[offset] = end
      elif op in self.jump_if_pop:
          target = self.get_target(offset)
          if target > offset:
              unop_target = self.last_instr(
                  offset, target, self.opc.JUMP_FORWARD, target
              )
              # NOTE(review): ``unop_target + 3`` assumes a 3-byte
              # instruction; other code here uses xdis.next_offset() --
              # confirm this index is correct for wordcode.
              if unop_target and code[unop_target + 3] != self.opc.ROT_TWO:
                  self.fixed_jumps[offset] = unop_target
              else:
                  self.fixed_jumps[offset] = self.restrict_to_parent(target, parent)
                  pass
              pass
      else:
          # 3.5+ has Jump optimization which too often causes RETURN_VALUE to get
          # misclassified as RETURN_END_IF. Handle that here.
          # In RETURN_VALUE, JUMP_ABSOLUTE, RETURN_VALUE is never RETURN_END_IF
          if op == self.opc.RETURN_VALUE:
              next_offset = xdis.next_offset(op, self.opc, offset)
              # Guard against RETURN_VALUE being the last instruction.
              if next_offset < len(code) and (
                  code[next_offset] == self.opc.JUMP_ABSOLUTE
                  and offset in self.return_end_ifs
              ):
                  self.return_end_ifs.remove(offset)
                  pass
              pass
          elif op == self.opc.JUMP_FORWARD:
              # If we have:
              #   JUMP_FORWARD x, [non-jump, insns], RETURN_VALUE, x:
              # then RETURN_VALUE is not RETURN_END_IF
              rtarget = self.get_target(offset)
              rtarget_prev = self.prev[rtarget]
              if (
                  code[rtarget_prev] == self.opc.RETURN_VALUE
                  and rtarget_prev in self.return_end_ifs
              ):
                  # Walk backwards from the RETURN_VALUE to this jump; any
                  # other jump in between leaves return_end_ifs untouched.
                  i = rtarget_prev
                  while i != offset:
                      # NOTE(review): ``op3`` is a module-level name not
                      # visible in this chunk; the rest of this method
                      # uses ``self.opc`` -- confirm they agree.
                      if code[i] in [op3.JUMP_FORWARD, op3.JUMP_ABSOLUTE]:
                          return
                      i = self.prev[i]
                  self.return_end_ifs.remove(rtarget_prev)
              pass
      return
  def next_except_jump(self, start):
      """
      Find the jump belonging to an ``except SomeException:`` clause.

      Scanning begins at bytecode offset ``start``.  The offset of the jump
      instruction is added to ``self.not_continue`` and returned; ``None``
      is returned when no such jump is found.
      """
      if self.code[start] == self.opc.DUP_TOP:
          # An exception-match test: the wanted jump is the instruction
          # just before the target of its POP_JUMP_IF_FALSE.
          match_offset = self.first_instr(
              start, len(self.code), self.opc.POP_JUMP_IF_FALSE
          )
          if match_offset:
              jump_offset = self.prev_op[self.get_target(match_offset)]
              self.ignore_if.add(match_offset)
              self.not_continue.add(jump_offset)
              return jump_offset

      # Number of setup opcodes seen that have not yet been closed by an
      # END_FINALLY.  Zero means the next END_FINALLY is the one we want.
      open_setups = 0
      for cur in self.op_range(start, len(self.code)):
          opcode = self.code[cur]
          if opcode == self.opc.END_FINALLY:
              if open_setups == 0:
                  before = self.prev_op[cur]
                  assert self.code[before] in (
                      self.opc.JUMP_ABSOLUTE,
                      self.opc.JUMP_FORWARD,
                      self.opc.RETURN_VALUE,
                  )
                  self.not_continue.add(before)
                  return before
              open_setups -= 1
          elif opcode in self.setup_opts_no_loop:
              open_setups += 1
      return None
  if __name__ == "__main__":
      # Small self-demo: tokenize this module's own code object and print
      # each token.  This must stay at module level (not inside a helper
      # function) because inspect.currentframe() is used to obtain the
      # module's code object.
      from xdis.version_info import PYTHON_VERSION_TRIPLE, version_tuple_to_str

      # Accept exactly the range the message below promises (3.7..3.8);
      # previously any newer interpreter passed the check as well.
      if (3, 7) <= PYTHON_VERSION_TRIPLE[:2] <= (3, 8):
          import inspect

          my_co = inspect.currentframe().f_code  # type: ignore
          # The second element of ingest()'s result is not needed here.
          my_tokens, _customize = Scanner37Base(PYTHON_VERSION_TRIPLE).ingest(my_co)
          for my_token in my_tokens:
              print(my_token)
      else:
          print(
              "Need to be Python 3.7..3.8 to demo; "
              f"I am version {version_tuple_to_str()}."
          )