scanner37base.py 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014
  1. # Copyright (c) 2015-2020, 2022-2024 by Rocky Bernstein
  2. # Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
  3. # Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
  4. #
  5. # This program is free software: you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation, either version 3 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. """
  18. Python 3.7 bytecode scanner/deparser base.
  19. Also we *modify* the instruction sequence to assist deparsing code.
  20. For example:
  21. - we add "COME_FROM" instructions to help in figuring out
  22. conditional branching and looping.
  23. - LOAD_CONSTs are classified further into the type of thing
  24. they load:
  25. lambda's, genexpr's, {dict,set,list} comprehension's,
  26. - PARAMETER counts appended {CALL,MAKE}_FUNCTION, BUILD_{TUPLE,SET,SLICE}
  27. Finally we save token information.
  28. """
  29. import sys
  30. from typing import Any, Dict, List, Set, Tuple
  31. import xdis
  32. # Get all the opcodes into globals
  33. import xdis.opcodes.opcode_37 as op3
  34. from xdis import Instruction, instruction_size, iscode
  35. from xdis.bytecode import _get_const_info
  36. from uncompyle6.scanner import Scanner, Token
# Inject every entry of the Python 3.7 opcode map into this module's global
# namespace ("Get all the opcodes into globals"), so opcodes can be referred
# to by bare name.
globals().update(op3.opmap)

# NOTE(review): not referenced in the visible part of this file; presumably the
# token names used for constant list/set/dict collections -- confirm with users
# of this constant.
CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT")
  39. class Scanner37Base(Scanner):
  40. def __init__(
  41. self, version: Tuple[int, int], show_asm=None, debug="", is_pypy=False
  42. ):
  43. super(Scanner37Base, self).__init__(version, show_asm, is_pypy)
  44. self.offset2tok_index = None
  45. self.debug = debug
  46. # True is code is from PyPy
  47. self.is_pypy = is_pypy
  48. # Bytecode converted into instruction
  49. self.insts = []
  50. # Create opcode classification sets
  51. # Note: super initialization above initializes self.opc
  52. # Ops that start SETUP_ ... We will COME_FROM with these names
  53. # Some blocks and END_ statements. And they can start
  54. # a new statement
  55. if self.version < (3, 8):
  56. setup_ops = [
  57. self.opc.SETUP_LOOP,
  58. self.opc.SETUP_EXCEPT,
  59. self.opc.SETUP_FINALLY,
  60. ]
  61. self.setup_ops_no_loop = frozenset(setup_ops) - frozenset(
  62. [self.opc.SETUP_LOOP]
  63. )
  64. else:
  65. setup_ops = [self.opc.SETUP_FINALLY]
  66. self.setup_ops_no_loop = frozenset(setup_ops)
  67. # Add back these opcodes which help us detect "break" and
  68. # "continue" statements via parsing.
  69. self.opc.BREAK_LOOP = 80
  70. self.opc.CONTINUE_LOOP = 119
  71. pass
  72. setup_ops.append(self.opc.SETUP_WITH)
  73. self.setup_ops = frozenset(setup_ops)
  74. self.pop_jump_tf = frozenset([self.opc.PJIF, self.opc.PJIT])
  75. self.not_continue_follow = ("END_FINALLY", "POP_BLOCK")
  76. # Opcodes that can start a statement.
  77. statement_opcodes = [
  78. self.opc.POP_BLOCK,
  79. self.opc.STORE_FAST,
  80. self.opc.DELETE_FAST,
  81. self.opc.STORE_DEREF,
  82. self.opc.STORE_GLOBAL,
  83. self.opc.DELETE_GLOBAL,
  84. self.opc.STORE_NAME,
  85. self.opc.DELETE_NAME,
  86. self.opc.STORE_ATTR,
  87. self.opc.DELETE_ATTR,
  88. self.opc.STORE_SUBSCR,
  89. self.opc.POP_TOP,
  90. self.opc.DELETE_SUBSCR,
  91. self.opc.END_FINALLY,
  92. self.opc.RETURN_VALUE,
  93. self.opc.RAISE_VARARGS,
  94. self.opc.PRINT_EXPR,
  95. self.opc.JUMP_ABSOLUTE,
  96. # These are phony for 3.8+
  97. self.opc.BREAK_LOOP,
  98. self.opc.CONTINUE_LOOP,
  99. ]
  100. self.statement_opcodes = frozenset(statement_opcodes) | self.setup_ops_no_loop
  101. # Opcodes that can start a "store" non-terminal.
  102. # FIXME: JUMP_ABSOLUTE is weird. What's up with that?
  103. self.designator_ops = frozenset(
  104. [
  105. self.opc.STORE_FAST,
  106. self.opc.STORE_NAME,
  107. self.opc.STORE_GLOBAL,
  108. self.opc.STORE_DEREF,
  109. self.opc.STORE_ATTR,
  110. self.opc.STORE_SUBSCR,
  111. self.opc.UNPACK_SEQUENCE,
  112. self.opc.JUMP_ABSOLUTE,
  113. self.opc.UNPACK_EX,
  114. ]
  115. )
  116. self.jump_if_pop = frozenset(
  117. [self.opc.JUMP_IF_FALSE_OR_POP, self.opc.JUMP_IF_TRUE_OR_POP]
  118. )
  119. self.pop_jump_if_pop = frozenset(
  120. [
  121. self.opc.JUMP_IF_FALSE_OR_POP,
  122. self.opc.JUMP_IF_TRUE_OR_POP,
  123. self.opc.POP_JUMP_IF_TRUE,
  124. self.opc.POP_JUMP_IF_FALSE,
  125. ]
  126. )
  127. # Not really a set, but still classification-like
  128. self.statement_opcode_sequences = [
  129. (self.opc.POP_JUMP_IF_FALSE, self.opc.JUMP_FORWARD),
  130. (self.opc.POP_JUMP_IF_FALSE, self.opc.JUMP_ABSOLUTE),
  131. (self.opc.POP_JUMP_IF_TRUE, self.opc.JUMP_FORWARD),
  132. (self.opc.POP_JUMP_IF_TRUE, self.opc.JUMP_ABSOLUTE),
  133. ]
  134. # FIXME: remove this and use instead info from xdis.
  135. # Opcodes that take a variable number of arguments
  136. # (expr's)
  137. varargs_ops = set(
  138. [
  139. self.opc.BUILD_LIST,
  140. self.opc.BUILD_TUPLE,
  141. self.opc.BUILD_SET,
  142. self.opc.BUILD_SLICE,
  143. self.opc.BUILD_MAP,
  144. self.opc.UNPACK_SEQUENCE,
  145. self.opc.RAISE_VARARGS,
  146. ]
  147. )
  148. varargs_ops.add(self.opc.CALL_METHOD)
  149. varargs_ops |= set(
  150. [
  151. self.opc.BUILD_SET_UNPACK,
  152. self.opc.BUILD_MAP_UNPACK, # we will handle this later
  153. self.opc.BUILD_LIST_UNPACK,
  154. self.opc.BUILD_TUPLE_UNPACK,
  155. ]
  156. )
  157. varargs_ops.add(self.opc.BUILD_CONST_KEY_MAP)
  158. # Below is in bit order, "default = bit 0, closure = bit 3
  159. self.MAKE_FUNCTION_FLAGS = tuple(
  160. """
  161. default keyword-only annotation closure""".split()
  162. )
  163. self.varargs_ops = frozenset(varargs_ops)
  164. # FIXME: remove the above in favor of:
  165. # self.varargs_ops = frozenset(self.opc.hasvargs)
  166. return
  167. def ingest(self, co, classname=None, code_objects={}, show_asm=None):
  168. """Create "tokens" the bytecode of an Python code object. Largely these
  169. are the opcode name, but in some cases that has been modified to make parsing
  170. easier.
  171. returning a list of uncompyle6 Token's.
  172. Some transformations are made to assist the deparsing grammar:
  173. - various types of LOAD_CONST's are categorized in terms of what they load
  174. - COME_FROM instructions are added to assist parsing control structures
  175. - operands with stack argument counts or flag masks are appended to the
  176. opcode name, e.g.:
  177. * BUILD_LIST, BUILD_SET
  178. * MAKE_FUNCTION and FUNCTION_CALLS append the number of positional
  179. arguments
  180. - EXTENDED_ARGS instructions are removed
  181. Also, when we encounter certain tokens, we add them to a set
  182. which will cause custom grammar rules. Specifically, variable
  183. arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific
  184. rules for the specific number of arguments they take.
  185. """
  186. def tokens_append(j, token):
  187. tokens.append(token)
  188. self.offset2tok_index[token.offset] = j
  189. j += 1
  190. assert j == len(tokens)
  191. return j
  192. if not show_asm:
  193. show_asm = self.show_asm
  194. bytecode = self.build_instructions(co)
  195. if show_asm in ("both", "before"):
  196. print("\n# ---- disassembly:")
  197. bytecode.disassemble_bytes(
  198. co.co_code,
  199. varnames=co.co_varnames,
  200. names=co.co_names,
  201. constants=co.co_consts,
  202. cells=bytecode._cell_names,
  203. line_starts=bytecode._linestarts,
  204. asm_format="extended",
  205. filename=co.co_filename,
  206. show_source=True,
  207. first_line_number=co.co_firstlineno,
  208. )
  209. # "customize" is in the process of going away here
  210. customize = {}
  211. if self.is_pypy:
  212. customize["PyPy"] = 0
  213. # Scan for assertions. Later we will
  214. # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
  215. # 'LOAD_ASSERT' is used in assert statements.
  216. self.load_asserts = set()
  217. # list of tokens/instructions
  218. tokens = []
  219. self.offset2tok_index = {}
  220. n = len(self.insts)
  221. for i, inst in enumerate(self.insts):
  222. # We need to detect the difference between:
  223. # raise AssertionError
  224. # and
  225. # assert ...
  226. # If we have a JUMP_FORWARD after the
  227. # RAISE_VARARGS then we have a "raise" statement
  228. # else we have an "assert" statement.
  229. assert_can_follow = inst.opname == "POP_JUMP_IF_TRUE" and i + 1 < n
  230. if assert_can_follow:
  231. next_inst = self.insts[i + 1]
  232. if (
  233. next_inst.opname == "LOAD_GLOBAL"
  234. and next_inst.argval == "AssertionError"
  235. and inst.argval is not None
  236. ):
  237. raise_inst = self.get_inst(self.prev_op[inst.argval])
  238. if raise_inst.opname.startswith("RAISE_VARARGS"):
  239. self.load_asserts.add(next_inst.offset)
  240. pass
  241. pass
  242. # Operand values in Python wordcode are small. As a result,
  243. # there are these EXTENDED_ARG instructions - way more than
  244. # before 3.6. These parsing a lot of pain.
  245. # To simplify things we want to untangle this. We also
  246. # do this loop before we compute jump targets.
  247. for i, inst in enumerate(self.insts):
  248. # One artifact of the "too-small" operand problem, is that
  249. # some backward jumps, are turned into forward jumps to another
  250. # "extended arg" backward jump to the same location.
  251. if inst.opname == "JUMP_FORWARD":
  252. jump_inst = self.get_inst(inst.argval)
  253. if jump_inst.has_extended_arg and jump_inst.opname.startswith("JUMP"):
  254. # Create a combination of the jump-to instruction and
  255. # this one. Keep the position information of this instruction,
  256. # but the operator and operand properties come from the other
  257. # instruction
  258. self.insts[i] = Instruction(
  259. opcode=jump_inst.opcode,
  260. opname=jump_inst.opname,
  261. arg=jump_inst.arg,
  262. argval=jump_inst.argval,
  263. argrepr=jump_inst.argrepr,
  264. offset=inst.offset,
  265. starts_line=inst.starts_line,
  266. is_jump_target=inst.is_jump_target,
  267. positions=None,
  268. optype=jump_inst.optype,
  269. has_arg=jump_inst.has_arg,
  270. inst_size=jump_inst.inst_size,
  271. has_extended_arg=inst.has_extended_arg,
  272. fallthrough=False,
  273. tos_str=None,
  274. start_offset=None,
  275. )
  276. # Get jump targets
  277. # Format: {target offset: [jump offsets]}
  278. jump_targets = self.find_jump_targets(show_asm)
  279. # print("XXX2", jump_targets)
  280. last_op_was_break = False
  281. j = 0
  282. for i, inst in enumerate(self.insts):
  283. argval = inst.argval
  284. op = inst.opcode
  285. if inst.offset in jump_targets:
  286. jump_idx = 0
  287. # We want to process COME_FROMs to the same offset to be in *descending*
  288. # offset order so we have the larger range or biggest instruction interval
  289. # last. (I think they are sorted in increasing order, but for safety
  290. # we sort them). That way, specific COME_FROM tags will match up
  291. # properly. For example, a "loop" with an "if" nested in it should have the
  292. # "loop" tag last so the grammar rule matches that properly.
  293. for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
  294. come_from_name = "COME_FROM"
  295. opname = self.opname_for_offset(jump_offset)
  296. if opname == "EXTENDED_ARG":
  297. k = xdis.next_offset(op, self.opc, jump_offset)
  298. opname = self.opname_for_offset(k)
  299. if opname.startswith("SETUP_"):
  300. come_from_type = opname[len("SETUP_") :]
  301. come_from_name = "COME_FROM_%s" % come_from_type
  302. pass
  303. elif inst.offset in self.except_targets:
  304. come_from_name = "COME_FROM_EXCEPT_CLAUSE"
  305. j = tokens_append(
  306. j,
  307. Token(
  308. opname=come_from_name,
  309. attr=jump_offset,
  310. pattr=repr(jump_offset),
  311. offset="%s_%s" % (inst.offset, jump_idx),
  312. has_arg=True,
  313. opc=self.opc,
  314. has_extended_arg=False,
  315. optype=inst.optype,
  316. ),
  317. )
  318. jump_idx += 1
  319. pass
  320. pass
  321. pattr = inst.argrepr
  322. opname = inst.opname
  323. if op in self.opc.CONST_OPS:
  324. const = argval
  325. if iscode(const):
  326. if const.co_name == "<lambda>":
  327. assert opname == "LOAD_CONST"
  328. opname = "LOAD_LAMBDA"
  329. elif const.co_name == "<genexpr>":
  330. opname = "LOAD_GENEXPR"
  331. elif const.co_name == "<dictcomp>":
  332. opname = "LOAD_DICTCOMP"
  333. elif const.co_name == "<setcomp>":
  334. opname = "LOAD_SETCOMP"
  335. elif const.co_name == "<listcomp>":
  336. opname = "LOAD_LISTCOMP"
  337. else:
  338. opname = "LOAD_CODE"
  339. # verify() uses 'pattr' for comparison, since 'attr'
  340. # now holds Code(const) and thus can not be used
  341. # for comparison (todo: think about changing this)
  342. # pattr = 'code_object @ 0x%x %s->%s' %\
  343. # (id(const), const.co_filename, const.co_name)
  344. pattr = "<code_object " + const.co_name + ">"
  345. elif isinstance(const, str):
  346. opname = "LOAD_STR"
  347. else:
  348. if isinstance(inst.arg, int) and inst.arg < len(co.co_consts):
  349. argval, _ = _get_const_info(inst.arg, co.co_consts)
  350. # Why don't we use _ above for "pattr" rather than "const"?
  351. # This *is* a little hoaky, but we have to coordinate with
  352. # other parts like n_LOAD_CONST in pysource.py for example.
  353. pattr = const
  354. pass
  355. elif opname == "IMPORT_NAME":
  356. if "." in inst.argval:
  357. opname = "IMPORT_NAME_ATTR"
  358. pass
  359. elif opname == "LOAD_FAST" and argval == ".0":
  360. # Used as the parameter of a list expression
  361. opname = "LOAD_ARG"
  362. elif opname in ("MAKE_FUNCTION", "MAKE_CLOSURE"):
  363. flags = argval
  364. opname = "MAKE_FUNCTION_%d" % (flags)
  365. attr = []
  366. for flag in self.MAKE_FUNCTION_FLAGS:
  367. bit = flags & 1
  368. attr.append(bit)
  369. flags >>= 1
  370. attr = attr[:4] # remove last value: attr[5] == False
  371. j = tokens_append(
  372. j,
  373. Token(
  374. opname=opname,
  375. attr=attr,
  376. pattr=pattr,
  377. offset=inst.offset,
  378. linestart=inst.starts_line,
  379. op=op,
  380. has_arg=inst.has_arg,
  381. opc=self.opc,
  382. has_extended_arg=inst.has_extended_arg,
  383. optype=inst.optype,
  384. ),
  385. )
  386. continue
  387. elif op in self.varargs_ops:
  388. pos_args = argval
  389. if self.is_pypy and not pos_args and opname == "BUILD_MAP":
  390. opname = "BUILD_MAP_n"
  391. else:
  392. opname = "%s_%d" % (opname, pos_args)
  393. elif self.is_pypy and opname == "JUMP_IF_NOT_DEBUG":
  394. # The value in the dict is in special cases in semantic actions, such
  395. # as JUMP_IF_NOT_DEBUG. The value is not used in these cases, so we put
  396. # in arbitrary value 0.
  397. customize[opname] = 0
  398. elif opname == "UNPACK_EX":
  399. # FIXME: try with scanner and parser by
  400. # changing argval
  401. before_args = argval & 0xFF
  402. after_args = (argval >> 8) & 0xFF
  403. pattr = "%d before vararg, %d after" % (before_args, after_args)
  404. argval = (before_args, after_args)
  405. opname = "%s_%d+%d" % (opname, before_args, after_args)
  406. elif op == self.opc.JUMP_ABSOLUTE:
  407. # Refine JUMP_ABSOLUTE further in into:
  408. #
  409. # * "JUMP_LOOP" - which are used in loops. This is sometimes
  410. # found at the end of a looping construct
  411. # * "BREAK_LOOP" - which are used to break loops.
  412. # * "CONTINUE" - jumps which may appear in a "continue" statement.
  413. # It is okay to confuse this with JUMP_LOOP. The
  414. # grammar should tolerate this.
  415. # * "JUMP_FORWARD - forward jumps that are not BREAK_LOOP jumps.
  416. #
  417. # The loop-type and continue-type jumps will help us
  418. # classify loop boundaries The continue-type jumps
  419. # help us get "continue" statements with would
  420. # otherwise be turned into a "pass" statement because
  421. # JUMPs are sometimes ignored in rules as just
  422. # boundary overhead. Again, in comprehensions we might
  423. # sometimes classify JUMP_LOOP as CONTINUE, but that's
  424. # okay since grammar rules should tolerate that.
  425. pattr = argval
  426. target = inst.argval
  427. if target <= inst.offset:
  428. next_opname = self.insts[i + 1].opname
  429. # 'Continue's include jumps to loops that are not
  430. # and the end of a block which follow with
  431. # POP_BLOCK and COME_FROM_LOOP. If the
  432. # JUMP_ABSOLUTE is to a FOR_ITER, and it is
  433. # followed by another JUMP_FORWARD then we'll take
  434. # it as a "continue".
  435. next_inst = self.insts[i + 1]
  436. is_continue = self.insts[
  437. self.offset2inst_index[target]
  438. ].opname == "FOR_ITER" and next_inst.opname in (
  439. "JUMP_FORWARD",
  440. "JUMP_ABSOLUTE",
  441. )
  442. if self.version < (3, 8) and (
  443. is_continue
  444. or (
  445. inst.offset in self.stmts
  446. and (
  447. inst.starts_line
  448. and next_opname not in self.not_continue_follow
  449. )
  450. )
  451. ):
  452. opname = "CONTINUE"
  453. else:
  454. # "continue" versus "break_loop" dectction is more complicated
  455. # because "continue" to an outer loop is really a "break loop"
  456. opname = "JUMP_BACK"
  457. # FIXME: this is a hack to catch stuff like:
  458. # if x: continue
  459. # the "continue" is not on a new line.
  460. #
  461. # Another situation is where we have
  462. # for method in methods:
  463. # for B in method:
  464. # if c:
  465. # return
  466. # break # A "continue" but not the innermost one
  467. if tokens[-1].kind == "JUMP_LOOP" and tokens[-1].attr <= argval:
  468. if tokens[-2].kind == "BREAK_LOOP":
  469. del tokens[-1]
  470. j -= 1
  471. else:
  472. # "intern" is used because we are
  473. # changing the *previous* token. A
  474. # POP_TOP suggests a "break" rather
  475. # than a "continue"?
  476. if tokens[-2] == "POP_TOP" and (
  477. is_continue and next_inst.argval != tokens[-1].attr
  478. ):
  479. tokens[-1].kind = sys.intern("BREAK_LOOP")
  480. else:
  481. tokens[-1].kind = sys.intern("CONTINUE")
  482. last_continue = tokens[-1]
  483. pass
  484. pass
  485. pass
  486. # elif (
  487. # last_continue is not None
  488. # and tokens[-1].kind == "JUMP_LOOP"
  489. # and last_continue.attr <= tokens[-1].attr
  490. # and last_continue.offset > tokens[-1].attr
  491. # ):
  492. # # Handle mis-characterized "CONTINUE"
  493. # # We have a situation like:
  494. # # loop ... for or while)
  495. # # loop
  496. # # if ...: # code below starts here
  497. # # break # not continue
  498. # #
  499. # # POP_JUMP_IF_FALSE_LOOP # to outer loop
  500. # # JUMP_LOOP # to inner loop
  501. # # ...
  502. # # JUMP_LOOP # to outer loop
  503. # tokens[-2].kind = sys.intern("BREAK_LOOP")
  504. # pass
  505. # if last_op_was_break and opname == "CONTINUE":
  506. # last_op_was_break = False
  507. # continue
  508. pass
  509. else:
  510. opname = "JUMP_FORWARD"
  511. elif inst.offset in self.load_asserts:
  512. opname = "LOAD_ASSERT"
  513. last_op_was_break = opname == "BREAK_LOOP"
  514. j = tokens_append(
  515. j,
  516. Token(
  517. opname=opname,
  518. attr=argval,
  519. pattr=pattr,
  520. offset=inst.offset,
  521. linestart=inst.starts_line,
  522. op=op,
  523. has_arg=inst.has_arg,
  524. opc=self.opc,
  525. has_extended_arg=inst.has_extended_arg,
  526. optype=inst.optype,
  527. ),
  528. )
  529. pass
  530. if show_asm in ("both", "after") and self.version < (3, 8):
  531. print("\n# ---- tokenization:")
  532. # FIXME: t.format() is changing tokens!
  533. for t in tokens.copy():
  534. print(t.format(line_prefix=""))
  535. print()
  536. return tokens, customize
  537. def find_jump_targets(self, debug: str) -> dict:
  538. """
  539. Detect all offsets in a byte code which are jump targets
  540. where we might insert a COME_FROM instruction.
  541. Return the list of offsets.
  542. Return the list of offsets. An instruction can be jumped
  543. to in from multiple instructions.
  544. """
  545. code = self.code
  546. n = len(code)
  547. self.structs = [{"type": "root", "start": 0, "end": n - 1}]
  548. # All loop entry points
  549. self.loops: List[int] = []
  550. # Map fixed jumps to their real destination
  551. self.fixed_jumps: Dict[int, int] = {}
  552. self.except_targets = {}
  553. self.ignore_if: Set[int] = set()
  554. self.build_statement_indices()
  555. # Containers filled by detect_control_flow()
  556. self.not_continue: Set[int] = set()
  557. self.return_end_ifs: Set[int] = set()
  558. self.setup_loop_targets = {} # target given setup_loop offset
  559. self.setup_loops = {} # setup_loop offset given target
  560. targets = {}
  561. for i, inst in enumerate(self.insts):
  562. offset = inst.offset
  563. op = inst.opcode
  564. # FIXME: this code is going to get removed.
  565. # Determine structures and fix jumps in Python versions
  566. # since 2.3
  567. self.detect_control_flow(offset, targets, i)
  568. if inst.has_arg:
  569. label = self.fixed_jumps.get(offset)
  570. oparg = inst.arg
  571. if self.code[offset] == self.opc.EXTENDED_ARG:
  572. j = xdis.next_offset(op, self.opc, offset)
  573. next_offset = xdis.next_offset(op, self.opc, j)
  574. else:
  575. next_offset = xdis.next_offset(op, self.opc, offset)
  576. if label is None:
  577. if op in self.opc.hasjrel and op != self.opc.FOR_ITER:
  578. label = next_offset + oparg
  579. elif op in self.opc.hasjabs:
  580. if op in self.jump_if_pop:
  581. if oparg > offset:
  582. label = oparg
  583. if label is not None and label != -1:
  584. targets[label] = targets.get(label, []) + [offset]
  585. elif op == self.opc.END_FINALLY and offset in self.fixed_jumps:
  586. label = self.fixed_jumps[offset]
  587. targets[label] = targets.get(label, []) + [offset]
  588. pass
  589. pass # for loop
  590. # DEBUG:
  591. if debug in ("both", "after"):
  592. import pprint as pp
  593. pp.pprint(self.structs)
  594. return targets
  595. def build_statement_indices(self):
  596. code = self.code
  597. start = 0
  598. end = codelen = len(code)
  599. # Compose preliminary list of indices with statements,
  600. # using plain statement opcodes
  601. prelim = self.inst_matches(start, end, self.statement_opcodes)
  602. # Initialize final container with statements with
  603. # preliminary data
  604. stmts = self.stmts = set(prelim)
  605. # Same for opcode sequences
  606. pass_stmts = set()
  607. for sequence in self.statement_opcode_sequences:
  608. for i in self.op_range(start, end - (len(sequence) + 1)):
  609. match = True
  610. for elem in sequence:
  611. if elem != code[i]:
  612. match = False
  613. break
  614. i += instruction_size(code[i], self.opc)
  615. if match is True:
  616. i = self.prev_op[i]
  617. stmts.add(i)
  618. pass_stmts.add(i)
  619. # Initialize statement list with the full data we've gathered so far
  620. if pass_stmts:
  621. stmt_offset_list = list(stmts)
  622. stmt_offset_list.sort()
  623. else:
  624. stmt_offset_list = prelim
  625. # 'List-map' which contains offset of start of
  626. # next statement, when op offset is passed as index
  627. self.next_stmt = slist = []
  628. last_stmt_offset = -1
  629. i = 0
  630. # Go through all statement offsets
  631. for stmt_offset in stmt_offset_list:
  632. # Process absolute jumps, but do not remove 'pass' statements
  633. # from the set
  634. if (
  635. code[stmt_offset] == self.opc.JUMP_ABSOLUTE
  636. and stmt_offset not in pass_stmts
  637. ):
  638. # If absolute jump occurs in forward direction or it takes off from the
  639. # same line as previous statement, this is not a statement
  640. # FIXME: 0 isn't always correct
  641. target = self.get_target(stmt_offset)
  642. if (
  643. target > stmt_offset
  644. or self.lines[last_stmt_offset].l_no == self.lines[stmt_offset].l_no
  645. ):
  646. stmts.remove(stmt_offset)
  647. continue
  648. # Scan back bytecode ops till we encounter non-JUMP_ABSOLUTE op
  649. j = self.prev_op[stmt_offset]
  650. while code[j] == self.opc.JUMP_ABSOLUTE and j > 0:
  651. j = self.prev_op[j]
  652. # If we got here, then it's list comprehension which
  653. # is not a statement too
  654. if code[j] == self.opc.LIST_APPEND:
  655. stmts.remove(stmt_offset)
  656. continue
  657. # Exclude ROT_TWO + POP_TOP
  658. elif (
  659. code[stmt_offset] == self.opc.POP_TOP
  660. and code[self.prev_op[stmt_offset]] == self.opc.ROT_TWO
  661. ):
  662. stmts.remove(stmt_offset)
  663. continue
  664. # Exclude FOR_ITER + designators
  665. elif code[stmt_offset] in self.designator_ops:
  666. j = self.prev_op[stmt_offset]
  667. while code[j] in self.designator_ops:
  668. j = self.prev_op[j]
  669. if code[j] == self.opc.FOR_ITER:
  670. stmts.remove(stmt_offset)
  671. continue
  672. # Add to list another list with offset of current statement,
  673. # equal to length of previous statement
  674. slist += [stmt_offset] * (stmt_offset - i)
  675. last_stmt_offset = stmt_offset
  676. i = stmt_offset
  677. # Finish filling the list for last statement
  678. slist += [codelen] * (codelen - len(slist))
  679. def detect_control_flow(
  680. self, offset: int, targets: Dict[Any, Any], inst_index: int
  681. ):
  682. """
  683. Detect type of block structures and their boundaries to fix optimized jumps
  684. in python2.3+
  685. """
  686. code = self.code
  687. inst = self.insts[inst_index]
  688. op = inst.opcode
  689. # Detect parent structure
  690. parent: Dict[str, Any] = self.structs[0]
  691. start: int = parent["start"]
  692. end: int = parent["end"]
  693. # Pick inner-most parent for our offset
  694. for struct in self.structs:
  695. current_start = struct["start"]
  696. current_end = struct["end"]
  697. if (current_start <= offset < current_end) and (
  698. current_start >= start and current_end <= end
  699. ):
  700. start = current_start
  701. end = current_end
  702. parent = struct
  703. if self.version < (3, 8) and op == self.opc.SETUP_LOOP:
  704. # We categorize loop types: 'for', 'while', 'while 1' with
  705. # possibly suffixes '-loop' and '-else'
  706. # Try to find the jump_back instruction of the loop.
  707. # It could be a return instruction.
  708. start += inst.inst_size
  709. target = self.get_target(offset)
  710. end = self.restrict_to_parent(target, parent)
  711. self.setup_loops[target] = offset
  712. if target != end:
  713. self.fixed_jumps[offset] = end
  714. (line_no, next_line_byte) = self.lines[offset]
  715. jump_back = self.last_instr(
  716. start, end, self.opc.JUMP_ABSOLUTE, next_line_byte, False
  717. )
  718. if jump_back:
  719. jump_forward_offset = xdis.next_offset(
  720. code[jump_back], self.opc, jump_back
  721. )
  722. else:
  723. jump_forward_offset = None
  724. return_val_offset1 = self.prev[self.prev[end]]
  725. if (
  726. jump_back
  727. and jump_back != self.prev_op[end]
  728. and self.is_jump_forward(jump_forward_offset)
  729. ):
  730. if code[self.prev_op[end]] == self.opc.RETURN_VALUE or (
  731. code[self.prev_op[end]] == self.opc.POP_BLOCK
  732. and code[return_val_offset1] == self.opc.RETURN_VALUE
  733. ):
  734. jump_back = None
  735. if not jump_back:
  736. # loop suite ends in return
  737. jump_back = self.last_instr(start, end, self.opc.RETURN_VALUE)
  738. if not jump_back:
  739. return
  740. jb_inst = self.get_inst(jump_back)
  741. jump_back = self.next_offset(jb_inst.opcode, jump_back)
  742. if_offset = None
  743. if code[self.prev_op[next_line_byte]] not in self.pop_jump_tf:
  744. if_offset = self.prev[next_line_byte]
  745. if if_offset:
  746. loop_type = "while"
  747. self.ignore_if.add(if_offset)
  748. else:
  749. loop_type = "for"
  750. target = next_line_byte
  751. end = xdis.next_offset(code[jump_back], self.opc, jump_back)
  752. else:
  753. if self.get_target(jump_back) >= next_line_byte:
  754. jump_back = self.last_instr(
  755. start, end, self.opc.JUMP_ABSOLUTE, start, False
  756. )
  757. jb_inst = self.get_inst(jump_back)
  758. jb_next_offset = self.next_offset(jb_inst.opcode, jump_back)
  759. if end > jb_next_offset and self.is_jump_forward(end):
  760. if self.is_jump_forward(jb_next_offset):
  761. if self.get_target(jb_next_offset) == self.get_target(end):
  762. self.fixed_jumps[offset] = jb_next_offset
  763. end = jb_next_offset
  764. elif target < offset:
  765. self.fixed_jumps[offset] = jb_next_offset
  766. end = jb_next_offset
  767. target = self.get_target(jump_back)
  768. if code[target] in (self.opc.FOR_ITER, self.opc.GET_ITER):
  769. loop_type = "for"
  770. else:
  771. loop_type = "while"
  772. test = self.prev_op[next_line_byte]
  773. if test == offset:
  774. loop_type = "while 1"
  775. elif self.code[test] in self.opc.JUMP_OPs:
  776. self.ignore_if.add(test)
  777. test_target = self.get_target(test)
  778. if test_target > (jump_back + 3):
  779. jump_back = test_target
  780. self.not_continue.add(jump_back)
  781. self.loops.append(target)
  782. self.structs.append(
  783. {"type": loop_type + "-loop", "start": target, "end": jump_back}
  784. )
  785. after_jump_offset = xdis.next_offset(code[jump_back], self.opc, jump_back)
  786. if after_jump_offset != end:
  787. self.structs.append(
  788. {
  789. "type": loop_type + "-else",
  790. "start": after_jump_offset,
  791. "end": end,
  792. }
  793. )
  794. elif op in self.pop_jump_tf:
  795. target = inst.argval
  796. self.fixed_jumps[offset] = target
  797. elif self.version < (3, 8) and op == self.opc.SETUP_EXCEPT:
  798. target = self.get_target(offset)
  799. end = self.restrict_to_parent(target, parent)
  800. self.fixed_jumps[offset] = end
  801. elif op == self.opc.POP_EXCEPT:
  802. next_offset = xdis.next_offset(op, self.opc, offset)
  803. target = self.get_target(next_offset)
  804. if target > next_offset:
  805. next_op = code[next_offset]
  806. if (
  807. self.opc.JUMP_ABSOLUTE == next_op
  808. and self.opc.END_FINALLY
  809. != code[xdis.next_offset(next_op, self.opc, next_offset)]
  810. ):
  811. self.fixed_jumps[next_offset] = target
  812. self.except_targets[target] = next_offset
  813. elif op == self.opc.SETUP_FINALLY:
  814. target = self.get_target(offset)
  815. end = self.restrict_to_parent(target, parent)
  816. self.fixed_jumps[offset] = end
  817. elif op in self.jump_if_pop:
  818. target = self.get_target(offset)
  819. if target > offset:
  820. unop_target = self.last_instr(
  821. offset, target, self.opc.JUMP_FORWARD, target
  822. )
  823. if unop_target and code[unop_target + 3] != self.opc.ROT_TWO:
  824. self.fixed_jumps[offset] = unop_target
  825. else:
  826. self.fixed_jumps[offset] = self.restrict_to_parent(target, parent)
  827. pass
  828. pass
  829. else:
  830. # 3.5+ has Jump optimization which too often causes RETURN_VALUE to get
  831. # misclassified as RETURN_END_IF. Handle that here.
  832. # In RETURN_VALUE, JUMP_ABSOLUTE, RETURN_VALUE is never RETURN_END_IF
  833. if op == self.opc.RETURN_VALUE:
  834. next_offset = xdis.next_offset(op, self.opc, offset)
  835. if next_offset < len(code) and (
  836. code[next_offset] == self.opc.JUMP_ABSOLUTE
  837. and offset in self.return_end_ifs
  838. ):
  839. self.return_end_ifs.remove(offset)
  840. pass
  841. pass
  842. elif op == self.opc.JUMP_FORWARD:
  843. # If we have:
  844. # JUMP_FORWARD x, [non-jump, insns], RETURN_VALUE, x:
  845. # then RETURN_VALUE is not RETURN_END_IF
  846. rtarget = self.get_target(offset)
  847. rtarget_prev = self.prev[rtarget]
  848. if (
  849. code[rtarget_prev] == self.opc.RETURN_VALUE
  850. and rtarget_prev in self.return_end_ifs
  851. ):
  852. i = rtarget_prev
  853. while i != offset:
  854. if code[i] in [op3.JUMP_FORWARD, op3.JUMP_ABSOLUTE]:
  855. return
  856. i = self.prev[i]
  857. self.return_end_ifs.remove(rtarget_prev)
  858. pass
  859. return
  def next_except_jump(self, start):
      """
      Find the jump that ends an ``except SomeException:`` clause of a
      try...except...else construct, searching from offset *start*.

      Returns the offset of that jump, or None if no such jump is found.
      A returned offset is also added to ``self.not_continue``.
      """
      code = self.code
      opc = self.opc
      code_end = len(code)

      # When *start* holds a DUP_TOP, use first_instr() to look for a
      # POP_JUMP_IF_FALSE between *start* and the end of the code; the
      # instruction preceding that jump's target is the answer.
      if code[start] == opc.DUP_TOP:
          match_jump = self.first_instr(start, code_end, opc.POP_JUMP_IF_FALSE)
          if match_jump:
              jump_offset = self.prev_op[self.get_target(match_jump)]
              self.ignore_if.add(match_jump)
              self.not_continue.add(jump_offset)
              return jump_offset

      # Otherwise scan forward for an END_FINALLY that is not paired with a
      # setup opcode seen during this scan. ``unmatched_setups`` is the number
      # of setup opcodes seen so far minus the END_FINALLYs that consumed them.
      unmatched_setups = 0
      for offset in self.op_range(start, code_end):
          op = code[offset]
          if op == opc.END_FINALLY:
              if unmatched_setups == 0:
                  before = self.prev_op[offset]
                  # The instruction ahead of this END_FINALLY is expected to
                  # leave the handler: a jump or a return.
                  assert code[before] in frozenset(
                      (opc.JUMP_ABSOLUTE, opc.JUMP_FORWARD, opc.RETURN_VALUE)
                  )
                  self.not_continue.add(before)
                  return before
              unmatched_setups -= 1
          elif op in self.setup_opts_no_loop:
              unmatched_setups += 1

      return None
  if __name__ == "__main__":
      # Demo entry point: when run as a script on a supported interpreter,
      # scan the code object of the currently executing frame and print each
      # resulting token on its own line.
      from xdis.version_info import PYTHON_VERSION_TRIPLE, version_tuple_to_str

      running_version = PYTHON_VERSION_TRIPLE[:2]
      if not ((3, 7) <= running_version < (3, 9)):
          # This scanner only handles 3.7 and 3.8 bytecode.
          message = (
              "Need to be Python 3.7..3.8 to demo; "
              f"I am version {version_tuple_to_str()}."
          )
          print(message)
      else:
          import inspect

          own_code = inspect.currentframe().f_code  # type: ignore
          scanner = Scanner37Base(PYTHON_VERSION_TRIPLE)
          demo_tokens, _customize = scanner.ingest(own_code)
          for token in demo_tokens:
              print(token)