scanner.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673
  1. # Copyright (c) 2016, 2018-2025 by Rocky Bernstein
  2. # Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
  3. # Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
  4. # Copyright (c) 1999 John Aycock
  5. #
  6. # This program is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. #
  19. """
  20. scanner/ingestion module. From here we call various version-specific
  21. scanners, e.g. for Python 2.7 or 3.4.
  22. """
  23. import importlib
  24. from abc import ABC
  25. from array import array
  26. from collections import namedtuple
  27. from types import ModuleType
  28. from typing import Optional, Union
  29. import xdis
  30. from xdis import (
  31. Bytecode,
  32. canonic_python_version,
  33. code2num,
  34. extended_arg_val,
  35. instruction_size,
  36. next_offset,
  37. )
  38. from xdis.version_info import IS_PYPY, version_tuple_to_str
  39. from uncompyle6.scanners.tok import Token
  40. # The byte code versions we support.
  41. # Note: these all have to be tuples of 2 ints
  42. PYTHON_VERSIONS = frozenset(
  43. (
  44. (1, 0),
  45. (1, 1),
  46. (1, 3),
  47. (1, 4),
  48. (1, 5),
  49. (1, 6),
  50. (2, 1),
  51. (2, 2),
  52. (2, 3),
  53. (2, 4),
  54. (2, 5),
  55. (2, 6),
  56. (2, 7),
  57. (3, 0),
  58. (3, 1),
  59. (3, 2),
  60. (3, 3),
  61. (3, 4),
  62. (3, 5),
  63. (3, 6),
  64. (3, 7),
  65. (3, 8),
  66. )
  67. )
  68. CANONIC2VERSION = dict(
  69. (canonic_python_version[version_tuple_to_str(python_version)], python_version)
  70. for python_version in PYTHON_VERSIONS
  71. )
  72. # Magic changed mid version for Python 3.5.2. Compatibility was added for
  73. # the older 3.5 interpreter magic.
  74. CANONIC2VERSION["3.5.2"] = 3.5
  75. # FIXME: DRY
  76. L65536 = 65536
  77. def long(num):
  78. return num
  79. CONST_COLLECTIONS = ("CONST_LIST", "CONST_SET", "CONST_DICT", "CONST_MAP")
  80. class Code:
  81. """
  82. Class for representing code-objects.
  83. This is similar to the original code object, but additionally
  84. the diassembled code is stored in the attribute '_tokens'.
  85. """
  86. def __init__(self, co, scanner, classname=None, show_asm=None):
  87. # Full initialization is given below, but for linters
  88. # well set up some initial values.
  89. self.co_code = None # Really either bytes for >= 3.0 and string in < 3.0
  90. for i in dir(co):
  91. if i.startswith("co_"):
  92. setattr(self, i, getattr(co, i))
  93. self._tokens, self._customize = scanner.ingest(co, classname, show_asm=show_asm)
  94. class Scanner(ABC):
  95. def __init__(self, version: tuple, show_asm=None, is_pypy=False):
  96. self.version = version
  97. self.show_asm = show_asm
  98. self.is_pypy = is_pypy
  99. # Temporary initialization.
  100. self.opc = ModuleType("uninitialized")
  101. if version[:2] in PYTHON_VERSIONS:
  102. v_str = f"""opcode_{version_tuple_to_str(version, start=0, end=2, delimiter="")}"""
  103. module_name = f"xdis.opcodes.{v_str}"
  104. if is_pypy:
  105. module_name += "pypy"
  106. self.opc = importlib.import_module(module_name)
  107. else:
  108. raise TypeError(
  109. "%s is not a Python version I know about"
  110. % version_tuple_to_str(version)
  111. )
  112. self.opname = self.opc.opname
  113. # FIXME: This weird Python2 behavior is not Python3
  114. self.resetTokenClass()
  115. def bound_collection_from_tokens(self, tokens, t, i, collection_type):
  116. count = t.attr
  117. assert isinstance(count, int)
  118. assert count <= i
  119. if collection_type == "CONST_DICT":
  120. # constant dictionaries work via BUILD_CONST_KEY_MAP and
  121. # handle the values() like sets and lists.
  122. # However, the keys() are an LOAD_CONST of the keys.
  123. # adjust offset to account for this
  124. count += 1
  125. # For small lists don't bother
  126. if count < 5:
  127. return None
  128. collection_start = i - count
  129. for j in range(collection_start, i):
  130. if tokens[j].kind not in (
  131. "LOAD_CONST",
  132. "LOAD_FAST",
  133. "LOAD_GLOBAL",
  134. "LOAD_NAME",
  135. ):
  136. return None
  137. collection_enum = CONST_COLLECTIONS.index(collection_type)
  138. # If we go there all instructions before tokens[i] are LOAD_CONST and we can replace
  139. # add a boundary marker and change LOAD_CONST to something else
  140. new_tokens = tokens[:-count]
  141. start_offset = tokens[collection_start].offset
  142. new_tokens.append(
  143. Token(
  144. opname="COLLECTION_START",
  145. attr=collection_enum,
  146. pattr=collection_type,
  147. offset="%s_0" % start_offset,
  148. has_arg=True,
  149. opc=self.opc,
  150. has_extended_arg=False,
  151. )
  152. )
  153. for j in range(collection_start, i):
  154. if tokens[j] == "LOAD_CONST":
  155. opname = "ADD_VALUE"
  156. else:
  157. opname = "ADD_VALUE_VAR"
  158. new_tokens.append(
  159. Token(
  160. opname=opname,
  161. attr=tokens[j].attr,
  162. pattr=tokens[j].pattr,
  163. offset=tokens[j].offset,
  164. has_arg=True,
  165. linestart=tokens[j].linestart,
  166. opc=self.opc,
  167. has_extended_arg=False,
  168. )
  169. )
  170. new_tokens.append(
  171. Token(
  172. opname="BUILD_%s" % collection_type,
  173. attr=t.attr,
  174. pattr=t.pattr,
  175. offset=t.offset,
  176. has_arg=t.has_arg,
  177. linestart=t.linestart,
  178. opc=t.opc,
  179. has_extended_arg=False,
  180. )
  181. )
  182. return new_tokens
  183. def build_instructions(self, co):
  184. """
  185. Create a list of instructions (a structured object rather than
  186. an array of bytes) and store that in self.insts
  187. """
  188. # FIXME: remove this when all subsidiary functions have been removed.
  189. # We should be able to get everything from the self.insts list.
  190. self.code = array("B", co.co_code)
  191. bytecode = Bytecode(co, self.opc)
  192. self.build_prev_op()
  193. self.insts = self.remove_extended_args(list(bytecode))
  194. self.lines = self.build_lines_data(co)
  195. self.offset2inst_index = {}
  196. for i, inst in enumerate(self.insts):
  197. self.offset2inst_index[inst.offset] = i
  198. return bytecode
  199. def build_lines_data(self, code_obj):
  200. """
  201. Generate various line-related helper data.
  202. """
  203. # Offset: lineno pairs, only for offsets which start line.
  204. # Locally we use list for more convenient iteration using indices
  205. linestarts = list(self.opc.findlinestarts(code_obj))
  206. self.linestarts = dict(linestarts)
  207. if not self.linestarts:
  208. return []
  209. # 'List-map' which shows line number of current op and offset of
  210. # first op on following line, given offset of op as index
  211. lines = []
  212. LineTuple = namedtuple("LineTuple", ["l_no", "next"])
  213. # Iterate through available linestarts, and fill
  214. # the data for all code offsets encountered until
  215. # last linestart offset
  216. _, prev_line_no = linestarts[0]
  217. offset = 0
  218. for start_offset, line_no in linestarts[1:]:
  219. while offset < start_offset:
  220. lines.append(LineTuple(prev_line_no, start_offset))
  221. offset += 1
  222. prev_line_no = line_no
  223. # Fill remaining offsets with reference to last line number
  224. # and code length as start offset of following non-existing line
  225. codelen = len(self.code)
  226. while offset < codelen:
  227. lines.append(LineTuple(prev_line_no, codelen))
  228. offset += 1
  229. return lines
  230. def build_prev_op(self):
  231. """
  232. Compose 'list-map' which allows to jump to previous
  233. op, given offset of current op as index.
  234. """
  235. code = self.code
  236. codelen = len(code)
  237. # 2.x uses prev 3.x uses prev_op. Sigh
  238. # Until we get this sorted out.
  239. self.prev = self.prev_op = [0]
  240. for offset in self.op_range(0, codelen):
  241. op = code[offset]
  242. for _ in range(instruction_size(op, self.opc)):
  243. self.prev_op.append(offset)
  244. def is_jump_forward(self, offset: int) -> bool:
  245. """
  246. Return True if the code at offset is some sort of jump forward.
  247. That is, it is ether "JUMP_FORWARD" or an absolute jump that
  248. goes forward.
  249. """
  250. opname = self.get_inst(offset).opname
  251. if opname == "JUMP_FORWARD":
  252. return True
  253. if opname != "JUMP_ABSOLUTE":
  254. return False
  255. return offset < self.get_target(offset)
  256. def ingest(self, co, classname=None, code_objects={}, show_asm=None):
  257. """
  258. Code to tokenize disassembly. Subclasses must implement this.
  259. """
  260. raise NotImplementedError("This method should have been implemented")
  261. def prev_offset(self, offset: int) -> int:
  262. return self.insts[self.offset2inst_index[offset] - 1].offset
  263. def get_inst(self, offset: int):
  264. # Instructions can get moved as a result of EXTENDED_ARGS removal.
  265. # So if "offset" is not in self.offset2inst_index, then
  266. # we assume that it was an instruction moved back.
  267. # We check that assumption though by looking at
  268. # self.code's opcode.
  269. if offset not in self.offset2inst_index:
  270. offset -= instruction_size(self.opc.EXTENDED_ARG, self.opc)
  271. assert self.code[offset] == self.opc.EXTENDED_ARG
  272. return self.insts[self.offset2inst_index[offset]]
  273. def get_target(self, offset: int, extended_arg: int = 0) -> int:
  274. """
  275. Get next instruction offset for op located at given <offset>.
  276. NOTE: extended_arg is no longer used
  277. """
  278. inst = self.get_inst(offset)
  279. if inst.opcode in self.opc.JREL_OPS | self.opc.JABS_OPS:
  280. target = inst.argval
  281. else:
  282. # No jump offset, so use fall-through offset
  283. target = next_offset(inst.opcode, self.opc, inst.offset)
  284. return target
  285. def get_argument(self, pos: int):
  286. arg = self.code[pos + 1] + self.code[pos + 2] * 256
  287. return arg
  288. def next_offset(self, op, offset: int) -> int:
  289. return xdis.next_offset(op, self.opc, offset)
  290. def first_instr(self, start: int, end: int, instr, target=None, exact=True):
  291. """
  292. Find the first <instr> in the block from start to end.
  293. <instr> is any python bytecode instruction or a list of opcodes
  294. If <instr> is an opcode with a target (like a jump), a target
  295. destination can be specified which must match precisely if exact
  296. is True, or if exact is False, the instruction which has a target
  297. closest to <target> will be returned.
  298. Return index to it or None if not found.
  299. """
  300. code = self.code
  301. assert start >= 0 and end <= len(code)
  302. if not isinstance(instr, list):
  303. instr = [instr]
  304. result_offset = None
  305. current_distance = len(code)
  306. for offset in self.op_range(start, end):
  307. op = code[offset]
  308. if op in instr:
  309. if target is None:
  310. return offset
  311. dest = self.get_target(offset)
  312. if dest == target:
  313. return offset
  314. elif not exact:
  315. new_distance = abs(target - dest)
  316. if new_distance < current_distance:
  317. current_distance = new_distance
  318. result_offset = offset
  319. return result_offset
  320. def last_instr(
  321. self, start: int, end: int, instr, target=None, exact=True
  322. ) -> Optional[int]:
  323. """
  324. Find the last <instr> in the block from start to end.
  325. <instr> is any python bytecode instruction or a list of opcodes
  326. If <instr> is an opcode with a target (like a jump), a target
  327. destination can be specified which must match precisely if exact
  328. is True, or if exact is False, the instruction which has a target
  329. closest to <target> will be returned.
  330. Return index to it or None if not found.
  331. """
  332. code = self.code
  333. # Make sure requested positions do not go out of
  334. # code bounds
  335. if not (start >= 0 and end <= len(code)):
  336. return None
  337. if not isinstance(instr, list):
  338. instr = [instr]
  339. result_offset = None
  340. current_distance = self.insts[-1].offset - self.insts[0].offset
  341. extended_arg = 0
  342. # FIXME: use self.insts rather than code[]
  343. for offset in self.op_range(start, end):
  344. op = code[offset]
  345. if op == self.opc.EXTENDED_ARG:
  346. arg = code2num(code, offset + 1) | extended_arg
  347. extended_arg = extended_arg_val(self.opc, arg)
  348. continue
  349. if op in instr:
  350. if target is None:
  351. result_offset = offset
  352. else:
  353. dest = self.get_target(offset, extended_arg)
  354. if dest == target:
  355. current_distance = 0
  356. result_offset = offset
  357. elif not exact:
  358. new_distance = abs(target - dest)
  359. if new_distance <= current_distance:
  360. current_distance = new_distance
  361. result_offset = offset
  362. pass
  363. pass
  364. pass
  365. pass
  366. extended_arg = 0
  367. pass
  368. return result_offset
  369. def inst_matches(self, start, end, instr, target=None, include_beyond_target=False):
  370. """
  371. Find all `instr` in the block from start to end.
  372. `instr` is a Python opcode or a list of opcodes
  373. If `instr` is an opcode with a target (like a jump), a target
  374. destination can be specified which must match precisely.
  375. Return a list with indexes to them or [] if none found.
  376. """
  377. try:
  378. None in instr
  379. except Exception:
  380. instr = [instr]
  381. first = self.offset2inst_index[start]
  382. result = []
  383. for inst in self.insts[first:]:
  384. if inst.opcode in instr:
  385. if target is None:
  386. result.append(inst.offset)
  387. else:
  388. t = self.get_target(inst.offset)
  389. if include_beyond_target and t >= target:
  390. result.append(inst.offset)
  391. elif t == target:
  392. result.append(inst.offset)
  393. pass
  394. pass
  395. pass
  396. if inst.offset >= end:
  397. break
  398. pass
  399. # FIXME: put in a test
  400. # check = self.all_instr(start, end, instr, target, include_beyond_target)
  401. # assert result == check
  402. return result
  403. # FIXME: this is broken on 3.6+. Replace remaining (2.x-based) calls
  404. # with inst_matches
  405. def all_instr(
  406. self, start: int, end: int, instr, target=None, include_beyond_target=False
  407. ):
  408. """
  409. Find all `instr` in the block from start to end.
  410. `instr` is any Python opcode or a list of opcodes
  411. If `instr` is an opcode with a target (like a jump), a target
  412. destination can be specified which must match precisely.
  413. Return a list with indexes to them or [] if none found.
  414. """
  415. code = self.code
  416. assert start >= 0 and end <= len(code)
  417. try:
  418. None in instr
  419. except:
  420. instr = [instr]
  421. result = []
  422. extended_arg = 0
  423. for offset in self.op_range(start, end):
  424. op = code[offset]
  425. if op == self.opc.EXTENDED_ARG:
  426. arg = code2num(code, offset + 1) | extended_arg
  427. extended_arg = extended_arg_val(self.opc, arg)
  428. continue
  429. if op in instr:
  430. if target is None:
  431. result.append(offset)
  432. else:
  433. t = self.get_target(offset, extended_arg)
  434. if include_beyond_target and t >= target:
  435. result.append(offset)
  436. elif t == target:
  437. result.append(offset)
  438. pass
  439. pass
  440. pass
  441. extended_arg = 0
  442. pass
  443. return result
  444. def opname_for_offset(self, offset):
  445. return self.opc.opname[self.code[offset]]
  446. def op_name(self, op):
  447. return self.opc.opname[op]
  448. def op_range(self, start, end):
  449. """
  450. Iterate through positions of opcodes, skipping
  451. arguments.
  452. """
  453. while start < end:
  454. yield start
  455. start += instruction_size(self.code[start], self.opc)
  456. def remove_extended_args(self, instructions):
  457. """Go through instructions removing extended ARG.
  458. get_instruction_bytes previously adjusted the operand values
  459. to account for these"""
  460. new_instructions = []
  461. last_was_extarg = False
  462. n = len(instructions)
  463. for i, inst in enumerate(instructions):
  464. if (
  465. inst.opname == "EXTENDED_ARG"
  466. and i + 1 < n
  467. and instructions[i + 1].opname != "MAKE_FUNCTION"
  468. ):
  469. last_was_extarg = True
  470. starts_line = inst.starts_line
  471. is_jump_target = inst.is_jump_target
  472. offset = inst.offset
  473. continue
  474. if last_was_extarg:
  475. # j = self.stmts.index(inst.offset)
  476. # self.lines[j] = offset
  477. new_inst = inst._replace(
  478. starts_line=starts_line,
  479. is_jump_target=is_jump_target,
  480. offset=offset,
  481. )
  482. inst = new_inst
  483. if i < n:
  484. new_prev = self.prev_op[instructions[i].offset]
  485. j = instructions[i + 1].offset
  486. old_prev = self.prev_op[j]
  487. while self.prev_op[j] == old_prev and j < n:
  488. self.prev_op[j] = new_prev
  489. j += 1
  490. last_was_extarg = False
  491. new_instructions.append(inst)
  492. return new_instructions
  493. def remove_mid_line_ifs(self, ifs):
  494. """
  495. Go through passed offsets, filtering ifs
  496. located somewhere mid-line.
  497. """
  498. # FIXME: this doesn't work for Python 3.6+
  499. filtered = []
  500. for i in ifs:
  501. # For each offset, if line number of current and next op
  502. # is the same
  503. if self.lines[i].l_no == self.lines[i + 3].l_no:
  504. # Skip last op on line if it is some sort of POP_JUMP.
  505. if self.code[self.prev[self.lines[i].next]] in (
  506. self.opc.PJIT,
  507. self.opc.PJIF,
  508. ):
  509. continue
  510. filtered.append(i)
  511. return filtered
  512. def resetTokenClass(self):
  513. return self.setTokenClass(Token)
  514. def restrict_to_parent(self, target: int, parent) -> int:
  515. """Restrict target to parent structure boundaries."""
  516. if not (parent["start"] < target < parent["end"]):
  517. target = parent["end"]
  518. return target
  519. def setTokenClass(self, tokenClass: Token) -> Token:
  520. self.Token = tokenClass
  521. return self.Token
  522. def get_scanner(version: Union[str, tuple], is_pypy=False, show_asm=None) -> Scanner:
  523. """
  524. Import the right scanner module for ``version`` and return the Scanner class
  525. in that module.
  526. """
  527. # If version is a string, turn that into the corresponding float.
  528. if isinstance(version, str):
  529. if version not in canonic_python_version:
  530. raise RuntimeError(f"Unknown Python version in xdis {version}")
  531. canonic_version = canonic_python_version[version]
  532. if canonic_version not in CANONIC2VERSION:
  533. raise RuntimeError(
  534. f"Unsupported Python version {version} (canonic {canonic_version})"
  535. )
  536. version = CANONIC2VERSION[canonic_version]
  537. # Pick up appropriate scanner
  538. if version[:2] in PYTHON_VERSIONS:
  539. v_str = version_tuple_to_str(version, start=0, end=2, delimiter="")
  540. try:
  541. import importlib
  542. if is_pypy:
  543. scan = importlib.import_module("uncompyle6.scanners.pypy%s" % v_str)
  544. else:
  545. scan = importlib.import_module("uncompyle6.scanners.scanner%s" % v_str)
  546. if False:
  547. print(scan) # Avoid unused scan
  548. except ImportError:
  549. if is_pypy:
  550. exec(
  551. "import uncompyle6.scanners.pypy%s as scan" % v_str,
  552. locals(),
  553. globals(),
  554. )
  555. else:
  556. exec(
  557. "import uncompyle6.scanners.scanner%s as scan" % v_str,
  558. locals(),
  559. globals(),
  560. )
  561. if is_pypy:
  562. scanner = eval(
  563. "scan.ScannerPyPy%s(show_asm=show_asm)" % v_str, locals(), globals()
  564. )
  565. else:
  566. scanner = eval(
  567. "scan.Scanner%s(show_asm=show_asm)" % v_str, locals(), globals()
  568. )
  569. else:
  570. raise RuntimeError(
  571. f"Unsupported Python version, {version_tuple_to_str(version)}, for decompilation"
  572. )
  573. return scanner
  574. if __name__ == "__main__":
  575. import inspect
  576. co = inspect.currentframe().f_code
  577. # scanner = get_scanner('2.7.13', True)
  578. # scanner = get_scanner(sys.version[:5], False)
  579. from xdis.version_info import PYTHON_VERSION_TRIPLE
  580. scanner = get_scanner(PYTHON_VERSION_TRIPLE, IS_PYPY, True)
  581. tokens, customize = scanner.ingest(co, {}, show_asm="after")