scanner.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  1. # Copyright (c) 2016, 2018-2021, 2024-2025 by Rocky Bernstein
  2. # Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
  3. # Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
  4. # Copyright (c) 1999 John Aycock
  5. #
  6. # This program is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. #
  19. """
  20. scanner/ingestion module. From here we call various version-specific
  21. scanners, e.g. for Python 3.7 or 3.8.
  22. """
  23. import importlib
  24. from abc import ABC
  25. from array import array
  26. from collections import namedtuple
  27. from types import ModuleType
  28. from typing import Optional, Union
  29. import xdis
  30. from xdis import (
  31. Bytecode,
  32. canonic_python_version,
  33. code2num,
  34. extended_arg_val,
  35. instruction_size,
  36. next_offset,
  37. )
  38. from xdis.version_info import IS_PYPY, version_tuple_to_str
  39. from decompyle3.scanners.tok import Token
  40. # The byte code versions we support.
  41. # Note: these all have to be tuples
  42. PYTHON_VERSIONS = frozenset(((3, 7), (3, 8)))
  43. CANONIC2VERSION = dict(
  44. (canonic_python_version[version_tuple_to_str(python_version)], python_version)
  45. for python_version in PYTHON_VERSIONS
  46. )
  47. L65536 = 65536
  48. def long(num):
  49. return num
  50. class Code:
  51. """
  52. Class for representing code-objects.
  53. This is similar to the original code object, but additionally
  54. the diassembled code is stored in the attribute '_tokens'.
  55. """
  56. def __init__(self, co, scanner, classname=None, show_asm=None):
  57. # Full initialization is given below, but for linters
  58. # well set up some initial values.
  59. self.co_code = None # Really either bytes for >= 3.0 and string in < 3.0
  60. for i in dir(co):
  61. if i.startswith("co_"):
  62. setattr(self, i, getattr(co, i))
  63. self._tokens, self._customize = scanner.ingest(co, classname, show_asm=show_asm)
  64. class Scanner(ABC):
  65. def __init__(self, version: tuple, show_asm=None, is_pypy=False):
  66. self.version = version
  67. self.show_asm = show_asm
  68. self.is_pypy = is_pypy
  69. # Temporary initialization.
  70. self.opc = ModuleType("uninitialized")
  71. if version[:2] in PYTHON_VERSIONS:
  72. v_str = f"""opcode_{version_tuple_to_str(version, start=0, end=2, delimiter="")}"""
  73. module_name = f"xdis.opcodes.{v_str}"
  74. if is_pypy:
  75. module_name += "pypy"
  76. self.opc = importlib.import_module(module_name)
  77. else:
  78. raise TypeError(
  79. f"{version_tuple_to_str(version)} is not a Python version I know about"
  80. )
  81. self.opname = self.opc.opname
  82. # FIXME: This weird Python2 behavior is not Python3
  83. self.resetTokenClass()
  84. def build_instructions(self, co):
  85. """
  86. Create a list of instructions (a structured object rather than
  87. an array of bytes) and store that in self.insts
  88. """
  89. # FIXME: remove this when all subsidiary functions have been removed.
  90. # We should be able to get everything from the self.insts list.
  91. self.code = array("B", co.co_code)
  92. bytecode = Bytecode(co, self.opc)
  93. self.build_prev_op()
  94. self.insts = self.remove_extended_args(list(bytecode))
  95. self.lines = self.build_lines_data(co)
  96. self.offset2inst_index = {}
  97. for i, inst in enumerate(self.insts):
  98. self.offset2inst_index[inst.offset] = i
  99. offset = inst.offset
  100. inst_size = inst.inst_size
  101. while inst_size > 0:
  102. self.offset2inst_index[offset] = i
  103. offset += 2
  104. inst_size -= 2
  105. return bytecode
  106. def build_lines_data(self, code_obj):
  107. """
  108. Generate various line-related helper data.
  109. """
  110. # Offset: lineno pairs, only for offsets which start line.
  111. # Locally we use list for more convenient iteration using indices
  112. linestarts = list(self.opc.findlinestarts(code_obj))
  113. self.linestarts = dict(linestarts)
  114. if not self.linestarts:
  115. return []
  116. # 'List-map' which shows line number of current op and offset of
  117. # first op on following line, given offset of op as index
  118. lines = []
  119. LineTuple = namedtuple("LineTuple", ["l_no", "next"])
  120. # Iterate through available linestarts, and fill
  121. # the data for all code offsets encountered until
  122. # last linestart offset
  123. _, prev_line_no = linestarts[0]
  124. offset = 0
  125. for start_offset, line_no in linestarts[1:]:
  126. while offset < start_offset:
  127. lines.append(LineTuple(prev_line_no, start_offset))
  128. offset += 1
  129. prev_line_no = line_no
  130. # Fill remaining offsets with reference to last line number
  131. # and code length as start offset of following non-existing line
  132. codelen = len(self.code)
  133. while offset < codelen:
  134. lines.append(LineTuple(prev_line_no, codelen))
  135. offset += 1
  136. return lines
  137. def build_prev_op(self):
  138. """
  139. Compose 'list-map' which allows to jump to previous
  140. op, given offset of current op as index.
  141. """
  142. code = self.code
  143. codelen = len(code)
  144. # 2.x uses prev 3.x uses prev_op. Sigh
  145. # Until we get this sorted out.
  146. self.prev = self.prev_op = [0]
  147. for offset in self.op_range(0, codelen):
  148. op = code[offset]
  149. for _ in range(instruction_size(op, self.opc)):
  150. self.prev_op.append(offset)
  151. def is_jump_forward(self, offset: int) -> bool:
  152. """
  153. Return True if the code at offset is some sort of jump forward.
  154. That is, it is ether "JUMP_FORWARD" or an absolute jump that
  155. goes forward.
  156. """
  157. opname = self.get_inst(offset).opname
  158. if opname == "JUMP_FORWARD":
  159. return True
  160. if opname != "JUMP_ABSOLUTE":
  161. return False
  162. return offset < self.get_target(offset)
  163. def ingest(self, co, classname=None, code_objects={}, show_asm=None):
  164. """
  165. Code to tokenize disassembly. Subclasses must implement this.
  166. """
  167. raise NotImplementedError("This method should have been implemented")
  168. def prev_offset(self, offset: int) -> int:
  169. return self.insts[self.offset2inst_index[offset] - 1].offset
    def get_inst(self, offset: int):
        """
        Return the instruction from ``self.insts`` that is found at offset
        ``offset``.

        Instructions can get moved as a result of ``EXTENDED_ARG`` removal.
        So if ``offset`` is not in ``self.offset2inst_index``, then
        we assume that it was an instruction moved back.
        We check that assumption though by looking at
        ``self.code``'s opcode.

        Sadly instructions can get moved forward too.
        So we have to check which direction we are going.
        """
        # Probe neighboring offsets in steps of one EXTENDED_ARG instruction.
        offset_increment = instruction_size(self.opc.EXTENDED_ARG, self.opc)

        if offset not in self.offset2inst_index:
            if self.code[offset] != self.opc.EXTENDED_ARG:
                # The raw bytecode at ``offset`` is a regular opcode:
                # search backwards for the instruction that now stands for it.
                target_name = self.opc.opname[self.code[offset]]

                # JUMP_ABSOLUTE can be like this where
                # the inst offset is at what used to be an EXTENDED_ARG
                # so find the first extended arg.
                next_offset = offset - offset_increment
                while next_offset not in self.offset2inst_index:
                    next_offset -= offset_increment
                    # NOTE(review): this assert is taken to be inside the
                    # loop; confirm against the original indentation.
                    assert self.code[next_offset] == self.opc.EXTENDED_ARG
                inst = self.insts[self.offset2inst_index[next_offset]]
                # The instruction found must carry the opcode name seen
                # at ``offset`` in the raw bytecode.
                assert inst.opname == target_name, inst
            else:
                # The raw bytecode at ``offset`` is an EXTENDED_ARG:
                # search forwards for the next indexed instruction.
                next_offset = offset + offset_increment
                while next_offset not in self.offset2inst_index:
                    next_offset += offset_increment
                inst = self.insts[self.offset2inst_index[next_offset]]
                assert inst.has_extended_arg is True
            return inst

        # Common case: ``offset`` is indexed directly.
        return self.insts[self.offset2inst_index[offset]]
  203. def get_target(self, offset: int, extended_arg: int = 0) -> int:
  204. """
  205. Get next instruction offset for op located at given <offset>.
  206. NOTE: extended_arg is no longer used
  207. """
  208. inst = self.get_inst(offset)
  209. if inst.opcode in self.opc.JREL_OPS | self.opc.JABS_OPS:
  210. target = inst.argval
  211. else:
  212. # No jump offset, so use fall-through offset
  213. target = next_offset(inst.opcode, self.opc, inst.offset)
  214. return target
  215. def get_argument(self, pos: int):
  216. arg = self.code[pos + 1] + self.code[pos + 2] * 256
  217. return arg
  218. def next_offset(self, op, offset: int) -> int:
  219. return xdis.next_offset(op, self.opc, offset)
  220. def first_instr(self, start: int, end: int, instr, target=None, exact=True):
  221. """
  222. Find the first <instr> in the block from start to end.
  223. <instr> is any python bytecode instruction or a list of opcodes
  224. If <instr> is an opcode with a target (like a jump), a target
  225. destination can be specified which must match precisely if exact
  226. is True, or if exact is False, the instruction which has a target
  227. closest to <target> will be returned.
  228. Return index to it or None if not found.
  229. """
  230. code = self.code
  231. assert start >= 0 and end <= len(code)
  232. if not isinstance(instr, list):
  233. instr = [instr]
  234. result_offset = None
  235. current_distance = len(code)
  236. for offset in self.op_range(start, end):
  237. op = code[offset]
  238. if op in instr:
  239. if target is None:
  240. return offset
  241. dest = self.get_target(offset)
  242. if dest == target:
  243. return offset
  244. elif not exact:
  245. new_distance = abs(target - dest)
  246. if new_distance < current_distance:
  247. current_distance = new_distance
  248. result_offset = offset
  249. return result_offset
  250. def last_instr(
  251. self, start: int, end: int, instr, target=None, exact=True
  252. ) -> Optional[int]:
  253. """
  254. Find the last <instr> in the block from start to end.
  255. <instr> is any python bytecode instruction or a list of opcodes
  256. If <instr> is an opcode with a target (like a jump), a target
  257. destination can be specified which must match precisely if exact
  258. is True, or if exact is False, the instruction which has a target
  259. closest to <target> will be returned.
  260. Return index to it or None if not found.
  261. """
  262. code = self.code
  263. # Make sure requested positions do not go out of
  264. # code bounds
  265. if not (start >= 0 and end <= len(code)):
  266. return None
  267. if not isinstance(instr, list):
  268. instr = [instr]
  269. result_offset = None
  270. current_distance = self.insts[-1].offset - self.insts[0].offset
  271. extended_arg = 0
  272. # FIXME: use self.insts rather than code[]
  273. for offset in self.op_range(start, end):
  274. op = code[offset]
  275. if op == self.opc.EXTENDED_ARG:
  276. arg = code2num(code, offset + 1) | extended_arg
  277. extended_arg = extended_arg_val(self.opc, arg)
  278. continue
  279. if op in instr:
  280. if target is None:
  281. result_offset = offset
  282. else:
  283. dest = self.get_target(offset, extended_arg)
  284. if dest == target:
  285. current_distance = 0
  286. result_offset = offset
  287. elif not exact:
  288. new_distance = abs(target - dest)
  289. if new_distance <= current_distance:
  290. current_distance = new_distance
  291. result_offset = offset
  292. pass
  293. pass
  294. pass
  295. pass
  296. extended_arg = 0
  297. pass
  298. return result_offset
  299. def inst_matches(self, start, end, instr, target=None, include_beyond_target=False):
  300. """
  301. Find all `instr` in the block from start to end.
  302. `instr` is a Python opcode or a list of opcodes
  303. If `instr` is an opcode with a target (like a jump), a target
  304. destination can be specified which must match precisely.
  305. Return a list with indexes to them or [] if none found.
  306. """
  307. try:
  308. None in instr
  309. except Exception:
  310. instr = [instr]
  311. first = self.offset2inst_index[start]
  312. result = []
  313. for inst in self.insts[first:]:
  314. if inst.opcode in instr:
  315. if target is None:
  316. result.append(inst.offset)
  317. else:
  318. t = self.get_target(inst.offset)
  319. if include_beyond_target and t >= target:
  320. result.append(inst.offset)
  321. elif t == target:
  322. result.append(inst.offset)
  323. pass
  324. pass
  325. pass
  326. if isinstance(inst.offset, int) and inst.offset >= end:
  327. break
  328. pass
  329. # FIXME: put in a test
  330. # check = self.all_instr(start, end, instr, target, include_beyond_target)
  331. # assert result == check
  332. return result
  333. # FIXME: this is broken on 3.6+. Replace remaining (2.x-based) calls
  334. # with inst_matches
  335. def all_instr(
  336. self, start: int, end: int, instr, target=None, include_beyond_target=False
  337. ):
  338. """
  339. Find all `instr` in the block from start to end.
  340. `instr` is any Python opcode or a list of opcodes
  341. If `instr` is an opcode with a target (like a jump), a target
  342. destination can be specified which must match precisely.
  343. Return a list with indexes to them or [] if none found.
  344. """
  345. code = self.code
  346. assert start >= 0 and end <= len(code)
  347. if not isinstance(instr, list):
  348. instr = [instr]
  349. result = []
  350. extended_arg = 0
  351. for offset in self.op_range(start, end):
  352. op = code[offset]
  353. if op == self.opc.EXTENDED_ARG:
  354. arg = code2num(code, offset + 1) | extended_arg
  355. extended_arg = extended_arg_val(self.opc, arg)
  356. continue
  357. if op in instr:
  358. if target is None:
  359. result.append(offset)
  360. else:
  361. t = self.get_target(offset, extended_arg)
  362. if include_beyond_target and t >= target:
  363. result.append(offset)
  364. elif t == target:
  365. result.append(offset)
  366. pass
  367. pass
  368. pass
  369. extended_arg = 0
  370. pass
  371. return result
  372. def opname_for_offset(self, offset):
  373. return self.opc.opname[self.code[offset]]
  374. def op_name(self, op):
  375. return self.opc.opname[op]
  376. def op_range(self, start, end):
  377. """
  378. Iterate through positions of opcodes, skipping
  379. arguments.
  380. """
  381. while start < end:
  382. yield start
  383. start += instruction_size(self.code[start], self.opc)
  384. def remove_extended_args(self, instructions):
  385. """Go through instructions removing extended ARG.
  386. get_instruction_bytes previously adjusted the operand values
  387. to account for these"""
  388. new_instructions = []
  389. last_was_extarg = False
  390. n = len(instructions)
  391. starts_line = False
  392. for i, inst in enumerate(instructions):
  393. if (
  394. inst.opname == "EXTENDED_ARG"
  395. and i + 1 < n
  396. and instructions[i + 1].opname != "MAKE_FUNCTION"
  397. ):
  398. last_was_extarg = True
  399. starts_line = inst.starts_line
  400. is_jump_target = inst.is_jump_target
  401. offset = inst.offset
  402. continue
  403. if last_was_extarg:
  404. # j = self.stmts.index(inst.offset)
  405. # self.lines[j] = offset
  406. new_inst = inst._replace(
  407. starts_line=starts_line,
  408. is_jump_target=is_jump_target,
  409. offset=offset,
  410. )
  411. inst = new_inst
  412. if i < n:
  413. new_prev = self.prev_op[instructions[i].offset]
  414. j = instructions[i + 1].offset
  415. old_prev = self.prev_op[j]
  416. while self.prev_op[j] == old_prev and j < n:
  417. self.prev_op[j] = new_prev
  418. j += 1
  419. last_was_extarg = False
  420. new_instructions.append(inst)
  421. return new_instructions
  422. def remove_mid_line_ifs(self, ifs):
  423. """
  424. Go through passed offsets, filtering ifs
  425. located somewhere mid-line.
  426. """
  427. # FIXME: this doesn't work for Python 3.6+
  428. filtered = []
  429. for i in ifs:
  430. # For each offset, if line number of current and next op
  431. # is the same
  432. if self.lines[i].l_no == self.lines[i + 3].l_no:
  433. # Skip last op on line if it is some sort of POP_JUMP.
  434. if self.code[self.prev[self.lines[i].next]] in (
  435. self.opc.PJIT,
  436. self.opc.PJIF,
  437. ):
  438. continue
  439. filtered.append(i)
  440. return filtered
  441. def resetTokenClass(self):
  442. return self.setTokenClass(Token)
  443. def restrict_to_parent(self, target: int, parent) -> int:
  444. """Restrict target to parent structure boundaries."""
  445. if not (parent["start"] < target < parent["end"]):
  446. target = parent["end"]
  447. return target
  448. def setTokenClass(self, token_class: Token) -> Token:
  449. self.Token = token_class
  450. return self.Token
  451. def get_scanner(version: Union[str, tuple], is_pypy=False, show_asm=None) -> Scanner:
  452. # If version is a string, turn that into the corresponding float.
  453. if isinstance(version, str):
  454. if version not in canonic_python_version:
  455. raise RuntimeError(f"Unknown Python version in xdis {version}")
  456. canonic_version = canonic_python_version[version]
  457. if canonic_version not in CANONIC2VERSION:
  458. raise RuntimeError(
  459. f"Unsupported Python version {version} (canonic {canonic_version})"
  460. )
  461. version = CANONIC2VERSION[canonic_version]
  462. # Pick up appropriate scanner
  463. if version[:2] in PYTHON_VERSIONS:
  464. v_str = version_tuple_to_str(version, start=0, end=2, delimiter="")
  465. try:
  466. import importlib
  467. if is_pypy:
  468. scan = importlib.import_module(f"decompyle3.scanners.pypy{v_str}")
  469. else:
  470. scan = importlib.import_module(f"decompyle3.scanners.scanner{v_str}")
  471. if False:
  472. print(scan) # Avoid unused scan
  473. except ImportError:
  474. if is_pypy:
  475. exec(
  476. f"import decompyle3.scanners.pypy{v_str} as scan",
  477. locals(),
  478. globals(),
  479. )
  480. else:
  481. exec(
  482. f"import decompyle3.scanners.scanner{v_str} as scan",
  483. locals(),
  484. globals(),
  485. )
  486. if is_pypy:
  487. scanner = eval(
  488. f"scan.ScannerPyPy{v_str}(show_asm=show_asm)", locals(), globals()
  489. )
  490. else:
  491. scanner = eval(
  492. f"scan.Scanner{v_str}(show_asm=show_asm)", locals(), globals()
  493. )
  494. else:
  495. raise RuntimeError(
  496. "Unsupported Python version, "
  497. f"{version_tuple_to_str(version)}, for decompilation"
  498. )
  499. return scanner
  500. if __name__ == "__main__":
  501. import inspect
  502. my_co = inspect.currentframe().f_code
  503. from xdis.version_info import PYTHON_VERSION_TRIPLE
  504. scanner = get_scanner(PYTHON_VERSION_TRIPLE, IS_PYPY, True)
  505. tokens, customize = scanner.ingest(my_co, {}, show_asm="after")