instruction.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. # Copyright (c) 2018-2024 by Rocky Bernstein
  2. #
  3. # This program is free software; you can redistribute it and/or
  4. # modify it under the terms of the GNU General Public License
  5. # as published by the Free Software Foundation; either version 2
  6. # of the License, or (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program; if not, write to the Free Software
  15. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16. """Python instruction class
  17. Extracted from Python 3's ``dis`` module but generalized to
  18. allow running on Python 2.
  19. """
  20. import re
  21. from typing import Any, Dict, NamedTuple, Optional, Union
  22. # _Instruction.tos_str.__doc__ = (
  23. # "If not None, a string representation of the top of the stack (TOS)"
  24. # )
  25. # # Python expressions can be straight-line, operator like-basic block code that take
  26. # # items off a stack and push a value onto the stack. In this case, in a linear scan
  27. # # we can basically build up an expression tree.
  28. # # Note this has to be the last field. Code to set this assumes this.
  29. # _Instruction.start_offset.__doc__ = (
  30. # "If not None, the offset of the first instruction feeding into the operation"
  31. # )
# Column width to which an opcode name is left-justified (``str.ljust``)
# when an instruction is formatted as a line of disassembly.
_OPNAME_WIDTH = 20
class AssembleFormat(NamedTuple):
    """
    A structure to hold the essential information
    that would be shown in a line of assembly under any
    formatting option, e.g. extended-bytes, or classic.

    Fields, in the order in which they are defined (and therefore the
    order in which they are given when constructing an object):

      is_jump_target: True if other code jumps to here,
                      'loop' if this is a loop beginning, which
                      in Python can be determined by a jump to an
                      earlier offset. Otherwise, False.
      is_current_instruction: True if we are stopped at this instruction.
      starts_line: Optional line started by this opcode (if any).
                   Otherwise None.
      offset: Start index of operation within bytecode sequence.
      opname: Human-readable name for operation.
      opcode: Numeric code for operation.
      has_arg: True if opcode takes an argument. In that case,
               ``argrepr`` describes that value. False if this opcode
               doesn't take an argument. When False, don't look at
               ``arg`` or ``argrepr``.
      arg: Optional numeric argument to operation (if any). Otherwise, None.
      argrepr: Human-readable description of operation argument.
      tos_str: If not None, a string representation of the top of the
               stack (TOS). This is obtained by scanning previous
               instructions and using information there and in their
               ``tos_str`` fields.
    """

    # True if other code jumps to here, the string "loop" if this is a loop
    # beginning, which in Python can be determined by a jump to an earlier
    # offset. Otherwise, False.
    # Note that this is a generalization of Python's "is_jump_target".
    is_jump_target: Union[bool, str]

    # True if we are stopped at this instruction.
    is_current_instruction: bool

    # Source line started by this instruction, if any. Otherwise None.
    starts_line: Optional[int]

    # Offset of the instruction
    offset: int

    # Human readable name for operation
    opname: str

    # Numeric code for operation
    opcode: int

    # True if instruction has an operand, otherwise False.
    has_arg: bool

    # Numeric operand value if operation has an operand. Otherwise, None.
    # This operand value is an index into one of the lists of a code type.
    # The exact table indexed depends on the instruction's operand type.
    arg: Optional[int]

    # Human-readable description of the operand, or None.
    argrepr: Optional[str]

    # If not None, a string representation of the top of the stack (TOS)
    tos_str: Optional[str] = None
  81. class Instruction(NamedTuple):
  82. """Details for a bytecode operation
  83. The order of the fields below follows roughly how the values might be displayed
  84. in an assembly listing.
  85. is_jump_target: True if other code jumps to here,
  86. 'loop' if this is a loop beginning, which
  87. in Python can be determined jump to an earlier offset.
  88. Otherwise, False.
  89. starts_line: Optional Line started by this opcode (if any). Otherwise None.
  90. offset: Start index of operation within bytecode sequence.
  91. opname: human-readable name for operation.
  92. opcode: numeric code for operation.
  93. has_arg: True if opcode takes an argument. In that case,
  94. ``argval`` and ``argepr`` will have that value. False
  95. if this opcode doesn't take an argument. When False,
  96. don't look at ``argval`` or ``argrepr``.
  97. arg: Optional numeric argument to operation (if any). Otherwise, None.
  98. argval: resolved arg value (if known). Otherwise, the same as ``arg``.
  99. argrepr: human-readable description of operation argument.
  100. positions: Optional dis.Positions object holding the start and end locations that
  101. are covered by this instruction. This not implemented yet.
  102. optype: Opcode classification. One of:
  103. "compare", "const", "free", "jabs", "jrel", "local",
  104. "name", or "nargs".
  105. inst_size: number of bytes the instruction occupies
  106. has_extended_arg: True if the instruction was built from EXTENDED_ARG
  107. opcodes.
  108. fallthrough: True if the instruction can (not must) fall through to the next
  109. instruction. Note conditionals are in this category, but
  110. returns, raise, and unconditional jumps are not.
  111. Note: the following fields have to appear in the order below and be at the end.
  112. disassembly may replace (delete and insert) an instruction, and it assumes
  113. the ending fields are as follows:
  114. tos_str: If not None, a string representation of the top of the stack (TOS).
  115. This is obtained by scanning previous instructions and
  116. using information there and in their ``tos_str`` fields.
  117. start_offset: if not None the instruction with the lowest offset that
  118. pushes a stack entry that is consume by this opcode
  119. """
  120. # Numeric code for operation
  121. opcode: int
  122. # Human readable name for operation
  123. opname: str
  124. # Numeric operand value if operation has an operand, otherwise None.
  125. # This operand value is an index into one of the lists of a code type.
  126. # The exact table indexed depends on optype.
  127. arg: Optional[int]
  128. # Resolved operand value (if known). This is obtained indexing the appropriate list
  129. # indicated by optype using value arg.
  130. # If for some reason we can't extract a value this way ``argval`` has value as
  131. # ``arg``.
  132. argval: Any
  133. # String representation of argval if argval is not None.
  134. argrepr: Optional[str]
  135. # Offset of the instruction
  136. offset: int
  137. starts_line: Optional[int]
  138. # True if other code jumps to here, the string "loop" if this is a loop
  139. # beginning, which in Python can be determined jump to an earlier
  140. # offset. Otherwise, False.
  141. # Note that this is a generalization of Python's "is_jump_target".
  142. is_jump_target: Union[bool, str]
  143. # dis.Positions object holding the start and end locations that
  144. # are covered by this instruction.
  145. # FIXME: Implement. The below is just a placeholder.
  146. #
  147. positions: Optional[Any]
  148. # The following values are our own extended information not found (yet) #
  149. # in Python's Instruction structure. #
  150. # First, values which can be computed or derived from the above,
  151. # along with an opcode structure. These We add these in an
  152. # instruction to make the instruction self sufficient.
  153. # opcode classification. One of:
  154. # compare, const, free, jabs, jrel, local, name, nargs
  155. optype: str
  156. # True if instruction has an operand, otherwise False.
  157. has_arg: bool
  158. # The number of bytes this instruction consumes.
  159. inst_size: int
  160. # True there were EXTENDED_ARG opcodes before this, otherwise False
  161. has_extended_arg: Optional[bool] = None
  162. # True if the instruction can (not must) fall through to the next
  163. # instruction. Note conditionals are in this category, but
  164. # returns, raise, and unconditional jumps are not.
  165. fallthrough: Optional[bool] = None
  166. # If not None, a string representation of the top of the stack (TOS)
  167. tos_str: Optional[str] = None
  168. # Python expressions can be straight-line, operator like-basic block code that take
  169. # items off a stack and push a value onto the stack. In this case, in a linear scan
  170. # we can basically build up an expression tree.
  171. # Note this has to be the last field. Code to set this assumes this.
  172. start_offset: Optional[int] = None
  173. def disassemble(
  174. self,
  175. opc,
  176. line_starts: Optional[Dict[int, int]] = None,
  177. lineno_width: int=3,
  178. mark_as_current: bool=False,
  179. asm_format: str="classic",
  180. instructions=[],
  181. ) -> str:
  182. """
  183. Format instruction details for inclusion in disassembly output.
  184. ``line_starts`` when it exists is a dictionary mapping a bytecode offsets to
  185. line numbers.
  186. ``lineno_width`` sets the width of the line number field (0 omits it)
  187. ``mark_as_current`` inserts a '-->' marker arrow as part of the line.
  188. """
  189. fields = []
  190. indexed_operand = frozenset(["name", "local", "compare", "free"])
  191. opcode = self.opcode
  192. # Column: Source code line number
  193. if lineno_width:
  194. if self.starts_line is not None:
  195. if asm_format == "asm":
  196. lineno_fmt = "%%%dd:\n" % lineno_width
  197. fields.append(lineno_fmt % self.starts_line)
  198. fields.append(" " * lineno_width)
  199. if self.is_jump_target:
  200. fields.append(" " * (lineno_width - 1))
  201. else:
  202. lineno_fmt = "%%%dd:" % lineno_width
  203. fields.append(lineno_fmt % self.starts_line)
  204. else:
  205. fields.append(" " * (lineno_width + 1))
  206. # Column: Current instruction indicator
  207. if mark_as_current and asm_format != "asm":
  208. fields.append("-->")
  209. else:
  210. fields.append(" ")
  211. # Column: Jump target marker
  212. if self.is_jump_target:
  213. if asm_format != "asm":
  214. fields.append(">>")
  215. else:
  216. fields = ["L%d:\n" % self.offset] + fields
  217. if not self.starts_line:
  218. fields.append(" ")
  219. else:
  220. fields.append(" ")
  221. # Column: Instruction offset from start of code sequence
  222. if asm_format != "asm":
  223. fields.append(repr(self.offset).rjust(4))
  224. # Column: Instruction bytes
  225. if asm_format in ("extended-bytes", "bytes"):
  226. hex_bytecode = "|%02x" % opcode
  227. if self.inst_size == 1:
  228. # Not 3.6 or later
  229. hex_bytecode += " " * (2 * 3)
  230. if self.inst_size == 2:
  231. # Must be Python 3.6 or later
  232. if self.has_arg and self.arg is not None:
  233. hex_bytecode += " %02x" % (self.arg % 256)
  234. else:
  235. hex_bytecode += " 00"
  236. elif self.inst_size == 3 and self.arg is not None:
  237. # Not 3.6 or later
  238. hex_bytecode += " %02x %02x" % divmod(self.arg, 256)
  239. fields.append(hex_bytecode + "|")
  240. # Column: Opcode name
  241. fields.append(self.opname.ljust(_OPNAME_WIDTH))
  242. # Column: Opcode argument
  243. if self.arg is not None:
  244. argrepr = self.argrepr
  245. # The ``argrepr`` value when the instruction was created
  246. # generally has all the information we require. However,
  247. # for "asm" format, want additional explicit information
  248. # linking operands to tables.
  249. if asm_format == "asm":
  250. if self.is_jump() and self.argrepr is not None:
  251. assert self.argrepr.startswith("to ")
  252. jump_target = self.argrepr[len("to ") :]
  253. fields.append("L" + jump_target)
  254. elif self.optype in indexed_operand:
  255. fields.append(repr(self.arg))
  256. fields.append("(%s)" % argrepr)
  257. argrepr = None
  258. elif (
  259. self.optype == "const"
  260. and argrepr is not None
  261. and not re.search(r"\s", argrepr)
  262. ):
  263. fields.append(repr(self.arg))
  264. fields.append("(%s)" % argrepr)
  265. argrepr = None
  266. else:
  267. fields.append(repr(self.arg))
  268. elif asm_format in ("extended", "extended-bytes"):
  269. if (
  270. self.is_jump()
  271. and line_starts is not None
  272. and line_starts.get(self.argval) is not None
  273. ):
  274. new_instruction = list(self)
  275. new_instruction[-2] = f"To line {line_starts[self.argval]}"
  276. # Here and below we use self.__class__ instead of Instruction
  277. # so that other kinds of compatible namedtuple Instructions
  278. # can be used. In particular, the control-flow project
  279. # defines such an ExtendedInstruction namedtuple
  280. self = self.__class__(*new_instruction)
  281. del instructions[-1]
  282. instructions.append(self)
  283. elif (
  284. hasattr(opc, "opcode_extended_fmt")
  285. and self.opname in opc.opcode_extended_fmt
  286. ):
  287. new_repr = opc.opcode_extended_fmt.get(self.opname, lambda opc, instr: None)(
  288. opc, list(reversed(instructions))
  289. )
  290. start_offset = None
  291. if isinstance(new_repr, tuple) and len(new_repr) == 2:
  292. new_repr, start_offset = new_repr
  293. if new_repr:
  294. # Add tos_str info to tos_str field of instruction.
  295. # This the last field in instruction.
  296. new_instruction = list(self)
  297. new_instruction[-1] = start_offset
  298. new_instruction[-2] = new_repr
  299. del instructions[-1]
  300. # See comment above abut the use of self.__class__
  301. self = self.__class__(*new_instruction)
  302. instructions.append(self)
  303. argrepr = new_repr
  304. elif opcode in opc.nullaryloadop:
  305. new_instruction = list(self)
  306. new_instruction[-2] = self.argrepr
  307. start_offset = new_instruction[-1] = self.offset
  308. del instructions[-1]
  309. # See comment above abut the use of self.__class__
  310. self = self.__class__(*new_instruction)
  311. instructions.append(self)
  312. pass
  313. if not argrepr:
  314. if asm_format != "asm" or self.opname == "MAKE_FUNCTION":
  315. fields.append(repr(self.arg))
  316. pass
  317. else:
  318. # Column: Opcode argument details
  319. if len(instructions) > 0:
  320. argval = self.argval
  321. if self.tos_str is None:
  322. fields.append(f"({self.argrepr})")
  323. else:
  324. if self.optype in ("vargs", "encoded_arg"):
  325. prefix = f"{self.argval} ; "
  326. elif self.argrepr is None:
  327. prefix = ""
  328. else:
  329. prefix = f"({self.argrepr}) ; "
  330. if opcode in opc.operator_set | opc.callop:
  331. prefix += "TOS = "
  332. fields.append(f"{prefix}{self.tos_str}")
  333. pass
  334. elif self.argrepr is not None:
  335. fields.append(self.argrepr)
  336. pass
  337. pass
  338. elif asm_format in ("extended", "extended-bytes"):
  339. if (
  340. hasattr(opc, "opcode_extended_fmt")
  341. and self.opname in opc.opcode_extended_fmt
  342. ):
  343. new_repr, start_offset = opc.opcode_extended_fmt.get(self.opname, (None, 0))(
  344. opc, list(reversed(instructions))
  345. )
  346. if new_repr:
  347. new_instruction = list(self)
  348. new_instruction[-2] = new_repr
  349. new_instruction[-1] = start_offset
  350. del instructions[-1]
  351. # See comment above abut the use of self.__class__
  352. instructions.append(self.__class__(*new_instruction))
  353. argval = self.argval
  354. prefix = "" if argval is None else f"({argval}) | "
  355. if self.opcode in opc.operator_set:
  356. prefix += "TOS = "
  357. fields.append(f"{prefix}{new_repr}")
  358. pass
  359. elif (
  360. hasattr(opc, "opcode_arg_fmt")
  361. and self.opname in opc.opcode_arg_fmt
  362. ) and self.argrepr is not None:
  363. fields.append(self.argrepr)
  364. pass
  365. pass
  366. return " ".join(fields).rstrip()
  367. def format_to_assembly_line(
  368. self,
  369. current_instruction_line,
  370. ) -> AssembleFormat:
  371. """
  372. Format instruction into a structure that can be easily
  373. turned a structure contains the essential information
  374. that would be shown as a line in an assembly listing
  375. """
  376. return AssembleFormat(
  377. self.is_jump_target,
  378. current_instruction_line == self.starts_line,
  379. self.starts_line,
  380. self.offset,
  381. self.opname,
  382. self.opcode,
  383. self.has_arg,
  384. self.arg,
  385. self.argrepr,
  386. self.tos_str,
  387. )
  388. def is_jump(self) -> bool:
  389. """
  390. Return True if instruction is some sort of jump.
  391. """
  392. return self.optype in ("jabs", "jrel")
  393. def jumps_forward(self) -> bool:
  394. """
  395. Return True if instruction is jump backwards
  396. """
  397. return self.is_jump() and self.offset < self.argval
  398. # if __name__ == '__main__':
  399. # pass