# Copyright (c) 2018-2024 by Rocky Bernstein
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Python instruction class

Extracted from Python 3's ``dis`` module but generalized to
allow running on Python 2.
"""

import re
from typing import Any, Dict, NamedTuple, Optional, Union

# _Instruction.tos_str.__doc__ = (
#     "If not None, a string representation of the top of the stack (TOS)"
# )
# # Python expressions can be straight-line, operator like-basic block code that take
# # items off a stack and push a value onto the stack. In this case, in a linear scan
# # we can basically build up an expression tree.
# # Note this has to be the last field. Code to set this assumes this.
# _Instruction.start_offset.__doc__ = (
#     "If not None, the offset of the first instruction feeding into the operation"
# )

# Width that the opcode-name column is padded to in a disassembly line.
_OPNAME_WIDTH = 20

# Operand types for which the "asm" format prints both the numeric operand
# and its parenthesized representation.
_INDEXED_OPERAND_TYPES = frozenset(["name", "local", "compare", "free"])


def _with_tos_fields(instruction, tos_str, start_offset):
    """Return a copy of ``instruction`` with its last two fields replaced.

    The fields are set by position (``tos_str`` second to last,
    ``start_offset`` last) and the copy is built with
    ``instruction.__class__`` rather than ``Instruction`` so that other
    compatible namedtuple instruction classes can be used as well.
    """
    values = list(instruction)
    values[-2] = tos_str
    values[-1] = start_offset
    return instruction.__class__(*values)


def _replace_last(instructions, instruction) -> None:
    """Replace the final entry of ``instructions`` with ``instruction``.

    Nothing is done when ``instructions`` is empty; there is no entry to
    replace in that case.
    """
    if instructions:
        instructions[-1] = instruction


def _extended_repr(opc, opname, instructions):
    """Call the extended formatter registered in ``opc`` for ``opname``.

    The formatter receives ``opc`` and the instructions seen so far, most
    recent first. It may return either a representation alone or a
    ``(representation, start_offset)`` pair; this always returns a pair,
    using ``None`` for a missing start offset.
    """
    result = opc.opcode_extended_fmt[opname](opc, list(reversed(instructions)))
    if isinstance(result, tuple) and len(result) == 2:
        return result
    return result, None


class AssembleFormat(NamedTuple):
    """
    A structure to hold the essential information that would be shown
    in a line of assembly under any formatting option, e.g. extended-bytes,
    or classic.

    Fields, in the order in which they are given when constructing an object:

      is_jump_target: True if other code jumps to here,
                      'loop' if this is a loop beginning, which
                      in Python can be determined by a jump to an earlier offset.
                      Otherwise, False.
      is_current_instruction: True if we are stopped at this instruction.
      starts_line: Optional Line started by this opcode (if any). Otherwise None.
      offset: Start index of operation within bytecode sequence.
      opname: human-readable name for operation.
      opcode: numeric code for operation.
      has_arg: True if opcode takes an argument. In that case,
               ``argrepr`` will have that value. False
               if this opcode doesn't take an argument. When False,
               don't look at ``argrepr``.
      arg: Optional numeric argument to operation (if any). Otherwise, None.
      argrepr: human-readable description of operation argument.
      tos_str: If not None, a string representation of the top of the stack (TOS).
               This is obtained by scanning previous instructions and
               using information there and in their ``tos_str`` fields.
    """

    # True if other code jumps to here, the string "loop" if this is a loop
    # beginning, which in Python can be determined by a jump to an earlier
    # offset. Otherwise, False.
    # Note that this is a generalization of Python's "is_jump_target".
    is_jump_target: Union[bool, str]

    is_current_instruction: bool

    starts_line: Optional[int]

    # Offset of the instruction
    offset: int

    # Human readable name for operation
    opname: str

    # Numeric code for operation
    opcode: int

    # True if instruction has an operand, otherwise False.
    has_arg: bool

    # Numeric operand value if operation has an operand. Otherwise, None.
    # This operand value is an index into one of the lists of a code type.
    # The exact table indexed depends on optype.
    arg: Optional[int]

    # String representation of argval if argval is not None.
    argrepr: Optional[str]

    # If not None, a string representation of the top of the stack (TOS)
    tos_str: Optional[str] = None


class Instruction(NamedTuple):
    """Details for a bytecode operation

    The order of the fields below follows roughly how the values might
    be displayed in an assembly listing.

      is_jump_target: True if other code jumps to here,
                      'loop' if this is a loop beginning, which
                      in Python can be determined by a jump to an earlier offset.
                      Otherwise, False.
      starts_line: Optional Line started by this opcode (if any). Otherwise None.
      offset: Start index of operation within bytecode sequence.
      opname: human-readable name for operation.
      opcode: numeric code for operation.
      has_arg: True if opcode takes an argument. In that case,
               ``argval`` and ``argrepr`` will have that value. False
               if this opcode doesn't take an argument. When False,
               don't look at ``argval`` or ``argrepr``.
      arg: Optional numeric argument to operation (if any). Otherwise, None.
      argval: resolved arg value (if known). Otherwise, the same as ``arg``.
      argrepr: human-readable description of operation argument.
      positions: Optional dis.Positions object holding the start and end locations that
                 are covered by this instruction. This is not implemented yet.
      optype: Opcode classification. One of:
              "compare", "const", "free", "jabs", "jrel", "local", "name", or "nargs".
      inst_size: number of bytes the instruction occupies
      has_extended_arg: True if the instruction was built from EXTENDED_ARG opcodes.
      fallthrough: True if the instruction can (not must) fall through to the next
                   instruction. Note conditionals are in this category, but
                   returns, raise, and unconditional jumps are not.

    Note: the following fields have to appear in the order below and be at
    the end. Disassembly may replace (delete and insert) an instruction, and
    it assumes the ending fields are as follows:

      tos_str: If not None, a string representation of the top of the stack (TOS).
               This is obtained by scanning previous instructions and
               using information there and in their ``tos_str`` fields.
      start_offset: if not None, the offset of the instruction with the lowest
                    offset that pushes a stack entry that is consumed by
                    this opcode.
    """

    # Numeric code for operation
    opcode: int

    # Human readable name for operation
    opname: str

    # Numeric operand value if operation has an operand, otherwise None.
    # This operand value is an index into one of the lists of a code type.
    # The exact table indexed depends on optype.
    arg: Optional[int]

    # Resolved operand value (if known). This is obtained by indexing the
    # appropriate list indicated by optype using value arg.
    # If for some reason we can't extract a value this way, ``argval`` has
    # the same value as ``arg``.
    argval: Any

    # String representation of argval if argval is not None.
    argrepr: Optional[str]

    # Offset of the instruction
    offset: int

    starts_line: Optional[int]

    # True if other code jumps to here, the string "loop" if this is a loop
    # beginning, which in Python can be determined by a jump to an earlier
    # offset. Otherwise, False.
    # Note that this is a generalization of Python's "is_jump_target".
    is_jump_target: Union[bool, str]

    # dis.Positions object holding the start and end locations that
    # are covered by this instruction.
    # FIXME: Implement. The below is just a placeholder.
    # positions: Optional[Any]

    # The following values are our own extended information not found (yet)
    # in Python's Instruction structure.
    #
    # First, values which can be computed or derived from the above,
    # along with an opcode structure. We add these in an
    # instruction to make the instruction self sufficient.

    # Opcode classification. One of:
    #   compare, const, free, jabs, jrel, local, name, nargs
    optype: str

    # True if instruction has an operand, otherwise False.
    has_arg: bool

    # The number of bytes this instruction consumes.
    inst_size: int

    # True if there were EXTENDED_ARG opcodes before this, otherwise False
    has_extended_arg: Optional[bool] = None

    # True if the instruction can (not must) fall through to the next
    # instruction. Note conditionals are in this category, but
    # returns, raise, and unconditional jumps are not.
    fallthrough: Optional[bool] = None

    # If not None, a string representation of the top of the stack (TOS)
    tos_str: Optional[str] = None

    # Python expressions can be straight-line, operator like-basic block code that take
    # items off a stack and push a value onto the stack. In this case, in a linear scan
    # we can basically build up an expression tree.
    # Note this has to be the last field. Code to set this assumes this.
    start_offset: Optional[int] = None

    def disassemble(
        self,
        opc,
        line_starts: Optional[Dict[int, int]] = None,
        lineno_width: int = 3,
        mark_as_current: bool = False,
        asm_format: str = "classic",
        instructions: Optional[list] = None,
    ) -> str:
        """
        Format instruction details for inclusion in disassembly output.

        ``opc`` is the opcode object consulted for format-specific
        information. Depending on ``asm_format`` and the instruction, its
        ``opcode_extended_fmt``, ``opcode_arg_fmt``, ``nullaryloadop``,
        ``operator_set`` and ``callop`` attributes are read.

        ``line_starts`` when it exists is a dictionary mapping bytecode
        offsets to line numbers.

        ``lineno_width`` sets the width of the line number field (0 omits it).

        ``mark_as_current`` inserts a '-->' marker arrow as part of the line.

        ``asm_format`` selects the layout. The values tested for here are
        "asm", "bytes", "extended" and "extended-bytes"; any other value,
        e.g. the default "classic", gets the basic layout.

        ``instructions`` is the list of instructions seen so far. In the
        extended formats, when top-of-stack information is computed for
        this instruction, the last entry of this list is replaced *in place*
        with an updated copy of the instruction; nothing is replaced when
        the list is empty. When omitted, a fresh empty list is used.

        Returns the formatted line with trailing whitespace removed. In
        "asm" format the result can span several lines.
        """
        if instructions is None:
            # A fresh list per call; a shared mutable default must not be used
            # because this list can be modified below.
            instructions = []

        # ``instr`` starts out as this instruction and is rebound to an
        # updated copy whenever tos_str/start_offset information is added.
        instr = self
        opcode = instr.opcode
        is_asm = asm_format == "asm"
        is_extended = asm_format in ("extended", "extended-bytes")
        fields = []

        # Column: Source code line number
        if lineno_width:
            if instr.starts_line is None:
                fields.append(" " * (lineno_width + 1))
            elif is_asm:
                # In "asm" format the line number goes on a line of its own.
                lineno_fmt = "%%%dd:\n" % lineno_width
                fields.append(lineno_fmt % instr.starts_line)
                fields.append(" " * lineno_width)
                if instr.is_jump_target:
                    fields.append(" " * (lineno_width - 1))
            else:
                lineno_fmt = "%%%dd:" % lineno_width
                fields.append(lineno_fmt % instr.starts_line)

        # Column: Current instruction indicator.
        # The placeholder has the same width as the marker so columns line up.
        if mark_as_current and not is_asm:
            fields.append("-->")
        else:
            fields.append("   ")

        # Column: Jump target marker
        if not instr.is_jump_target:
            # Same width as ">>".
            fields.append("  ")
        elif not is_asm:
            fields.append(">>")
        else:
            # "asm" format puts a label line in front of everything else.
            fields = ["L%d:\n" % instr.offset] + fields
            if not instr.starts_line:
                fields.append(" ")

        # Column: Instruction offset from start of code sequence
        if not is_asm:
            fields.append(repr(instr.offset).rjust(4))

        # Column: Instruction bytes
        if asm_format in ("extended-bytes", "bytes"):
            hex_bytecode = "|%02x" % opcode
            if instr.inst_size == 1:
                # Not 3.6 or later
                hex_bytecode += " " * (2 * 3)
            elif instr.inst_size == 2:
                # Must be Python 3.6 or later
                if instr.has_arg and instr.arg is not None:
                    hex_bytecode += " %02x" % (instr.arg % 256)
                else:
                    hex_bytecode += " 00"
            elif instr.inst_size == 3 and instr.arg is not None:
                # Not 3.6 or later
                hex_bytecode += " %02x %02x" % divmod(instr.arg, 256)
            fields.append(hex_bytecode + "|")

        # Column: Opcode name
        fields.append(instr.opname.ljust(_OPNAME_WIDTH))

        # Column: Opcode argument
        if instr.arg is not None:
            argrepr = instr.argrepr
            # The ``argrepr`` value when the instruction was created
            # generally has all the information we require. However,
            # for "asm" format, we want additional explicit information
            # linking operands to tables.
            if is_asm:
                if instr.is_jump() and instr.argrepr is not None:
                    assert instr.argrepr.startswith("to ")
                    jump_target = instr.argrepr[len("to "):]
                    fields.append("L" + jump_target)
                elif instr.optype in _INDEXED_OPERAND_TYPES or (
                    instr.optype == "const"
                    and argrepr is not None
                    and not re.search(r"\s", argrepr)
                ):
                    fields.append(repr(instr.arg))
                    fields.append("(%s)" % argrepr)
                    # Already shown; keep it from being shown again below.
                    argrepr = None
                else:
                    fields.append(repr(instr.arg))
            elif is_extended:
                if (
                    instr.is_jump()
                    and line_starts is not None
                    and line_starts.get(instr.argval) is not None
                ):
                    # Record the target line in tos_str; start_offset is kept.
                    instr = _with_tos_fields(
                        instr, f"To line {line_starts[instr.argval]}", instr[-1]
                    )
                    _replace_last(instructions, instr)
                elif (
                    hasattr(opc, "opcode_extended_fmt")
                    and instr.opname in opc.opcode_extended_fmt
                ):
                    new_repr, start_offset = _extended_repr(
                        opc, instr.opname, instructions
                    )
                    if new_repr:
                        # Add tos_str and start_offset info to the instruction.
                        instr = _with_tos_fields(instr, new_repr, start_offset)
                        _replace_last(instructions, instr)
                        argrepr = new_repr
                elif opcode in opc.nullaryloadop:
                    instr = _with_tos_fields(instr, instr.argrepr, instr.offset)
                    _replace_last(instructions, instr)

            if not argrepr:
                if not is_asm or instr.opname == "MAKE_FUNCTION":
                    fields.append(repr(instr.arg))
            elif instructions:
                # Column: Opcode argument details
                if instr.tos_str is None:
                    fields.append(f"({instr.argrepr})")
                else:
                    if instr.optype in ("vargs", "encoded_arg"):
                        prefix = f"{instr.argval} ; "
                    elif instr.argrepr is None:
                        prefix = ""
                    else:
                        prefix = f"({instr.argrepr}) ; "
                    if opcode in opc.operator_set | opc.callop:
                        prefix += "TOS = "
                    fields.append(f"{prefix}{instr.tos_str}")
            elif instr.argrepr is not None:
                fields.append(instr.argrepr)
        elif is_extended:
            if (
                hasattr(opc, "opcode_extended_fmt")
                and instr.opname in opc.opcode_extended_fmt
            ):
                new_repr, start_offset = _extended_repr(
                    opc, instr.opname, instructions
                )
                if new_repr:
                    _replace_last(
                        instructions,
                        _with_tos_fields(instr, new_repr, start_offset),
                    )
                    argval = instr.argval
                    prefix = "" if argval is None else f"({argval}) | "
                    if opcode in opc.operator_set:
                        prefix += "TOS = "
                    fields.append(f"{prefix}{new_repr}")
            elif (
                hasattr(opc, "opcode_arg_fmt")
                and instr.opname in opc.opcode_arg_fmt
                and instr.argrepr is not None
            ):
                fields.append(instr.argrepr)

        return " ".join(fields).rstrip()

    def format_to_assembly_line(
        self,
        current_instruction_line,
    ) -> AssembleFormat:
        """
        Convert this instruction into an ``AssembleFormat`` structure, which
        contains the essential information that would be shown as a line
        in an assembly listing.

        ``current_instruction_line`` is compared with ``starts_line``; the
        ``is_current_instruction`` field of the result is True when the
        two are equal.
        """
        return AssembleFormat(
            self.is_jump_target,
            current_instruction_line == self.starts_line,
            self.starts_line,
            self.offset,
            self.opname,
            self.opcode,
            self.has_arg,
            self.arg,
            self.argrepr,
            self.tos_str,
        )

    def is_jump(self) -> bool:
        """
        Return True if instruction is some sort of jump, that is, its
        ``optype`` is "jabs" or "jrel".
        """
        return self.optype in ("jabs", "jrel")

    def jumps_forward(self) -> bool:
        """
        Return True if instruction is a jump forwards, that is, a jump
        whose ``argval`` is greater than the instruction's own offset.
        """
        return self.is_jump() and self.offset < self.argval


# if __name__ == '__main__':
#     pass