| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514 |
- from triton.backends.compiler import BaseBackend, GPUTarget, Language
- from triton._C.libtriton import ir, passes, llvm, amd
- from triton import knobs
- from dataclasses import dataclass
- from typing import Any, Dict, Tuple
- from types import ModuleType
- import hashlib
- import os
- import tempfile
- import re
- import functools
- import warnings
- from pathlib import Path
- # The file may be accessed in parallel
- def try_remove(path):
- if os.path.exists(path):
- try:
- os.remove(path)
- except OSError:
- import traceback
- traceback.print_exc()
- def get_min_dot_size(target: GPUTarget):
- # We fallback to use FMA and cast arguments if certain configurations is
- # not supported natively by matrix core units.
- return lambda lhs_type, rhs_type: (1, 1, 1)
- def is_pingpong_schedule_enabled(arch, use_async_copy):
- return (arch == "gfx942" or (arch == "gfx950" and use_async_copy is True)
- ) if knobs.amd.use_block_pingpong is None else knobs.amd.use_block_pingpong
- def is_in_thread_transpose_enabled(arch):
- return (arch == "gfx942") if knobs.amd.use_in_thread_transpose is None else knobs.amd.use_in_thread_transpose
- @dataclass(frozen=True)
- class HIPOptions:
- num_warps: int = 4
- waves_per_eu: int = 0
- num_stages: int = 2
- num_ctas: int = 1
- extern_libs: dict = None
- debug: bool = False
- sanitize_overflow: bool = True
- arch: str = None
- # We have native support for OCP fp8 variants since CDNA4/RDNA4. For earlier generations,
- # we software emulate the support for them.
- # UZ fp8 variants (fp8e4b8 and fp8e5b16) are natively supported for CDNA3. For other
- # architectures they are software emulated.
- supported_fp8_dtypes: Tuple[str] = ("fp8e4nv", "fp8e5", "fp8e5b16", "fp8e4b8")
- deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
- default_dot_input_precision: str = "ieee"
- allowed_dot_input_precisions: Tuple[str] = ("ieee", 'bf16x3', 'bf16x6')
- enable_fp_fusion: bool = True
- launch_cooperative_grid: bool = False
- matrix_instr_nonkdim: int = 0
- kpack: int = 1
- allow_flush_denorm: bool = False
- max_num_imprecise_acc_default: int = 0
- backend_name: str = 'hip'
- instrumentation_mode: str = ""
- # The following option provides hints to the AMDGPU backend regarding instruction scheduling
- # for all `tt.dot` operations in a kernel. The "none" variant preserves the default
- # instruction scheduling of the AMDGPU backend which aims at maximizing occupancy.
- # The option is experimental and may change at any time regarding its semantics and/or may
- # be gone entirely anytime.
- #
- # Current experimental scheduling variants:
- #
- # attention: enables a bunch of optimizations for attention kernels, including:
- # - iglp 2 and sched.barrier around it
- # - sink-insts-to-avoid-spills flag to avoid register spills
- # memory-bound-attention: enables custom scheduling strategy in llvm backend,
- # This option targets special FA variant, which is memory bound and
- # has a lot of elementwise operations from fused operand dequantizations.
- # Note that this option is highly experimental,
- # and will be removed as soon as default sceduler algorithm is fixed.
- #
- # Option allows to set multiple variants divided by commas:
- # schedule_hint="attention,memory-bound-attention"
- schedule_hint: str = 'none'
- def __post_init__(self):
- gfx_major = int(self.arch[3:-2]) # Drop "gfx" prefix and minor/patch number
- warp_size = 32 if gfx_major >= 10 else 64
- object.__setattr__(self, 'warp_size', warp_size)
- assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
- "num_warps must be a power of 2"
- if (self.arch == 'gfx950') and (self.kpack != 1):
- warnings.warn(
- f"kpack is deprecated starting from gfx950 and will be removed in later releases. So for now kpack = {self.kpack} will be overwritten to 1 to make transitioning easier."
- )
- object.__setattr__(self, 'kpack', 1)
- default_libdir = Path(__file__).parent / 'lib'
- extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
- for lib in ["ocml", "ockl"]:
- extern_libs[lib] = str(default_libdir / f'{lib}.bc')
- object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
- def hash(self):
- key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])
- return hashlib.sha256(key.encode("utf-8")).hexdigest()
- class HIPBackend(BaseBackend):
- instrumentation = None
- supports_native_tensor_specialization = False
- @staticmethod
- def supports_target(target: GPUTarget):
- return target.backend == 'hip'
- def __init__(self, target: GPUTarget) -> None:
- super().__init__(target)
- assert isinstance(target.arch, str)
- self.binary_ext = "hsaco"
- def get_target_name(self, options) -> str:
- return f"hip:{options.arch}"
- def parse_options(self, opts) -> Any:
- args = {'arch': knobs.runtime.override_arch or self.target.arch}
- if opts.get("num_ctas", 1) > 1 and not amd.supports_multi_cta_launch(self.target.arch):
- raise ValueError(f"num_ctas > 1 not supported on {self.target.arch}")
- # Enable XF32 (TF32) for CDNA3 GPUs
- if self.target.arch == 'gfx942':
- allowed_dot_input_precisions = set(HIPOptions.allowed_dot_input_precisions)
- allowed_dot_input_precisions.update({'tf32'})
- args["allowed_dot_input_precisions"] = tuple(sorted(allowed_dot_input_precisions))
- if "supported_fp8_dtypes" not in opts:
- args["supported_fp8_dtypes"] = tuple(sorted(HIPOptions.supported_fp8_dtypes))
- if self.target.arch == 'gfx950':
- deprecated_fp8_dot_operand_dtypes = set(HIPOptions.deprecated_fp8_dot_operand_dtypes)
- deprecated_fp8_dot_operand_dtypes.update({"fp8e5b16", "fp8e4b8"})
- args["deprecated_fp8_dot_operand_dtypes"] = tuple(sorted(deprecated_fp8_dot_operand_dtypes))
- if "enable_fp_fusion" not in opts:
- args["enable_fp_fusion"] = knobs.language.default_fp_fusion
- args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts and opts[k] is not None})
- return HIPOptions(**args)
- def pack_metadata(self, metadata):
- return (
- metadata.num_warps,
- metadata.num_ctas,
- metadata.shared,
- )
- def get_codegen_implementation(self, options):
- return {"min_dot_size": get_min_dot_size(self.target)}
- def get_module_map(self) -> Dict[str, ModuleType]:
- from triton.language.extra.hip import libdevice
- return {"triton.language.extra.libdevice": libdevice}
- def load_dialects(self, ctx):
- amd.load_dialects(ctx)
- if HIPBackend.instrumentation:
- HIPBackend.instrumentation.load_dialects(ctx)
- @staticmethod
- def is_within_2gb(arg):
- import torch
- MAX_INT_32 = 2**31 - 1
- if hasattr(arg, "ptr_range"):
- return arg.ptr_range() <= MAX_INT_32
- if isinstance(arg, torch.Tensor) and hasattr(arg, "untyped_storage"):
- return arg.untyped_storage().size() <= MAX_INT_32
- return False
- @staticmethod
- def parse_attr(desc):
- ret = BaseBackend.parse_attr(desc)
- if "S" in desc:
- ret += [["tt.pointer_range", 32]]
- return ret
- @staticmethod
- def get_tensor_specialization(arg, **kwargs):
- ret = BaseBackend.get_tensor_specialization(arg, **kwargs)
- if knobs.amd.use_buffer_ops and HIPBackend.is_within_2gb(arg):
- ret += "S"
- return ret
- @staticmethod
- def make_ttir(mod, metadata, options):
- pm = ir.pass_manager(mod.context)
- pm.enable_debug()
- passes.common.add_inliner(pm)
- passes.ttir.add_rewrite_tensor_pointer(pm)
- passes.ttir.add_rewrite_tensor_descriptor_to_pointer(pm)
- passes.common.add_canonicalizer(pm)
- passes.ttir.add_combine(pm)
- passes.ttir.add_reorder_broadcast(pm)
- passes.common.add_cse(pm)
- passes.ttir.add_triton_licm(pm)
- passes.common.add_symbol_dce(pm)
- passes.ttir.add_loop_unroll(pm)
- pm.run(mod, 'make_ttir')
- return mod
- @staticmethod
- def make_ttgir(mod, metadata, options):
- pm = ir.pass_manager(mod.context)
- pm.enable_debug()
- passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
- options.num_ctas)
- pm.run(mod, 'make_ttgir_early')
- pm = ir.pass_manager(mod.context)
- pm.enable_debug()
- emuTF32 = False
- passes.ttgpuir.add_coalesce(pm)
- passes.ttgpuir.add_f32_dot_tc(pm, emuTF32)
- passes.ttgpuir.add_remove_layout_conversions(pm)
- passes.ttgpuir.add_optimize_thread_locality(pm)
- amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
- passes.ttgpuir.add_remove_layout_conversions(pm)
- amd.passes.ttgpuir.add_optimize_epilogue(pm)
- amd.passes.ttgpuir.add_optimize_dot_operands(pm, options.arch)
- amd.passes.ttgpuir.add_hoist_layout_conversions(pm)
- passes.ttgpuir.add_fuse_nested_loops(pm)
- passes.common.add_canonicalizer(pm)
- passes.ttir.add_triton_licm(pm)
- passes.common.add_canonicalizer(pm)
- use_async_copy = knobs.amd.use_async_copy
- use_block_pingpong = is_pingpong_schedule_enabled(options.arch, use_async_copy)
- amd.passes.ttgpuir.add_schedule_loops(pm, options.num_stages)
- amd.passes.ttgpuir.add_pipeline(pm, use_async_copy, use_block_pingpong)
- if use_async_copy:
- amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
- passes.common.add_canonicalizer(pm)
- if options.schedule_hint.lower() != "none":
- for hint in options.schedule_hint.split(","):
- amd.passes.ttgpuir.insert_instruction_sched_hints(pm, hint)
- passes.ttgpuir.add_remove_layout_conversions(pm)
- passes.ttgpuir.add_reduce_data_duplication(pm)
- if is_in_thread_transpose_enabled(options.arch):
- amd.passes.ttgpuir.add_in_thread_transpose(pm)
- passes.ttgpuir.add_remove_layout_conversions(pm)
- amd.passes.ttgpuir.add_reorder_instructions(pm)
- if use_block_pingpong and options.num_stages > 1:
- amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages)
- if knobs.amd.use_buffer_ops:
- amd.passes.ttgpuir.add_canonicalize_pointers(pm)
- passes.common.add_canonicalizer(pm)
- amd.passes.ttgpuir.add_convert_to_buffer_ops(
- pm,
- options.arch,
- knobs.amd.use_buffer_atomics,
- knobs.amd.buffer_ops_analyze_small_tensor_range,
- )
- amd.passes.ttgpuir.add_fold_true_cmpi(pm)
- passes.common.add_canonicalizer(pm)
- passes.common.add_cse(pm)
- passes.common.add_symbol_dce(pm)
- pm.run(mod, 'make_ttgir')
- metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
- return mod
- @staticmethod
- def gluon_to_ttgir(src, metadata, options):
- mod = src
- pm = ir.pass_manager(mod.context)
- pm.enable_debug()
- passes.gluon.add_inliner(pm)
- passes.gluon.add_resolve_auto_encodings(pm)
- passes.common.add_sccp(pm)
- passes.ttir.add_loop_aware_cse(pm)
- passes.gluon.add_canonicalizer(pm)
- passes.ttgpuir.add_combine_tensor_select_and_if(pm)
- pm.run(mod, 'gluon_to_ttgir')
- metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
- return mod
- @staticmethod
- def make_llir(src, metadata, options):
- mod = src
- # TritonGPU -> LLVM-IR (MLIR)
- pm = ir.pass_manager(mod.context)
- pm.enable_debug()
- amd.passes.ttgpuir.add_update_async_wait_count(pm, options.arch)
- # custom_lds_size is an experimental parameter that defines amount of LDS available
- # for one thread block. Measured in bytes.
- #
- # If custom_lds_size = 0, pass will consider all LDS is available for one threads block,
- # LDS size is determined by provided arch name.
- custom_lds_size = 0
- amd.passes.ttgpuir.add_optimize_lds_usage(pm, options.arch, custom_lds_size)
- passes.convert.add_scf_to_cf(pm)
- passes.gluon.add_inliner(pm)
- passes.convert.add_index_to_llvmir(pm)
- amd.passes.ttgpuir.add_allocate_shared_memory(pm)
- # instrumentation point here so we can override IRs above (e.g., ttir and ttgir)
- if HIPBackend.instrumentation:
- HIPBackend.instrumentation.patch("ttgpuir_to_llvmir", pm, mod.context)
- ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
- ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
- ## of the value of kernel arg `allow_flush_denorm`.
- ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
- ## depends on the value of kernel arg `allow_flush_denorm`.
- ## 3. __HIP_FTZ is default to 1 and not exposed as a kernel argument.
- ## For now it is used as a controller for developers only.
- __HIP_FTZ = True
- amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
- passes.common.add_canonicalizer(pm)
- passes.common.add_cse(pm)
- passes.convert.add_cf_to_llvmir(pm)
- passes.convert.add_arith_to_llvmir(pm)
- passes.common.add_canonicalizer(pm)
- passes.common.add_cse(pm)
- passes.common.add_symbol_dce(pm)
- if options.schedule_hint.lower() != "none":
- amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.arch, options.num_stages)
- # This can not be moved below the di_scope pass
- if HIPBackend.instrumentation:
- HIPBackend.instrumentation.patch("llvmir_to_llvm", pm, mod.context)
- if not knobs.compilation.disable_line_info and not knobs.compilation.dump_ir_extract_di_local_variables:
- passes.llvmir.add_di_scope(pm)
- amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ)
- pm.run(mod, 'make_llir')
- if knobs.compilation.dump_ir_extract_di_local_variables:
- # comments below on why separate it
- if not knobs.compilation.disable_line_info:
- pm = ir.pass_manager(mod.context)
- pm.enable_debug()
- passes.llvmir.add_di_scope(pm)
- pm.run(mod, 'make_llir.disable_line_info')
- # insert dbg intrinsic with several DI Attribute including source
- # var name and type info note: unknown reason for now, but this
- # pass and add_di_scope has to be run separately, otherwise if we
- # put them into previous pipline, it trigger a segmentfault without
- # any error message; could be due to a bug in mlir or pybind11
- pm = ir.pass_manager(mod.context)
- pm.enable_debug()
- passes.llvmir.add_di_local_variable(pm)
- pm.run(mod, 'make_llir.dump_ir_extract_di_local_variables')
- # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
- llvm.init_targets()
- context = llvm.context()
- llvm_mod = llvm.to_module(mod, context)
- amd.attach_target_triple(llvm_mod)
- target_features = ''
- if knobs.compilation.enable_asan:
- target_features = '+xnack'
- llvm.attach_datalayout(llvm_mod, amd.TARGET_TRIPLE, options.arch, target_features)
- # Set various control constants on the LLVM module so that device
- # libraries can resolve references to them.
- amd.set_isa_version(llvm_mod, options.arch)
- amd.set_abi_version(llvm_mod, 500)
- amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
- amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
- amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
- amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)
- # Set kernel attributes first given this may affect later optimizations.
- fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
- # The public kernel should be kernel 0.
- fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
- fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
- if "memory-bound-attention" in options.schedule_hint.split(','):
- fns[0].add_fn_attr("amdgpu-sched-strategy", "iterative-ilp")
- fns[0].add_fn_attr("uniform-work-group-size", "true")
- # LLVM AMDGPU backend supports the attribute "amdgpu-waves-per-eu"="<min>[, <max>]".
- # This attribute may be attached to a kernel function definition and is an optimization hint.
- # <min> parameter specifies the requested minimum number of waves per EU, and optional <max> parameter
- # specifies the requested maximum number of waves per EU (must be >= <min> if specified).
- # If <max> is omitted, then there is no restriction on the maximum number of waves per EU other than
- # the one dictated by the hardware for which the kernel is compiled. Passing 0, 0 as <min>, <max>
- # implies the default behavior (no limits).
- # Specifying N, N forces LLVM to focus on a single register count, simplifies some heuristics
- # and may improve scheduling.
- fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}, {options.waves_per_eu}")
- denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
- fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)
- if knobs.compilation.enable_asan:
- fns[0].add_fn_target_feature("+xnack")
- fns[0].add_fn_asan_attr()
- # Hint the compiler that we'd like the firmware to set the kernel arguments
- # to user SGPRs so that the kernel does not need to s_load its arguments
- # from memory.
- amd.set_all_fn_arg_inreg(fns[0])
- if knobs.compilation.enable_asan:
- default_libdir = Path(__file__).parent / 'lib'
- paths = [
- str(default_libdir / 'asanrtl.bc'),
- str(default_libdir / "ocml.bc"),
- str(default_libdir / "ockl.bc")
- ]
- llvm.link_extern_libs(llvm_mod, paths)
- elif options.extern_libs:
- paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
- if len(paths) > 0:
- llvm.link_extern_libs(llvm_mod, paths)
- llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, options.arch, '', [], options.enable_fp_fusion)
- # Architectures with architected SGPRs store the workgroup id in ttmp9 (X) and ttmp7 (Y[15:0], Z[31:16]).
- # These attributes are used to determine if Z should be masked out when loading Y. They are inferred during
- # optimize_module from calls to @llvm.amdgcn.workgroup.id.x/y/z(). We cannot rely on this because a
- # dispatch dimensions might be used even if there is no program_id() call for it.
- if amd.has_architected_sgprs(options.arch):
- fns[0].remove_fn_attr("amdgpu-no-workgroup-id-x")
- fns[0].remove_fn_attr("amdgpu-no-workgroup-id-y")
- fns[0].remove_fn_attr("amdgpu-no-workgroup-id-z")
- if knobs.amd.scalarize_packed_fops:
- amd.add_scalarize_packed_fops_llvm_pass(fns[0])
- # Get some metadata
- metadata["shared"] = src.get_int_attr("ttg.shared")
- metadata["profile_scratch_size"] = src.get_int_attr("ttg.profile_scratch_memory_size") or 0
- metadata["profile_scratch_align"] = src.get_int_attr("ttg.profile_scratch_memory_alignment") or 1
- amd.cleanup_bitcode_metadata(llvm_mod)
- # Disable inlining of print related functions,
- # because inlining of these function could slow down compilation significantly
- amd.disable_print_inline(llvm_mod)
- return str(llvm_mod)
- @staticmethod
- def make_amdgcn(src, metadata, options):
- # Find kernel names (there should only be one)
- # We get the name at the last possible step to accommodate `triton.compile`
- # on user-provided LLVM
- names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
- assert len(names) == 1
- metadata["name"] = names[0]
- # llvm -> hsaco
- flags = []
- features = '-real-true16' if 'gfx11' in options.arch else ''
- ir_hash = hashlib.sha256(src.encode("utf-8")).hexdigest()
- dump_file_id = names[0] + '_' + ir_hash
- _ = llvm.translate_to_mir(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
- dump_file_id)
- llvm.dump_sched_dag(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
- dump_file_id)
- amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
- False)
- if knobs.amd.dump_amdgcn:
- print("// -----// AMDGCN Dump //----- //")
- print(amdgcn)
- return amdgcn
- @staticmethod
- def make_hsaco(src, metadata, options):
- target_features = ''
- if knobs.compilation.enable_asan:
- target_features = '+xnack'
- hsaco = amd.assemble_amdgcn(src, options.arch, target_features)
- # Use delete=False so the file can be reopened on Windows after closing.
- # Clean up with try_remove outside the context managers.
- with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.o') as tmp_in, \
- tempfile.NamedTemporaryFile(delete=False, mode='rb', suffix='.hsaco') as tmp_out:
- tmp_in.write(hsaco)
- tmp_in.flush()
- tmp_out_name = tmp_out.name
- tmp_in_name = tmp_in.name
- amd.link_hsaco(tmp_in_name, tmp_out_name)
- with open(tmp_out_name, 'rb') as f:
- ret = f.read()
- try_remove(tmp_in_name)
- try_remove(tmp_out_name)
- return ret
- def add_stages(self, stages, options, language):
- if language == Language.TRITON:
- stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
- stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
- elif language == Language.GLUON:
- stages["ttgir"] = lambda src, metadata: self.gluon_to_ttgir(src, metadata, options)
- stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
- stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
- stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)
- if knobs.runtime.add_stages_inspection_hook is not None:
- knobs.runtime.add_stages_inspection_hook(self, stages, options, language, None)
- @functools.lru_cache()
- def hash(self):
- return f'{self.target}'
|