| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568 |
- # MIT License
- #
- # Copyright (c) 2023 Advanced Micro Devices, Inc.
- #
- # Permission is hereby granted, free of charge, to any person obtaining a copy
- # of this software and associated documentation files (the "Software"), to deal
- # in the Software without restriction, including without limitation the rights
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- # copies of the Software, and to permit persons to whom the Software is
- # furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in all
- # copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- # SOFTWARE.
- # Python bindings for ROCm-SMI library
- from ctypes import *
- from os.path import join, realpath, isfile
- import os
- import logging
- import subprocess
- import sys
- import threading
- from enum import IntEnum, auto
- ## Error checking
class ROCMLError_NotSupported(Exception):
    """Raised when the current platform is not supported (e.g. Windows)."""
class ROCMLError_FunctionNotFound(Exception):
    """Raised when a requested symbol is missing from the loaded ROCm library."""
class ROCMLError_LibraryNotFound(Exception):
    """Raised when the ROCm SMI shared library cannot be located or loaded."""
class ROCMLError_DriverNotLoaded(Exception):
    """Error type for a ROCm driver that is not loaded.

    NOTE(review): nothing in the visible part of this module raises it.
    """
class ROCMLError_Unknown(Exception):
    """Error type for failures without a more specific category.

    NOTE(review): nothing in the visible part of this module raises it.
    """
class ROCMLError_Uninitialized(Exception):
    """Raised when a library symbol is requested before the library is loaded."""
class ROCMLState(IntEnum):
    """Initialization states of the PyROCML bindings."""

    # No attempt yet made to initialize PyROCML
    UNINITIALIZED = auto()

    # PyROCML was successfully initialized
    INITIALIZED = auto()

    # PyROCML not installed
    DISABLED_PYROCML_NOT_AVAILABLE = auto()

    # PyROCML diagnostics disabled by the ``distributed.diagnostics.rocml``
    # config setting
    DISABLED_CONFIG = auto()

    # PyROCML available, but ROCML not installed
    DISABLED_LIBRARY_NOT_FOUND = auto()
# File name of the ROCm SMI shared library searched for under <ROCM_PATH>/lib.
LIBROCM_NAME = 'librocm_smi64.so'

# Size, in bytes, of the string buffers handed to the library.
RSMI_MAX_BUFFER_LENGTH = 256

# Policy enums
# Number of entries in the fixed-size frequency/lane arrays of the structures below.
RSMI_MAX_NUM_FREQUENCIES = 32
class rsmi_status_t(c_int):
    """Status codes returned by ROCm SMI calls, exposed as plain int constants."""

    RSMI_STATUS_SUCCESS = 0
    RSMI_STATUS_INVALID_ARGS = 1
    RSMI_STATUS_NOT_SUPPORTED = 2
    RSMI_STATUS_FILE_ERROR = 3
    RSMI_STATUS_PERMISSION = 4
    RSMI_STATUS_OUT_OF_RESOURCES = 5
    RSMI_STATUS_INTERNAL_EXCEPTION = 6
    RSMI_STATUS_INPUT_OUT_OF_BOUNDS = 7
    RSMI_STATUS_INIT_ERROR = 8
    # Alias of RSMI_STATUS_INIT_ERROR.
    RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR
    RSMI_STATUS_NOT_YET_IMPLEMENTED = 9
    RSMI_STATUS_NOT_FOUND = 10
    RSMI_STATUS_INSUFFICIENT_SIZE = 11
    RSMI_STATUS_INTERRUPT = 12
    RSMI_STATUS_UNEXPECTED_SIZE = 13
    RSMI_STATUS_NO_DATA = 14
    RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
# Maps each rsmi status code to its human-readable description.
rsmi_status_verbose_err_out = {
    rsmi_status_t.RSMI_STATUS_SUCCESS:
        'Operation was successful',
    rsmi_status_t.RSMI_STATUS_INVALID_ARGS:
        'Invalid arguments provided',
    rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
        'Not supported on the given system',
    rsmi_status_t.RSMI_STATUS_FILE_ERROR:
        'Problem accessing a file',
    rsmi_status_t.RSMI_STATUS_PERMISSION:
        'Permission denied',
    rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES:
        'Unable to acquire memory or other resource',
    rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION:
        'An internal exception was caught',
    rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS:
        'Provided input is out of allowable or safe range',
    rsmi_status_t.RSMI_INITIALIZATION_ERROR:
        'Error occurred during rsmi initialization',
    rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED:
        'Requested function is not implemented on this setup',
    rsmi_status_t.RSMI_STATUS_NOT_FOUND:
        'Item searched for but not found',
    rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE:
        'Insufficient resources available',
    rsmi_status_t.RSMI_STATUS_INTERRUPT:
        'Interrupt occurred during execution',
    rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE:
        'Unexpected amount of data read',
    rsmi_status_t.RSMI_STATUS_NO_DATA:
        'No data found for the given input',
    rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR:
        'Unknown error occurred',
}
class rsmi_init_flags_t(c_int):
    """Flag constants for ROCm SMI initialization."""

    RSMI_INIT_FLAG_ALL_GPUS = 1
class rsmi_memory_type_t(c_int):
    """Memory type identifiers; FIRST/LAST alias the lowest and highest value."""

    RSMI_MEM_TYPE_VRAM = 0
    RSMI_MEM_TYPE_FIRST = RSMI_MEM_TYPE_VRAM
    RSMI_MEM_TYPE_VIS_VRAM = 1
    RSMI_MEM_TYPE_GTT = 2
    RSMI_MEM_TYPE_LAST = RSMI_MEM_TYPE_GTT
# memory_type_l holds the names matching the rsmi_memory_type_t values,
# indexed by that value.
# Usage example to get the corresponding name:
# memory_type_l[rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM] returns the string 'VRAM'
memory_type_l = [
    'VRAM',
    'VIS_VRAM',
    'GTT',
]
class rsmi_retired_page_record_t(Structure):
    """ctypes layout of one retired-page record: address, size and status."""

    _fields_ = [
        ('page_address', c_uint64),
        ('page_size', c_uint64),
        ('status', c_int),
    ]
class rsmi_sw_component_t(c_int):
    """Software component identifiers; only the driver component is defined."""

    RSMI_SW_COMP_DRIVER = 0
    RSMI_SW_COMP_FIRST = RSMI_SW_COMP_DRIVER
    RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER
class rsmi_frequencies_t(Structure):
    """ctypes layout of a frequency table with RSMI_MAX_NUM_FREQUENCIES slots."""

    _fields_ = [
        ('num_supported', c_int32),
        ('current', c_uint32),
        ('frequency', c_uint64 * RSMI_MAX_NUM_FREQUENCIES),
    ]
class rsmi_pcie_bandwidth_t(Structure):
    """ctypes layout pairing a transfer-rate table with a per-entry lane array."""

    _fields_ = [
        ('transfer_rate', rsmi_frequencies_t),
        ('lanes', c_uint32 * RSMI_MAX_NUM_FREQUENCIES),
    ]
class rsmi_process_info_t(Structure):
    """ctypes layout of the per-process record filled in by the process queries."""

    _fields_ = [
        ('process_id', c_uint32),
        # PASID: Process Address Space ID
        ('pasid', c_uint32),
        ('vram_usage', c_uint64),
        # SDMA: System Direct Memory Access
        ('sdma_usage', c_uint64),
        ('cu_occupancy', c_uint32),
    ]
class rsmi_xgmi_status_t(c_int):
    """XGMI error status values."""

    RSMI_XGMI_STATUS_NO_ERRORS, RSMI_XGMI_STATUS_ERROR, RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 0, 1, 2
class rsmi_io_link_type(c_int):
    """IO link type identifiers reported for the link between two devices."""

    RSMI_IOLINK_TYPE_UNDEFINED = 0
    RSMI_IOLINK_TYPE_HYPERTRANSPORT = 1
    RSMI_IOLINK_TYPE_PCIEXPRESS = 2
    RSMI_IOLINK_TYPE_AMBA = 3
    RSMI_IOLINK_TYPE_MIPI = 4
    RSMI_IOLINK_TYPE_QPI_1_1 = 5
    RSMI_IOLINK_TYPE_RESERVED1 = 6
    RSMI_IOLINK_TYPE_RESERVED2 = 7
    RSMI_IOLINK_TYPE_RAPID_IO = 8
    RSMI_IOLINK_TYPE_INFINIBAND = 9
    RSMI_IOLINK_TYPE_RESERVED3 = 10
    RSMI_IOLINK_TYPE_XGMI = 11
    RSMI_IOLINK_TYPE_XGOP = 12
    RSMI_IOLINK_TYPE_GZ = 13
    RSMI_IOLINK_TYPE_ETHERNET_RDMA = 14
    RSMI_IOLINK_TYPE_RDMA_OTHER = 15
    RSMI_IOLINK_TYPE_OTHER = 16
    # Count of the link types listed above.
    RSMI_IOLINK_TYPE_NUMIOLINKTYPES = 17
    RSMI_IOLINK_TYPE_SIZE = 4294967295  # 0xFFFFFFFF
- ## Library loading
- rocm_lib = None
- lib_load_lock = threading.Lock()
- _rocm_lib_refcount = 0
- ## Function access, to prevent lib_load_lock deadlock
- _rocml_get_function_ptr_cache = dict()
def _rocml_get_function_ptr(name):
    """Return the library function called *name*, resolving it at most once.

    Resolved functions are stored in ``_rocml_get_function_ptr_cache`` so that
    later lookups return without taking ``lib_load_lock``.

    @param name: Symbol name to look up in the loaded ROCm library
    @raises ROCMLError_Uninitialized: the library has not been loaded yet
    @raises ROCMLError_FunctionNotFound: the library does not export *name*
    """
    try:
        return _rocml_get_function_ptr_cache[name]
    except KeyError:
        pass

    # The context manager releases the lock on every exit path.
    with lib_load_lock:
        if rocm_lib is None:
            raise ROCMLError_Uninitialized(
                'ROCm library must be loaded before looking up %r' % name)
        try:
            func = getattr(rocm_lib, name)
        except AttributeError:
            raise ROCMLError_FunctionNotFound(name)
        _rocml_get_function_ptr_cache[name] = func
        return func
def _load_rocm_library():
    """Load the ROCm SMI shared library into ``rocm_lib`` if not already loaded.

    The ``rocm_lib`` check is repeated after taking ``lib_load_lock`` so that
    concurrent callers load the library only once.

    @raises ROCMLError_NotSupported: running on a Windows platform
    @raises ROCMLError_LibraryNotFound: the library file was not found or
        could not be loaded
    """
    global rocm_lib
    if rocm_lib is not None:
        return

    with lib_load_lock:
        # Another thread may have finished loading while we waited.
        if rocm_lib is not None:
            return

        if sys.platform[:3] == 'win':
            raise ROCMLError_NotSupported('Windows platform is not supported yet')

        # assume linux
        path_librocm = _find_lib_rocm()
        if not path_librocm:
            # _find_lib_rocm() signals "not found" with an empty string, which
            # must never be handed to the loader as if it were a real path.
            raise ROCMLError_LibraryNotFound('ROCm library not found')

        try:
            rocm_lib = CDLL(path_librocm)
        except OSError:
            raise ROCMLError_LibraryNotFound('ROCm library not found')
def _find_lib_rocm():
    """Return the path of the ROCm SMI library, or '' when it does not exist.

    The library is looked up as ``lib/<LIBROCM_NAME>`` below the directory
    named by the ``ROCM_PATH`` environment variable (default ``/opt/rocm``).
    """
    root = os.environ.get('ROCM_PATH', '/opt/rocm')
    candidate = join(root, f'lib/{LIBROCM_NAME}')
    if not isfile(candidate):
        return ''
    return candidate
- def _driver_initialized():
- """ Returns true if amdgpu is found in the list of initialized modules
- """
- initialized = ''
- try:
- initialized = str(subprocess.check_output("cat /sys/module/amdgpu/initstate |grep live", shell=True, stderr=subprocess.DEVNULL))
- except subprocess.CalledProcessError:
- pass
- return len(initialized) > 0
def smi_initialize():
    """Initialize the ROCm SMI binding.

    Loads the library, checks that the amdgpu driver is initialized, calls
    ``rsmi_init(0)`` and finally increments the library reference count.

    @raises RuntimeError: the driver is not initialized or ``rsmi_init`` fails
    """
    global _rocm_lib_refcount

    _load_rocm_library()
    if not _driver_initialized():
        raise RuntimeError('ROCm driver initilization failed')

    status = rocm_lib.rsmi_init(0)
    if status != 0:
        logging.debug("ROCm SMI init returned value: %s", status)
        raise RuntimeError('ROCm SMI initialization failed')

    # update reference count
    with lib_load_lock:
        _rocm_lib_refcount += 1
def rsmi_ret_ok(my_ret):
    """Return True if an RSMI call status is 0 (success).

    On failure the status is translated to text with ``rsmi_status_string``
    and logged at debug level.

    @param my_ret: Return of RSMI call (rocm_smi_lib API)
    @return: True on success, False otherwise
    """
    if my_ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
        return True

    err_str = c_char_p()
    rocm_lib.rsmi_status_string(my_ret, byref(err_str))
    if err_str.value:
        message = err_str.value.decode()
    else:
        # The library left the pointer NULL; use the local table rather than
        # failing with AttributeError on None.decode().
        message = rsmi_status_verbose_err_out.get(
            my_ret, 'Unknown error occurred')
    logging.debug("ROCm RSMI error: %s", message)
    return False
def smi_shutdown():
    """Shut down the SMI interface while leaving the library loaded.

    Calls ``rsmi_shut_down`` (a failure is only logged by ``rsmi_ret_ok``) and
    then decrements the library reference count.
    """
    global _rocm_lib_refcount

    status = rocm_lib.rsmi_shut_down()
    rsmi_ret_ok(status)

    # update reference count
    with lib_load_lock:
        _rocm_lib_refcount -= 1
def smi_get_kernel_version():
    """Return the ROCm kernel driver version string, or '' on failure."""
    buf_len = 256
    version_buf = create_string_buffer(buf_len)
    status = rocm_lib.rsmi_version_str_get(
        rsmi_sw_component_t.RSMI_SW_COMP_DRIVER, version_buf, buf_len)
    if not rsmi_ret_ok(status):
        return ''
    return version_buf.value.decode()
def smi_get_device_id(dev):
    """Return the device id of device *dev*, or -1 on failure.

    ``rsmi_dev_id_get`` writes a 16-bit value, so a ``c_uint16`` is used as
    the output buffer; a wider buffer only reads back correctly on
    little-endian hosts.

    @param dev: Device index
    """
    device_id = c_uint16()
    status = rocm_lib.rsmi_dev_id_get(dev, byref(device_id))
    if not rsmi_ret_ok(status):
        return -1
    return device_id.value
def smi_get_device_count():
    """Return the number of monitored GPU devices, or -1 on failure."""
    count = c_uint32(0)
    status = rocm_lib.rsmi_num_monitor_devices(byref(count))
    if not rsmi_ret_ok(status):
        return -1
    return count.value
def smi_get_device_name(dev):
    """Return the name of GPU device *dev*, or '' on failure."""
    name_buf = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
    status = rocm_lib.rsmi_dev_name_get(dev, name_buf, RSMI_MAX_BUFFER_LENGTH)
    if not rsmi_ret_ok(status):
        return ''
    return name_buf.value.decode()
def smi_get_device_unique_id(dev):
    """Return the unique id of device *dev* as a 64-bit integer, or -1 on failure."""
    unique_id = c_uint64()
    status = rocm_lib.rsmi_dev_unique_id_get(dev, byref(unique_id))
    if not rsmi_ret_ok(status):
        return -1
    return unique_id.value
def smi_get_device_utilization(dev):
    """Return the busy percentage of GPU device *dev*, or -1 on failure."""
    percent = c_uint32()
    status = rocm_lib.rsmi_dev_busy_percent_get(dev, byref(percent))
    if not rsmi_ret_ok(status):
        return -1
    return percent.value
def smi_get_device_memory_used(dev, type='VRAM'):
    """Return the used memory of device *dev* in bytes, or -1 on failure.

    @param dev: Device index
    @param type: One of the names in ``memory_type_l``; any other value makes
        ``list.index`` raise ValueError
    """
    mem_type = memory_type_l.index(type)
    used_bytes = c_uint64()
    status = rocm_lib.rsmi_dev_memory_usage_get(dev, mem_type, byref(used_bytes))
    if not rsmi_ret_ok(status):
        return -1
    return used_bytes.value
def smi_get_device_memory_total(dev, type='VRAM'):
    """Return the total memory of device *dev* in bytes, or -1 on failure.

    @param dev: Device index
    @param type: One of the names in ``memory_type_l``; any other value makes
        ``list.index`` raise ValueError
    """
    mem_type = memory_type_l.index(type)
    total_bytes = c_uint64()
    status = rocm_lib.rsmi_dev_memory_total_get(dev, mem_type, byref(total_bytes))
    if not rsmi_ret_ok(status):
        return -1
    return total_bytes.value
def smi_get_device_memory_busy(dev):
    """Return the percentage of time any memory of device *dev* is in use.

    Returns -1 on failure.
    """
    percent = c_uint32()
    status = rocm_lib.rsmi_dev_memory_busy_percent_get(dev, byref(percent))
    if not rsmi_ret_ok(status):
        return -1
    return percent.value
def smi_get_device_memory_reserved_pages(dev):
    """Return ``(num_pages, records)`` for the reserved memory pages of *dev*.

    Returns -1 on failure.

    NOTE(review): only a single record is allocated and the page count passed
    in starts at 0 -- confirm against the rsmi_dev_memory_reserved_pages_get
    API whether a count-then-allocate sequence is required to receive records.
    """
    page_count = c_uint32()
    page_records = rsmi_retired_page_record_t()
    status = rocm_lib.rsmi_dev_memory_reserved_pages_get(
        dev, byref(page_count), byref(page_records))
    if not rsmi_ret_ok(status):
        return -1
    return (page_count.value, page_records)
- # PCIE functions
def smi_get_device_pcie_bandwidth(dev):
    """Return the possible PCIe bandwidths of device *dev* in bytes/sec.

    The result is the filled ``rsmi_pcie_bandwidth_t`` structure, or -1 on
    failure.
    """
    pcie_bw = rsmi_pcie_bandwidth_t()
    status = rocm_lib.rsmi_dev_pci_bandwidth_get(dev, byref(pcie_bw))
    if not rsmi_ret_ok(status):
        return -1
    return pcie_bw
def smi_get_device_pci_id(dev):
    """Return the unique PCI ID of device *dev* as a 64-bit value, or -1 on failure.

    The value is laid out as:
    BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) |
            ((DEVICE & 0x1f) << 3) | (FUNCTION & 0x7)
    """
    pci_id = c_uint64()
    status = rocm_lib.rsmi_dev_pci_id_get(dev, byref(pci_id))
    if not rsmi_ret_ok(status):
        return -1
    return pci_id.value
def smi_get_device_topo_numa_affinity(dev):
    """Return the NUMA node associated with device *dev*, or -1 on failure.

    @param dev: Device index
    """
    numa_node = c_uint32()
    # The library handle is ``rocm_lib``; the misspelled ``reocm_lib`` raised
    # NameError on every call.
    status = rocm_lib.rsmi_topo_numa_affinity_get(dev, byref(numa_node))
    if not rsmi_ret_ok(status):
        return -1
    return numa_node.value
def smi_get_device_pcie_throughput(dev):
    """Return the measured PCIe throughput of device *dev* in bytes/sec.

    Computed as (received + sent) * maximum packet size; -1 on failure.
    """
    tx_count = c_uint64()
    rx_count = c_uint64()
    packet_size = c_uint64()
    status = rocm_lib.rsmi_dev_pci_throughput_get(
        dev, byref(tx_count), byref(rx_count), byref(packet_size))
    if not rsmi_ret_ok(status):
        return -1
    return (rx_count.value + tx_count.value) * packet_size.value
def smi_get_device_pci_replay_counter(dev):
    """Return the PCIe replay counter of device *dev*, or -1 on failure."""
    replay_count = c_uint64()
    status = rocm_lib.rsmi_dev_pci_replay_counter_get(dev, byref(replay_count))
    if not rsmi_ret_ok(status):
        return -1
    return replay_count.value
- # Compute partition functions
def smi_get_device_compute_partition(dev):
    """Return the compute partition of device *dev* as a string, '' on failure."""
    partition_buf = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
    status = rocm_lib.rsmi_dev_compute_partition_get(
        dev, byref(partition_buf), RSMI_MAX_BUFFER_LENGTH)
    if not rsmi_ret_ok(status):
        return ''
    return partition_buf.value.decode()
def smi_set_device_compute_partition(dev, partition):
    """Change the compute partition of device *dev*.

    @param partition: Passed unchanged to ``rsmi_dev_compute_partition_set``
    @return: True on success, False otherwise
    """
    status = rocm_lib.rsmi_dev_compute_partition_set(dev, partition)
    succeeded = rsmi_ret_ok(status)
    return succeeded
def smi_reset_device_compute_partition(dev):
    """Revert the compute partition of device *dev* to its boot state.

    @return: True on success, False otherwise
    """
    status = rocm_lib.rsmi_dev_compute_partition_reset(dev)
    succeeded = rsmi_ret_ok(status)
    return succeeded
- # Memory partition functions
def smi_get_device_memory_partition(dev):
    """Return the memory partition of device *dev* as a string, '' on failure."""
    partition_buf = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
    status = rocm_lib.rsmi_dev_memory_partition_get(
        dev, byref(partition_buf), RSMI_MAX_BUFFER_LENGTH)
    if not rsmi_ret_ok(status):
        return ''
    return partition_buf.value.decode()
def smi_set_device_memory_partition(dev, partition):
    """Change the memory partition of device *dev*.

    @param partition: Passed unchanged to ``rsmi_dev_memory_partition_set``
    @return: True on success, False otherwise
    """
    status = rocm_lib.rsmi_dev_memory_partition_set(dev, partition)
    succeeded = rsmi_ret_ok(status)
    return succeeded
def smi_reset_device_memory_partition(dev):
    """Revert the memory partition of device *dev* to its boot state.

    @return: True on success, False otherwise
    """
    status = rocm_lib.rsmi_dev_memory_partition_reset(dev)
    succeeded = rsmi_ret_ok(status)
    return succeeded
- # Hardware Topology functions
def smi_get_device_topo_numa_node_number(dev):
    """Return the NUMA node number reported for device *dev*, or -1 on failure."""
    node = c_uint32()
    status = rocm_lib.rsmi_topo_get_numa_node_number(dev, byref(node))
    if not rsmi_ret_ok(status):
        return -1
    return node.value
def smi_get_device_topo_link_weight(dev_src, dev_dst):
    """Return the weight of the link between two devices, or -1 on failure."""
    link_weight = c_uint64()
    status = rocm_lib.rsmi_topo_get_link_weight(
        dev_src, dev_dst, byref(link_weight))
    if not rsmi_ret_ok(status):
        return -1
    return link_weight.value
def smi_get_device_minmax_bandwidth(dev_src, dev_dst):
    """Return ``(min, max)`` IO link bandwidth between two devices.

    API works if src and dst are connected via XGMI and are 1 hop away.

    @param dev_src: Source device index
    @param dev_dst: Destination device index
    @return: ``(min_bandwidth, max_bandwidth)``, or -1 if the link type or the
        bandwidth could not be queried
    @raises AssertionError: the devices are not exactly 1 hop apart
    """
    link = smi_get_device_link_type(dev_src, dev_dst)
    if link == -1:
        # Link query failed; report it like every other failure instead of
        # raising TypeError from indexing -1.
        return -1
    if link[0] != 1:
        # Raised explicitly so the check also runs under ``python -O``.
        raise AssertionError('Devices must be 1 hop away')

    min_bandwidth = c_uint64()
    max_bandwidth = c_uint64()
    status = rocm_lib.rsmi_minmax_bandwidth_get(
        dev_src, dev_dst, byref(min_bandwidth), byref(max_bandwidth))
    if not rsmi_ret_ok(status):
        return -1
    return (min_bandwidth.value, max_bandwidth.value)
def smi_get_device_link_type(dev_src, dev_dst):
    """Return ``(hops, link_type)`` for the link between two devices.

    Returns -1 on failure.
    """
    hop_count = c_uint64()
    io_link = rsmi_io_link_type()
    status = rocm_lib.rsmi_topo_get_link_type(
        dev_src, dev_dst, byref(hop_count), byref(io_link))
    if not rsmi_ret_ok(status):
        return -1
    return (hop_count.value, io_link.value)
def smi_is_device_p2p_accessible(dev_src, dev_dst):
    """Return True if the two devices are P2P accessible, -1 on failure."""
    p2p_flag = c_bool()
    status = rocm_lib.rsmi_is_P2P_accessible(dev_src, dev_dst, byref(p2p_flag))
    if not rsmi_ret_ok(status):
        return -1
    return p2p_flag.value
def smi_get_device_compute_process():
    """Return the list of process ids running compute on the system.

    The library is queried twice: first with no buffer to learn the process
    count, then with a buffer holding that count plus 10 spare entries for
    processes started in between.

    @return: List of process ids; empty if either query fails
    """
    num_procs = c_uint32()
    status = rocm_lib.rsmi_compute_process_info_get(None, byref(num_procs))
    if not rsmi_ret_ok(status):
        return []

    buff_sz = num_procs.value + 10
    proc_info = (rsmi_process_info_t * buff_sz)()
    # Pass the real buffer capacity; with the stale count the spare entries
    # could never be filled.
    num_procs.value = buff_sz
    status = rocm_lib.rsmi_compute_process_info_get(
        byref(proc_info), byref(num_procs))
    if not rsmi_ret_ok(status):
        return []

    # Never index past the buffer, whatever count comes back.
    written = min(num_procs.value, buff_sz)
    return [proc_info[i].process_id for i in range(written)]
def smi_get_compute_process_info_by_device(device_id: int, proc_ids: list) -> list:
    """Collect compute process info on one device for the given process IDs.

    Args:
        device_id: The device index to query
        proc_ids: List of process IDs to get info for

    Returns:
        List of ``rsmi_process_info_t`` structures, one per process ID whose
        query succeeded; failed queries are skipped.
    """
    collected = []
    for pid in proc_ids:
        info = rsmi_process_info_t()
        status = rocm_lib.rsmi_compute_process_info_by_device_get(
            pid, device_id, byref(info))
        if not rsmi_ret_ok(status):
            continue
        collected.append(info)
    return collected
def smi_get_device_average_power(dev):
    """Return the average power of device *dev*, or -1 on failure.

    The raw value for sensor 0 is scaled by 1e-6 (presumably microwatts to
    watts -- confirm against the ROCm SMI API documentation).

    @param dev: Device index
    """
    # rsmi_dev_power_ave_get writes a 64-bit value, so the output buffer must
    # be 64 bits wide.
    power = c_uint64()
    status = rocm_lib.rsmi_dev_power_ave_get(dev, 0, byref(power))
    if not rsmi_ret_ok(status):
        return -1
    return power.value * 1e-6
- # XGMI functions
def smi_get_device_xgmi_error_status(dev):
    """Return the XGMI error status of device *dev*, or -1 on failure."""
    xgmi_status = rsmi_xgmi_status_t()
    status = rocm_lib.rsmi_dev_xgmi_error_status(dev, byref(xgmi_status))
    if not rsmi_ret_ok(status):
        return -1
    return xgmi_status.value
def smi_reset_device_xgmi_error(dev):
    """Reset the XGMI error status of device *dev*.

    @return: True on success, False otherwise
    """
    status = rocm_lib.rsmi_dev_xgmi_error_reset(dev)
    succeeded = rsmi_ret_ok(status)
    return succeeded
def smi_get_device_xgmi_hive_id(dev):
    """Return the XGMI hive ID of device *dev*, or -1 on failure."""
    hive = c_uint64()
    status = rocm_lib.rsmi_dev_xgmi_hive_id_get(dev, byref(hive))
    if not rsmi_ret_ok(status):
        return -1
    return hive.value
|