pyamdsmi.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. # MIT License
  2. #
  3. # Copyright (c) 2023 Advanced Micro Devices, Inc.
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in all
  13. # copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. # Python bindings for ROCm-SMI library
  23. from ctypes import *
  24. from os.path import join, realpath, isfile
  25. import os
  26. import logging
  27. import subprocess
  28. import sys
  29. import threading
  30. from enum import IntEnum, auto
  31. ## Error checking
  32. class ROCMLError_NotSupported(Exception):
  33. pass
  34. class ROCMLError_FunctionNotFound(Exception):
  35. pass
  36. class ROCMLError_LibraryNotFound(Exception):
  37. pass
  38. class ROCMLError_DriverNotLoaded(Exception):
  39. pass
  40. class ROCMLError_Unknown(Exception):
  41. pass
  42. class ROCMLError_Uninitialized(Exception):
  43. pass
  44. class ROCMLState(IntEnum):
  45. UNINITIALIZED = auto()
  46. """No attempt yet made to initialize PyROCML"""
  47. INITIALIZED = auto()
  48. """PyROCML was successfully initialized"""
  49. DISABLED_PYROCML_NOT_AVAILABLE = auto()
  50. """PyROCML not installed"""
  51. DISABLED_CONFIG = auto()
  52. """PyROCML diagnostics disabled by ``distributed.diagnostics.rocml`` config setting"""
  53. DISABLED_LIBRARY_NOT_FOUND = auto()
  54. """PyROCML available, but ROCML not installed"""
  55. LIBROCM_NAME = 'librocm_smi64.so'
  56. RSMI_MAX_BUFFER_LENGTH = 256
  57. # Policy enums
  58. RSMI_MAX_NUM_FREQUENCIES = 32
  59. class rsmi_status_t(c_int):
  60. RSMI_STATUS_SUCCESS = 0x0
  61. RSMI_STATUS_INVALID_ARGS = 0x1
  62. RSMI_STATUS_NOT_SUPPORTED = 0x2
  63. RSMI_STATUS_FILE_ERROR = 0x3
  64. RSMI_STATUS_PERMISSION = 0x4
  65. RSMI_STATUS_OUT_OF_RESOURCES = 0x5
  66. RSMI_STATUS_INTERNAL_EXCEPTION = 0x6
  67. RSMI_STATUS_INPUT_OUT_OF_BOUNDS = 0x7
  68. RSMI_STATUS_INIT_ERROR = 0x8
  69. RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR
  70. RSMI_STATUS_NOT_YET_IMPLEMENTED = 0x9
  71. RSMI_STATUS_NOT_FOUND = 0xA
  72. RSMI_STATUS_INSUFFICIENT_SIZE = 0xB
  73. RSMI_STATUS_INTERRUPT = 0xC
  74. RSMI_STATUS_UNEXPECTED_SIZE = 0xD
  75. RSMI_STATUS_NO_DATA = 0xE
  76. RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
  77. #Dictionary of rsmi ret codes and it's verbose output
  78. rsmi_status_verbose_err_out = {
  79. rsmi_status_t.RSMI_STATUS_SUCCESS: 'Operation was successful',
  80. rsmi_status_t.RSMI_STATUS_INVALID_ARGS: 'Invalid arguments provided',
  81. rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: 'Not supported on the given system',
  82. rsmi_status_t.RSMI_STATUS_FILE_ERROR: 'Problem accessing a file',
  83. rsmi_status_t.RSMI_STATUS_PERMISSION: 'Permission denied',
  84. rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource',
  85. rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught',
  86. rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range',
  87. rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occurred during rsmi initialization',
  88. rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup',
  89. rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found',
  90. rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available',
  91. rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occurred during execution',
  92. rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
  93. rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
  94. rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occurred'
  95. }
  96. class rsmi_init_flags_t(c_int):
  97. RSMI_INIT_FLAG_ALL_GPUS = 0x1
  98. class rsmi_memory_type_t(c_int):
  99. RSMI_MEM_TYPE_FIRST = 0
  100. RSMI_MEM_TYPE_VRAM = RSMI_MEM_TYPE_FIRST
  101. RSMI_MEM_TYPE_VIS_VRAM = 1
  102. RSMI_MEM_TYPE_GTT = 2
  103. RSMI_MEM_TYPE_LAST = RSMI_MEM_TYPE_GTT
  104. # memory_type_l includes names for with rsmi_memory_type_t
  105. # Usage example to get corresponding names:
  106. # memory_type_l[rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM] will return string 'vram'
  107. memory_type_l = ['VRAM', 'VIS_VRAM', 'GTT']
  108. class rsmi_retired_page_record_t(Structure):
  109. _fields_ = [('page_address', c_uint64),
  110. ('page_size', c_uint64),
  111. ('status', c_int)]
  112. class rsmi_sw_component_t(c_int):
  113. RSMI_SW_COMP_FIRST = 0x0
  114. RSMI_SW_COMP_DRIVER = RSMI_SW_COMP_FIRST
  115. RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER
  116. class rsmi_frequencies_t(Structure):
  117. _fields_ = [('num_supported', c_int32),
  118. ('current', c_uint32),
  119. ('frequency', c_uint64 * RSMI_MAX_NUM_FREQUENCIES)]
  120. class rsmi_pcie_bandwidth_t(Structure):
  121. _fields_ = [('transfer_rate', rsmi_frequencies_t),
  122. ('lanes', c_uint32 * RSMI_MAX_NUM_FREQUENCIES)]
  123. class rsmi_process_info_t(Structure):
  124. _fields_ = [('process_id', c_uint32),
  125. ('pasid', c_uint32), # PSA: Power Spectrum Analysis ?
  126. ('vram_usage', c_uint64),
  127. ('sdma_usage', c_uint64), # SDMA: System Direct Memory Access
  128. ('cu_occupancy', c_uint32)]
  129. class rsmi_xgmi_status_t(c_int):
  130. RSMI_XGMI_STATUS_NO_ERRORS = 0
  131. RSMI_XGMI_STATUS_ERROR = 1
  132. RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 2
  133. class rsmi_io_link_type(c_int):
  134. RSMI_IOLINK_TYPE_UNDEFINED = 0
  135. RSMI_IOLINK_TYPE_HYPERTRANSPORT = 1
  136. RSMI_IOLINK_TYPE_PCIEXPRESS = 2
  137. RSMI_IOLINK_TYPE_AMBA = 3
  138. RSMI_IOLINK_TYPE_MIPI = 4
  139. RSMI_IOLINK_TYPE_QPI_1_1 = 5
  140. RSMI_IOLINK_TYPE_RESERVED1 = 6
  141. RSMI_IOLINK_TYPE_RESERVED2 = 7
  142. RSMI_IOLINK_TYPE_RAPID_IO = 8
  143. RSMI_IOLINK_TYPE_INFINIBAND = 9
  144. RSMI_IOLINK_TYPE_RESERVED3 = 10
  145. RSMI_IOLINK_TYPE_XGMI = 11
  146. RSMI_IOLINK_TYPE_XGOP = 12
  147. RSMI_IOLINK_TYPE_GZ = 13
  148. RSMI_IOLINK_TYPE_ETHERNET_RDMA = 14
  149. RSMI_IOLINK_TYPE_RDMA_OTHER = 15
  150. RSMI_IOLINK_TYPE_OTHER = 16
  151. RSMI_IOLINK_TYPE_NUMIOLINKTYPES = 17
  152. RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF
  153. ## Library loading
  154. rocm_lib = None
  155. lib_load_lock = threading.Lock()
  156. _rocm_lib_refcount = 0
  157. ## Function access, to prevent lib_load_lock deadlock
  158. _rocml_get_function_ptr_cache = dict()
  159. def _rocml_get_function_ptr(name):
  160. global rocm_lib
  161. if name in _rocml_get_function_ptr_cache:
  162. return _rocml_get_function_ptr_cache[name]
  163. lib_load_lock.acquire()
  164. try:
  165. # ensure library was loaded
  166. if rocm_lib == None:
  167. raise ROCMLError_Uninitialized
  168. try:
  169. _rocml_get_function_ptr_cache[name] = getattr(rocm_lib, name)
  170. return _rocml_get_function_ptr_cache[name]
  171. except AttributeError:
  172. raise ROCMLError_FunctionNotFound
  173. finally:
  174. # lock is always freed
  175. lib_load_lock.release()
  176. def _load_rocm_library():
  177. """Load ROCm library if not already loaded"""
  178. global rocm_lib
  179. if rocm_lib == None:
  180. lib_load_lock.acquire()
  181. try:
  182. if rocm_lib == None:
  183. try:
  184. if sys.platform[:3] == 'win':
  185. raise ROCMLError_NotSupported('Windows platform is not supported yet')
  186. else:
  187. # assume linux
  188. path_librocm = _find_lib_rocm()
  189. cdll.LoadLibrary(path_librocm)
  190. rocm_lib = CDLL(path_librocm)
  191. except OSError:
  192. raise ROCMLError_LibraryNotFound('ROCm library not found')
  193. if rocm_lib == None:
  194. raise ROCMLError_LibraryNotFound('ROCm library not found')
  195. finally:
  196. lib_load_lock.release()
  197. def _find_lib_rocm():
  198. """search for librocm and returns path
  199. if search fails, returns empty string
  200. """
  201. rocm_path = os.environ.get('ROCM_PATH', '/opt/rocm')
  202. rocm_lib_path = join(rocm_path, f'lib/{LIBROCM_NAME}')
  203. return rocm_lib_path if isfile(rocm_lib_path) else ''
  204. def _driver_initialized():
  205. """ Returns true if amdgpu is found in the list of initialized modules
  206. """
  207. initialized = ''
  208. try:
  209. initialized = str(subprocess.check_output("cat /sys/module/amdgpu/initstate |grep live", shell=True, stderr=subprocess.DEVNULL))
  210. except subprocess.CalledProcessError:
  211. pass
  212. return len(initialized) > 0
  213. def smi_initialize():
  214. """Initialize ROCm binding of SMI"""
  215. _load_rocm_library()
  216. if _driver_initialized():
  217. ret_init = rocm_lib.rsmi_init(0)
  218. if ret_init != 0:
  219. logging.debug("ROCm SMI init returned value: %s", ret_init)
  220. raise RuntimeError('ROCm SMI initialization failed')
  221. else:
  222. raise RuntimeError('ROCm driver initilization failed')
  223. # update reference count
  224. global _rocm_lib_refcount
  225. lib_load_lock.acquire()
  226. _rocm_lib_refcount += 1
  227. lib_load_lock.release()
  228. def rsmi_ret_ok(my_ret):
  229. """ Returns true if RSMI call status is 0 (success)
  230. @param device: DRM device identifier
  231. @param my_ret: Return of RSMI call (rocm_smi_lib API)
  232. @param metric: Parameter of GPU currently being analyzed
  233. """
  234. if my_ret != rsmi_status_t.RSMI_STATUS_SUCCESS:
  235. err_str = c_char_p()
  236. rocm_lib.rsmi_status_string(my_ret, byref(err_str))
  237. logging.debug("ROCm RSMI error: %s", err_str.value.decode())
  238. return False
  239. return True
  240. def smi_shutdown():
  241. """leave the library loaded, but shutdown the interface"""
  242. rsmi_ret_ok(rocm_lib.rsmi_shut_down())
  243. # update reference count
  244. global _rocm_lib_refcount
  245. lib_load_lock.acquire()
  246. _rocm_lib_refcount -= 1
  247. lib_load_lock.release()
  248. def smi_get_kernel_version():
  249. """returns ROCm kernerl driver version"""
  250. ver_str = create_string_buffer(256)
  251. ret = rocm_lib.rsmi_version_str_get(rsmi_sw_component_t.RSMI_SW_COMP_DRIVER, ver_str, 256)
  252. return ver_str.value.decode() if rsmi_ret_ok(ret) else ''
  253. def smi_get_device_id(dev):
  254. """returns device id of the device as 64bit integer"""
  255. uid = c_uint64()
  256. ret = rocm_lib.rsmi_dev_id_get(dev, byref(uid))
  257. return uid.value if rsmi_ret_ok(ret) else -1
  258. def smi_get_device_count():
  259. """returns a list of GPU devices """
  260. num_device = c_uint32(0)
  261. ret = rocm_lib.rsmi_num_monitor_devices(byref(num_device))
  262. return num_device.value if rsmi_ret_ok(ret) else -1
  263. def smi_get_device_name(dev):
  264. """returns the name of a GPU device"""
  265. series = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
  266. ret = rocm_lib.rsmi_dev_name_get(dev, series, RSMI_MAX_BUFFER_LENGTH)
  267. return series.value.decode() if rsmi_ret_ok(ret) else ''
  268. def smi_get_device_unique_id(dev):
  269. """returns unique id of the device as 64bit integer"""
  270. uid = c_uint64()
  271. ret = rocm_lib.rsmi_dev_unique_id_get(dev, byref(uid))
  272. return uid.value if rsmi_ret_ok(ret) else -1
  273. def smi_get_device_utilization(dev):
  274. """returns GPU device busy percent of device_id dev"""
  275. busy_percent = c_uint32()
  276. ret = rocm_lib.rsmi_dev_busy_percent_get(dev, byref(busy_percent))
  277. return busy_percent.value if rsmi_ret_ok(ret) else -1
  278. def smi_get_device_memory_used(dev, type='VRAM'):
  279. """returns used memory of device_id dev in bytes"""
  280. type_idx = memory_type_l.index(type)
  281. used = c_uint64()
  282. ret = rocm_lib.rsmi_dev_memory_usage_get(dev, type_idx, byref(used))
  283. return used.value if rsmi_ret_ok(ret) else -1
  284. def smi_get_device_memory_total(dev, type='VRAM'):
  285. """returns total memory of device_id dev in bytes"""
  286. type_idx = memory_type_l.index(type)
  287. total = c_uint64()
  288. ret = rocm_lib.rsmi_dev_memory_total_get(dev, type_idx, byref(total))
  289. return total.value if rsmi_ret_ok(ret) else -1
  290. def smi_get_device_memory_busy(dev):
  291. """returns percentage of time any device memory is being used"""
  292. busy_percent = c_uint32()
  293. ret = rocm_lib.rsmi_dev_memory_busy_percent_get(dev, byref(busy_percent))
  294. return busy_percent.value if rsmi_ret_ok(ret) else -1
  295. def smi_get_device_memory_reserved_pages(dev):
  296. """returns info about reserved memory pages"""
  297. num_pages = c_uint32()
  298. records = rsmi_retired_page_record_t()
  299. ret = rocm_lib.rsmi_dev_memory_reserved_pages_get(dev, byref(num_pages), byref(records))
  300. return (num_pages.value, records) if rsmi_ret_ok(ret) else -1
  301. # PCIE functions
  302. def smi_get_device_pcie_bandwidth(dev):
  303. """returns list of possible pcie bandwidths for the device in bytes/sec"""
  304. bandwidth = rsmi_pcie_bandwidth_t()
  305. ret = rocm_lib.rsmi_dev_pci_bandwidth_get(dev, byref(bandwidth))
  306. return bandwidth if rsmi_ret_ok(ret) else -1
  307. def smi_get_device_pci_id(dev):
  308. """returns unique PCI ID of the device in 64bit Hex with format:
  309. BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) |
  310. ((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7)
  311. """
  312. bdfid = c_uint64()
  313. ret = rocm_lib.rsmi_dev_pci_id_get(dev, byref(bdfid))
  314. return bdfid.value if rsmi_ret_ok(ret) else -1
  315. def smi_get_device_topo_numa_affinity(dev):
  316. """returns the NUMA node associated with the device"""
  317. numa_node = c_uint32()
  318. ret = reocm_lib.rsmi_topo_numa_affinity_get(dev, byref(numa_node))
  319. return numa_node.value if rsmi_ret_ok(ret) else -1
  320. def smi_get_device_pcie_throughput(dev):
  321. """returns measured pcie throughput for the device in bytes/sec"""
  322. sent = c_uint64()
  323. recv = c_uint64()
  324. max_pkt_sz = c_uint64()
  325. ret = rocm_lib.rsmi_dev_pci_throughput_get(dev, byref(sent), byref(recv), byref(max_pkt_sz))
  326. return (recv.value + sent.value) * max_pkt_sz.value if rsmi_ret_ok(ret) else -1
  327. def smi_get_device_pci_replay_counter(dev):
  328. """return PCIe replay counter of the device"""
  329. counter = c_uint64()
  330. ret = rocm_lib.rsmi_dev_pci_replay_counter_get(dev, byref(counter))
  331. return counter.value if rsmi_ret_ok(ret) else -1
  332. # Compute partition functions
  333. def smi_get_device_compute_partition(dev):
  334. """returns the compute partition of the device"""
  335. partition = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
  336. ret = rocm_lib.rsmi_dev_compute_partition_get(dev, byref(partition), RSMI_MAX_BUFFER_LENGTH)
  337. return partition.value.decode() if rsmi_ret_ok(ret) else ''
  338. def smi_set_device_compute_partition(dev, partition):
  339. """modifies the compute partition of the selected device"""
  340. ret = rocm_lib.rsmi_dev_compute_partition_set(dev, partition)
  341. return rsmi_ret_ok(ret)
  342. def smi_reset_device_compute_partition(dev):
  343. """reverts the compute partition of the selected device to its boot state"""
  344. ret = rocm_lib.rsmi_dev_compute_partition_reset(dev)
  345. return rsmi_ret_ok(ret)
  346. # Memory partition functions
  347. def smi_get_device_memory_partition(dev):
  348. """returns the memory partition of the device"""
  349. partition = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
  350. ret = rocm_lib.rsmi_dev_memory_partition_get(dev, byref(partition), RSMI_MAX_BUFFER_LENGTH)
  351. return partition.value.decode() if rsmi_ret_ok(ret) else ''
  352. def smi_set_device_memory_partition(dev, partition):
  353. """modifies the memory partition of the selected device"""
  354. ret = rocm_lib.rsmi_dev_memory_partition_set(dev, partition)
  355. return rsmi_ret_ok(ret)
  356. def smi_reset_device_memory_partition(dev):
  357. """reverts the memory partition of the selected device to its boot state"""
  358. ret = rocm_lib.rsmi_dev_memory_partition_reset(dev)
  359. return rsmi_ret_ok(ret)
  360. # Hardware Topology functions
  361. def smi_get_device_topo_numa_node_number(dev):
  362. """returns the NUMA node associated with the device"""
  363. numa_node = c_uint32()
  364. ret = rocm_lib.rsmi_topo_get_numa_node_number(dev, byref(numa_node))
  365. return numa_node.value if rsmi_ret_ok(ret) else -1
  366. def smi_get_device_topo_link_weight(dev_src, dev_dst):
  367. """returns the weight of the link between two devices"""
  368. weight = c_uint64()
  369. ret = rocm_lib.rsmi_topo_get_link_weight(dev_src, dev_dst, byref(weight))
  370. return weight.value if rsmi_ret_ok(ret) else -1
  371. def smi_get_device_minmax_bandwidth(dev_src, dev_dst):
  372. """returns the minimum and maximum io link bandwidth between two devices
  373. API works if src and dst are connected via XGMI and are 1 hop away.
  374. """
  375. assert smi_get_device_link_type(dev_src, dev_dst)[0] == 1, 'Devices must be 1 hop away'
  376. min_bandwidth = c_uint64()
  377. max_bandwidth = c_uint64()
  378. ret = rocm_lib.rsmi_minmax_bandwidth_get(dev_src, dev_dst, byref(min_bandwidth), byref(max_bandwidth))
  379. return (min_bandwidth.value, max_bandwidth.value) if rsmi_ret_ok(ret) else -1
  380. def smi_get_device_link_type(dev_src, dev_dst):
  381. """returns the hops and the type of link between two devices"""
  382. hops = c_uint64()
  383. link_type = rsmi_io_link_type()
  384. ret = rocm_lib.rsmi_topo_get_link_type(dev_src, dev_dst, byref(hops), byref(link_type))
  385. return (hops.value, link_type.value) if rsmi_ret_ok(ret) else -1
  386. def smi_is_device_p2p_accessible(dev_src, dev_dst):
  387. """returns true if two devices are p2p accessible"""
  388. accessible = c_bool()
  389. ret = rocm_lib.rsmi_is_P2P_accessible(dev_src, dev_dst, byref(accessible))
  390. return accessible.value if rsmi_ret_ok(ret) else -1
  391. def smi_get_device_compute_process():
  392. """returns list of process ids running compute on the system"""
  393. num_procs = c_uint32()
  394. ret = rocm_lib.rsmi_compute_process_info_get(None, byref(num_procs))
  395. if rsmi_ret_ok(ret):
  396. buff_sz = num_procs.value + 10
  397. proc_info = (rsmi_process_info_t * buff_sz)()
  398. ret2 = rocm_lib.rsmi_compute_process_info_get(byref(proc_info), byref(num_procs))
  399. return [proc_info[i].process_id for i in range(num_procs.value)] if rsmi_ret_ok(ret2) else []
  400. else:
  401. return []
  402. def smi_get_compute_process_info_by_device(device_id: int, proc_ids: list) -> list:
  403. """Returns list of process info running compute on the specified device by process IDs.
  404. Args:
  405. device_id: The device index to query
  406. proc_ids: List of process IDs to get info for
  407. Returns:
  408. List of process info structures for the specified device and process IDs
  409. """
  410. proc_infos = []
  411. for proc_id in proc_ids:
  412. proc_info = rsmi_process_info_t()
  413. ret = rocm_lib.rsmi_compute_process_info_by_device_get(proc_id, device_id, byref(proc_info))
  414. if rsmi_ret_ok(ret):
  415. proc_infos.append(proc_info)
  416. return proc_infos
  417. def smi_get_device_average_power(dev):
  418. """returns average power of device_id dev"""
  419. power = c_uint32()
  420. ret = rocm_lib.rsmi_dev_power_ave_get(dev, 0, byref(power))
  421. return power.value * 1e-6 if rsmi_ret_ok(ret) else -1
  422. # XGMI functions
  423. def smi_get_device_xgmi_error_status(dev):
  424. """returns XGMI error status for a device"""
  425. status = rsmi_xgmi_status_t()
  426. ret = rocm_lib.rsmi_dev_xgmi_error_status(dev, byref(status))
  427. return status.value if rsmi_ret_ok(ret) else -1
  428. def smi_reset_device_xgmi_error(dev):
  429. """resets XGMI error status for a device"""
  430. ret = rocm_lib.rsmi_dev_xgmi_error_reset(dev)
  431. return rsmi_ret_ok(ret)
  432. def smi_get_device_xgmi_hive_id(dev):
  433. """returns XGMI hive ID for a device"""
  434. hive_id = c_uint64()
  435. ret = rocm_lib.rsmi_dev_xgmi_hive_id_get(dev, byref(hive_id))
  436. return hive_id.value if rsmi_ret_ok(ret) else -1