machine_info.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. # -------------------------------------------------------------------------
  2. # Copyright (c) Microsoft Corporation. All rights reserved.
  3. # Licensed under the MIT License.
  4. # --------------------------------------------------------------------------
# Dumps machine information (GPU, CPU, memory, OS, Python and related packages) for use in notebooks.
  6. import argparse
  7. import importlib.metadata
  8. import json
  9. import logging
  10. import platform
  11. from os import environ
  12. import cpuinfo
  13. import psutil
  14. from py3nvml.py3nvml import (
  15. NVMLError,
  16. nvmlDeviceGetCount,
  17. nvmlDeviceGetHandleByIndex,
  18. nvmlDeviceGetMemoryInfo,
  19. nvmlDeviceGetName,
  20. nvmlInit,
  21. nvmlShutdown,
  22. nvmlSystemGetDriverVersion,
  23. )
  24. class MachineInfo:
  25. """Class encapsulating Machine Info logic."""
  26. def __init__(self, silent=False, logger=None):
  27. self.silent = silent
  28. if logger is None:
  29. logging.basicConfig(
  30. format="%(asctime)s - %(name)s - %(levelname)s: %(message)s",
  31. level=logging.INFO,
  32. )
  33. self.logger = logging.getLogger(__name__)
  34. else:
  35. self.logger = logger
  36. self.machine_info = None
  37. try:
  38. self.machine_info = self.get_machine_info()
  39. except Exception:
  40. self.logger.exception("Exception in getting machine info.")
  41. self.machine_info = None
  42. def get_machine_info(self):
  43. """Get machine info in metric format"""
  44. gpu_info = self.get_gpu_info_by_nvml()
  45. cpu_info = cpuinfo.get_cpu_info()
  46. machine_info = {
  47. "gpu": gpu_info,
  48. "cpu": self.get_cpu_info(),
  49. "memory": self.get_memory_info(),
  50. "os": platform.platform(),
  51. "python": self._try_get(cpu_info, ["python_version"]),
  52. "packages": self.get_related_packages(),
  53. "onnxruntime": self.get_onnxruntime_info(),
  54. "pytorch": self.get_pytorch_info(),
  55. "tensorflow": self.get_tensorflow_info(),
  56. }
  57. return machine_info
  58. def get_memory_info(self) -> dict:
  59. """Get memory info"""
  60. mem = psutil.virtual_memory()
  61. return {"total": mem.total, "available": mem.available}
  62. def _try_get(self, cpu_info: dict, names: list) -> str:
  63. for name in names:
  64. if name in cpu_info:
  65. value = cpu_info[name]
  66. if isinstance(value, (list, tuple)):
  67. return ",".join([str(i) for i in value])
  68. return value
  69. return ""
  70. def get_cpu_info(self) -> dict:
  71. """Get CPU info"""
  72. cpu_info = cpuinfo.get_cpu_info()
  73. return {
  74. "brand": self._try_get(cpu_info, ["brand", "brand_raw"]),
  75. "cores": psutil.cpu_count(logical=False),
  76. "logical_cores": psutil.cpu_count(logical=True),
  77. "hz": self._try_get(cpu_info, ["hz_actual"]),
  78. "l2_cache": self._try_get(cpu_info, ["l2_cache_size"]),
  79. "flags": self._try_get(cpu_info, ["flags"]),
  80. "processor": platform.uname().processor,
  81. }
  82. def get_gpu_info_by_nvml(self) -> dict:
  83. """Get GPU info using nvml"""
  84. gpu_info_list = []
  85. driver_version = None
  86. try:
  87. nvmlInit()
  88. driver_version = nvmlSystemGetDriverVersion()
  89. deviceCount = nvmlDeviceGetCount() # noqa: N806
  90. for i in range(deviceCount):
  91. handle = nvmlDeviceGetHandleByIndex(i)
  92. info = nvmlDeviceGetMemoryInfo(handle)
  93. gpu_info = {}
  94. gpu_info["memory_total"] = info.total
  95. gpu_info["memory_available"] = info.free
  96. gpu_info["name"] = nvmlDeviceGetName(handle)
  97. gpu_info_list.append(gpu_info)
  98. nvmlShutdown()
  99. except NVMLError as error:
  100. if not self.silent:
  101. self.logger.error("Error fetching GPU information using nvml: %s", error)
  102. return None
  103. result = {"driver_version": driver_version, "devices": gpu_info_list}
  104. if "CUDA_VISIBLE_DEVICES" in environ:
  105. result["cuda_visible"] = environ["CUDA_VISIBLE_DEVICES"]
  106. return result
  107. def get_related_packages(self) -> list[str]:
  108. related_packages = {
  109. "onnxruntime-gpu",
  110. "onnxruntime",
  111. "onnx",
  112. "transformers",
  113. "protobuf",
  114. "sympy",
  115. "torch",
  116. "tensorflow",
  117. "flatbuffers",
  118. "numpy",
  119. "onnxconverter-common",
  120. }
  121. related_packages_list = {}
  122. for dist in importlib.metadata.distributions():
  123. if dist.metadata["Name"].lower() in related_packages:
  124. related_packages_list[dist.metadata["Name"].lower()] = dist.version
  125. return related_packages_list
  126. def get_onnxruntime_info(self) -> dict:
  127. try:
  128. import onnxruntime # noqa: PLC0415
  129. return {
  130. "version": onnxruntime.__version__,
  131. "support_gpu": "CUDAExecutionProvider" in onnxruntime.get_available_providers(),
  132. }
  133. except ImportError as error:
  134. if not self.silent:
  135. self.logger.exception(error)
  136. return None
  137. except Exception as exception:
  138. if not self.silent:
  139. self.logger.exception(exception, False)
  140. return None
  141. def get_pytorch_info(self) -> dict:
  142. try:
  143. import torch # noqa: PLC0415
  144. return {
  145. "version": torch.__version__,
  146. "support_gpu": torch.cuda.is_available(),
  147. "cuda": torch.version.cuda,
  148. }
  149. except ImportError as error:
  150. if not self.silent:
  151. self.logger.exception(error)
  152. return None
  153. except Exception as exception:
  154. if not self.silent:
  155. self.logger.exception(exception, False)
  156. return None
  157. def get_tensorflow_info(self) -> dict:
  158. try:
  159. import tensorflow as tf # noqa: PLC0415
  160. return {
  161. "version": tf.version.VERSION,
  162. "git_version": tf.version.GIT_VERSION,
  163. "support_gpu": tf.test.is_built_with_cuda(),
  164. }
  165. except ImportError as error:
  166. if not self.silent:
  167. self.logger.exception(error)
  168. return None
  169. except ModuleNotFoundError as error:
  170. if not self.silent:
  171. self.logger.exception(error)
  172. return None
  173. def parse_arguments():
  174. parser = argparse.ArgumentParser()
  175. parser.add_argument(
  176. "--silent",
  177. required=False,
  178. action="store_true",
  179. help="Do not print error message",
  180. )
  181. parser.set_defaults(silent=False)
  182. args = parser.parse_args()
  183. return args
  184. def get_machine_info(silent=True) -> str:
  185. machine = MachineInfo(silent)
  186. return json.dumps(machine.machine_info, indent=2)
  187. def get_device_info(silent=True) -> str:
  188. machine = MachineInfo(silent)
  189. info = machine.machine_info
  190. if info:
  191. info = {key: value for key, value in info.items() if key in ["gpu", "cpu", "memory"]}
  192. return json.dumps(info, indent=2)
  193. if __name__ == "__main__":
  194. args = parse_arguments()
  195. print(get_machine_info(args.silent))