collect_env.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945
  1. # mypy: allow-untyped-defs
# Unlike the rest of PyTorch, this file must be Python 2 compliant.
  3. # This script outputs relevant system environment info
  4. # Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
  5. import datetime
  6. import json
  7. import locale
  8. import os
  9. import re
  10. import subprocess
  11. import sys
  12. from collections import namedtuple
  13. from typing import cast as _cast, Dict as _Dict
# torch is optional for this script: if importing it fails with any of the
# listed exception types, the torch-dependent fields are reported as "N/A"
# or left uncollected instead of aborting the whole report.
try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False
# System Environment Information
# One field per item collected by get_env_info(); pretty_str() substitutes
# the fields into env_info_fmt by name.
SystemEnv = namedtuple(
    "SystemEnv",
    [
        "torch_version",
        "is_debug_build",
        "cuda_compiled_version",
        "gcc_version",
        "clang_version",
        "cmake_version",
        "os",
        "libc_version",
        "python_version",
        "python_platform",
        "is_cuda_available",
        "cuda_runtime_version",
        "cuda_module_loading",
        "nvidia_driver_version",
        "nvidia_gpu_models",
        "cudnn_version",
        "is_xpu_available",
        "pip_version",  # 'pip' or 'pip3'
        "pip_packages",
        "conda_packages",
        "hip_compiled_version",
        "hip_runtime_version",
        "miopen_runtime_version",
        "caching_allocator_config",
        "is_xnnpack_available",
        "cpu_info",
    ],
)
# Substring filters applied to package listings: a listing line is kept when
# it contains any of the active patterns (see get_conda_packages and
# get_pip_packages).

# Patterns used for both the conda and the pip listing.
COMMON_PATTERNS = [
    "torch",
    "numpy",
    "triton",
    "optree",
]

# NVIDIA-related package name fragments (used for both listings).
NVIDIA_PATTERNS = [
    "cuda-cudart",
    "cuda-cupti",
    "cuda-libraries",
    "cuda-opencl",
    "cuda-nvrtc",
    "cuda-runtime",
    "cublas",
    "cudnn",
    "cufft",
    "curand",
    "cusolver",
    "cusparse",
    "nccl",
    "nvjitlink",
    "nvtx",
]

# oneAPI-related package name fragments (used for both listings).
ONEAPI_PATTERNS = [
    "dpcpp-cpp-rt",
    "intel-cmplr-lib-rt",
    "intel-cmplr-lib-ur",
    "intel-cmplr-lic-rt",
    "intel-opencl-rt",
    "intel-sycl-rt",
    "mkl",
    "onemkl-sycl-blas",
    "onemkl-sycl-dft",
    "onemkl-sycl-lapack",
    "onemkl-sycl-rng",
    "onemkl-sycl-sparse",
    "intel-openmp",
    "tbb",
    "impi-rt",
    "impi-devel",
    "oneccl",
    "oneccl-devel",
    "intel-pti",
    "umf",
    "tcmlib",
]

# Extra patterns applied only to the `conda list` output.
CONDA_PATTERNS = [
    "cudatoolkit",
    "soumith",
    "mkl",
    "magma",
]

# Extra patterns applied only to the `pip list` output.
PIP_PATTERNS = [
    "mypy",
    "flake8",
    "onnx",
]
  108. def run(command):
  109. """Return (return-code, stdout, stderr)."""
  110. shell = type(command) is str
  111. p = subprocess.Popen(
  112. command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell
  113. )
  114. raw_output, raw_err = p.communicate()
  115. rc = p.returncode
  116. if get_platform() == "win32":
  117. enc = "oem"
  118. else:
  119. enc = locale.getpreferredencoding()
  120. output = raw_output.decode(enc)
  121. err = raw_err.decode(enc)
  122. return rc, output.strip(), err.strip()
  123. def run_and_read_all(run_lambda, command):
  124. """Run command using run_lambda; reads and returns entire output if rc is 0."""
  125. rc, out, _ = run_lambda(command)
  126. if rc != 0:
  127. return None
  128. return out
  129. def run_and_parse_first_match(run_lambda, command, regex):
  130. """Run command using run_lambda, returns the first regex match if it exists."""
  131. rc, out, _ = run_lambda(command)
  132. if rc != 0:
  133. return None
  134. match = re.search(regex, out)
  135. if match is None:
  136. return None
  137. return match.group(1)
  138. def run_and_return_first_line(run_lambda, command):
  139. """Run command using run_lambda and returns first line if output is not empty."""
  140. rc, out, _ = run_lambda(command)
  141. if rc != 0:
  142. return None
  143. return out.split("\n")[0]
  144. def get_conda_packages(run_lambda, patterns=None):
  145. if patterns is None:
  146. patterns = CONDA_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS + ONEAPI_PATTERNS
  147. conda = os.environ.get("CONDA_EXE", "conda")
  148. out = run_and_read_all(run_lambda, "{} list".format(conda))
  149. if out is None:
  150. return out
  151. return "\n".join(
  152. line
  153. for line in out.splitlines()
  154. if not line.startswith("#") and any(name in line for name in patterns)
  155. )
  156. def get_gcc_version(run_lambda):
  157. return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")
  158. def get_clang_version(run_lambda):
  159. return run_and_parse_first_match(
  160. run_lambda, "clang --version", r"clang version (.*)"
  161. )
  162. def get_cmake_version(run_lambda):
  163. return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")
  164. def get_nvidia_driver_version(run_lambda):
  165. if get_platform() == "darwin":
  166. cmd = "kextstat | grep -i cuda"
  167. return run_and_parse_first_match(
  168. run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]"
  169. )
  170. smi = get_nvidia_smi()
  171. return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")
  172. def get_gpu_info(run_lambda):
  173. if get_platform() == "darwin" or (
  174. TORCH_AVAILABLE
  175. and hasattr(torch.version, "hip")
  176. and torch.version.hip is not None
  177. ):
  178. if TORCH_AVAILABLE and torch.cuda.is_available():
  179. if torch.version.hip is not None:
  180. prop = torch.cuda.get_device_properties(0)
  181. if hasattr(prop, "gcnArchName"):
  182. gcnArch = " ({})".format(prop.gcnArchName)
  183. else:
  184. gcnArch = "NoGCNArchNameOnOldPyTorch"
  185. else:
  186. gcnArch = ""
  187. return torch.cuda.get_device_name(None) + gcnArch
  188. return None
  189. smi = get_nvidia_smi()
  190. uuid_regex = re.compile(r" \(UUID: .+?\)")
  191. rc, out, _ = run_lambda(smi + " -L")
  192. if rc != 0:
  193. return None
  194. # Anonymize GPUs by removing their UUID
  195. return re.sub(uuid_regex, "", out)
  196. def get_running_cuda_version(run_lambda):
  197. return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")
  198. def get_cudnn_version(run_lambda):
  199. """Return a list of libcudnn.so; it's hard to tell which one is being used."""
  200. if get_platform() == "win32":
  201. system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
  202. cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
  203. where_cmd = os.path.join(system_root, "System32", "where")
  204. cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
  205. elif get_platform() == "darwin":
  206. # CUDA libraries and drivers can be found in /usr/local/cuda/. See
  207. # https://docs.nvidia.com/cuda/archive/9.0/cuda-installation-guide-mac-os-x/index.html#installation
  208. # https://docs.nvidia.com/deeplearning/cudnn/installation/latest/
  209. # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
  210. cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
  211. else:
  212. cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
  213. rc, out, _ = run_lambda(cudnn_cmd)
  214. # find will return 1 if there are permission errors or if not found
  215. if len(out) == 0 or (rc != 1 and rc != 0):
  216. l = os.environ.get("CUDNN_LIBRARY")
  217. if l is not None and os.path.isfile(l):
  218. return os.path.realpath(l)
  219. return None
  220. files_set = set()
  221. for fn in out.split("\n"):
  222. fn = os.path.realpath(fn) # eliminate symbolic links
  223. if os.path.isfile(fn):
  224. files_set.add(fn)
  225. if not files_set:
  226. return None
  227. # Alphabetize the result because the order is non-deterministic otherwise
  228. files = sorted(files_set)
  229. if len(files) == 1:
  230. return files[0]
  231. result = "\n".join(files)
  232. return "Probably one of the following:\n{}".format(result)
  233. def get_nvidia_smi():
  234. # Note: nvidia-smi is currently available only on Windows and Linux
  235. smi = "nvidia-smi"
  236. if get_platform() == "win32":
  237. system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
  238. program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
  239. legacy_path = os.path.join(
  240. program_files_root, "NVIDIA Corporation", "NVSMI", smi
  241. )
  242. new_path = os.path.join(system_root, "System32", smi)
  243. smis = [new_path, legacy_path]
  244. for candidate_smi in smis:
  245. if os.path.exists(candidate_smi):
  246. smi = '"{}"'.format(candidate_smi)
  247. break
  248. return smi
  249. def _detect_linux_pkg_manager():
  250. if get_platform() != "linux":
  251. return "N/A"
  252. for mgr_name in ["dpkg", "dnf", "yum", "zypper"]:
  253. rc, _, _ = run(f"which {mgr_name}")
  254. if rc == 0:
  255. return mgr_name
  256. return "N/A"
  257. def get_linux_pkg_version(run_lambda, pkg_name):
  258. pkg_mgr = _detect_linux_pkg_manager()
  259. if pkg_mgr == "N/A":
  260. return "N/A"
  261. grep_version = {
  262. "dpkg": {
  263. "field_index": 2,
  264. "command": "dpkg -l | grep {}",
  265. },
  266. "dnf": {
  267. "field_index": 1,
  268. "command": "dnf list | grep {}",
  269. },
  270. "yum": {
  271. "field_index": 1,
  272. "command": "yum list | grep {}",
  273. },
  274. "zypper": {
  275. "field_index": 2,
  276. "command": "zypper info {} | grep Version",
  277. },
  278. }
  279. # pyrefly: ignore [redundant-cast]
  280. field_index: int = int(_cast(int, grep_version[pkg_mgr]["field_index"]))
  281. cmd: str = str(grep_version[pkg_mgr]["command"])
  282. cmd = cmd.format(pkg_name)
  283. ret = run_and_read_all(run_lambda, cmd)
  284. if ret is None or ret == "":
  285. return "N/A"
  286. lst = re.sub(" +", " ", ret).split(" ")
  287. if len(lst) <= field_index:
  288. return "N/A"
  289. return lst[field_index]
  290. def get_intel_gpu_driver_version(run_lambda):
  291. lst = []
  292. platform = get_platform()
  293. if platform == "linux":
  294. pkgs = { # type: ignore[var-annotated]
  295. "dpkg": {
  296. "intel-opencl-icd",
  297. "libze1",
  298. "level-zero",
  299. },
  300. "dnf": {
  301. "intel-opencl",
  302. "level-zero",
  303. },
  304. "yum": {
  305. "intel-opencl",
  306. "level-zero",
  307. },
  308. "zypper": {
  309. "intel-opencl",
  310. "level-zero",
  311. },
  312. }.get(_detect_linux_pkg_manager(), {})
  313. for pkg in pkgs:
  314. ver = get_linux_pkg_version(run_lambda, pkg)
  315. if ver != "N/A":
  316. lst.append(f"* {pkg}:\t{ver}")
  317. if platform in ["win32", "cygwin"]:
  318. txt = run_and_read_all(
  319. run_lambda,
  320. 'powershell.exe "gwmi -Class Win32_PnpSignedDriver | where{$_.DeviceClass -eq \\"DISPLAY\\"\
  321. -and $_.Manufacturer -match \\"Intel\\"} | Select-Object -Property DeviceName,DriverVersion,DriverDate\
  322. | ConvertTo-Json"',
  323. )
  324. try:
  325. obj = json.loads(txt)
  326. if type(obj) is list:
  327. for o in obj:
  328. lst.append(
  329. f'* {o["DeviceName"]}: {o["DriverVersion"]} ({o["DriverDate"]})'
  330. )
  331. else:
  332. lst.append(f'* {obj["DriverVersion"]} ({obj["DriverDate"]})')
  333. except ValueError as e:
  334. lst.append(txt)
  335. lst.append(str(e))
  336. return "\n".join(lst)
  337. def get_intel_gpu_onboard(run_lambda):
  338. lst: list[str] = []
  339. platform = get_platform()
  340. if platform == "linux":
  341. txt = run_and_read_all(run_lambda, "xpu-smi discovery -j")
  342. if txt:
  343. try:
  344. obj = json.loads(txt)
  345. device_list = obj.get("device_list", [])
  346. if isinstance(device_list, list) and device_list:
  347. lst.extend(f'* {device["device_name"]}' for device in device_list)
  348. else:
  349. lst.append("N/A")
  350. except (ValueError, TypeError) as e:
  351. lst.append(txt)
  352. lst.append(str(e))
  353. else:
  354. lst.append("N/A")
  355. if platform in ["win32", "cygwin"]:
  356. txt = run_and_read_all(
  357. run_lambda,
  358. 'powershell.exe "gwmi -Class Win32_PnpSignedDriver | where{$_.DeviceClass -eq \\"DISPLAY\\"\
  359. -and $_.Manufacturer -match \\"Intel\\"} | Select-Object -Property DeviceName | ConvertTo-Json"',
  360. )
  361. if txt:
  362. try:
  363. obj = json.loads(txt)
  364. if isinstance(obj, list) and obj:
  365. lst.extend(f'* {device["DeviceName"]}' for device in obj)
  366. else:
  367. lst.append(f'* {obj.get("DeviceName", "N/A")}')
  368. except ValueError as e:
  369. lst.append(txt)
  370. lst.append(str(e))
  371. else:
  372. lst.append("N/A")
  373. return "\n".join(lst)
  374. def get_intel_gpu_detected(run_lambda):
  375. if not TORCH_AVAILABLE or not hasattr(torch, "xpu"):
  376. return "N/A"
  377. device_count = torch.xpu.device_count()
  378. if device_count == 0:
  379. return "N/A"
  380. devices = [
  381. f"* [{i}] {torch.xpu.get_device_properties(i)}" for i in range(device_count)
  382. ]
  383. return "\n".join(devices)
  384. # example outputs of CPU infos
  385. # * linux
  386. # Architecture: x86_64
  387. # CPU op-mode(s): 32-bit, 64-bit
  388. # Address sizes: 46 bits physical, 48 bits virtual
  389. # Byte Order: Little Endian
  390. # CPU(s): 128
  391. # On-line CPU(s) list: 0-127
  392. # Vendor ID: GenuineIntel
  393. # Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
  394. # CPU family: 6
  395. # Model: 106
  396. # Thread(s) per core: 2
  397. # Core(s) per socket: 32
  398. # Socket(s): 2
  399. # Stepping: 6
  400. # BogoMIPS: 5799.78
  401. # Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
  402. # sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
  403. # xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
  404. # pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
  405. # hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
  406. # fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap
  407. # avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1
  408. # xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq
  409. # avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
  410. # Virtualization features:
  411. # Hypervisor vendor: KVM
  412. # Virtualization type: full
  413. # Caches (sum of all):
  414. # L1d: 3 MiB (64 instances)
  415. # L1i: 2 MiB (64 instances)
  416. # L2: 80 MiB (64 instances)
  417. # L3: 108 MiB (2 instances)
  418. # NUMA:
  419. # NUMA node(s): 2
  420. # NUMA node0 CPU(s): 0-31,64-95
  421. # NUMA node1 CPU(s): 32-63,96-127
  422. # Vulnerabilities:
  423. # Itlb multihit: Not affected
  424. # L1tf: Not affected
  425. # Mds: Not affected
  426. # Meltdown: Not affected
  427. # Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
  428. # Retbleed: Not affected
  429. # Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
  430. # Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
  431. # Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
  432. # Srbds: Not affected
  433. # Tsx async abort: Not affected
  434. # * win32
  435. # Architecture=9
  436. # CurrentClockSpeed=2900
  437. # DeviceID=CPU0
  438. # Family=179
  439. # L2CacheSize=40960
  440. # L2CacheSpeed=
  441. # Manufacturer=GenuineIntel
  442. # MaxClockSpeed=2900
  443. # Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
  444. # ProcessorType=3
  445. # Revision=27142
  446. #
  447. # Architecture=9
  448. # CurrentClockSpeed=2900
  449. # DeviceID=CPU1
  450. # Family=179
  451. # L2CacheSize=40960
  452. # L2CacheSpeed=
  453. # Manufacturer=GenuineIntel
  454. # MaxClockSpeed=2900
  455. # Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
  456. # ProcessorType=3
  457. # Revision=27142
  458. def get_cpu_info(run_lambda):
  459. rc, out, err = 0, "", ""
  460. if get_platform() == "linux":
  461. rc, out, err = run_lambda("lscpu")
  462. elif get_platform() == "win32":
  463. rc, out, err = run_lambda(
  464. 'powershell.exe "gwmi -Class Win32_Processor | Select-Object -Property Name,Manufacturer,Family,\
  465. Architecture,ProcessorType,DeviceID,CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision\
  466. | ConvertTo-Json"'
  467. )
  468. if rc == 0:
  469. lst = []
  470. try:
  471. obj = json.loads(out)
  472. if type(obj) is list:
  473. for o in obj:
  474. lst.append("----------------------")
  475. lst.extend([f"{k}: {v}" for (k, v) in o.items()])
  476. else:
  477. lst.extend([f"{k}: {v}" for (k, v) in obj.items()])
  478. except ValueError as e:
  479. lst.append(out)
  480. lst.append(str(e))
  481. out = "\n".join(lst)
  482. elif get_platform() == "darwin":
  483. rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
  484. cpu_info = "None"
  485. if rc == 0:
  486. cpu_info = out
  487. else:
  488. cpu_info = err
  489. return cpu_info
  490. def get_platform():
  491. if sys.platform.startswith("linux"):
  492. return "linux"
  493. elif sys.platform.startswith("win32"):
  494. return "win32"
  495. elif sys.platform.startswith("cygwin"):
  496. return "cygwin"
  497. elif sys.platform.startswith("darwin"):
  498. return "darwin"
  499. else:
  500. return sys.platform
  501. def get_mac_version(run_lambda):
  502. return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
  503. def get_windows_version(run_lambda):
  504. ret = run_and_read_all(
  505. run_lambda,
  506. 'powershell.exe "gwmi -Class Win32_OperatingSystem | Select-Object -Property Caption,\
  507. OSArchitecture,Version | ConvertTo-Json"',
  508. )
  509. try:
  510. obj = json.loads(ret)
  511. ret = f'{obj["Caption"]} ({obj["Version"]} {obj["OSArchitecture"]})'
  512. except ValueError as e:
  513. ret += f"\n{str(e)}"
  514. return ret
  515. def get_lsb_version(run_lambda):
  516. return run_and_parse_first_match(
  517. run_lambda, "lsb_release -a", r"Description:\t(.*)"
  518. )
  519. def check_release_file(run_lambda):
  520. return run_and_parse_first_match(
  521. run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"'
  522. )
  523. def get_os(run_lambda):
  524. from platform import machine
  525. platform = get_platform()
  526. if platform in ["win32", "cygwin"]:
  527. return get_windows_version(run_lambda)
  528. if platform == "darwin":
  529. version = get_mac_version(run_lambda)
  530. if version is None:
  531. return None
  532. return "macOS {} ({})".format(version, machine())
  533. if platform == "linux":
  534. # Ubuntu/Debian based
  535. desc = get_lsb_version(run_lambda)
  536. if desc is not None:
  537. return "{} ({})".format(desc, machine())
  538. # Try reading /etc/*-release
  539. desc = check_release_file(run_lambda)
  540. if desc is not None:
  541. return "{} ({})".format(desc, machine())
  542. return "{} ({})".format(platform, machine())
  543. # Unknown platform
  544. return platform
  545. def get_python_platform():
  546. import platform
  547. return platform.platform()
  548. def get_libc_version():
  549. import platform
  550. if get_platform() != "linux":
  551. return "N/A"
  552. return "-".join(platform.libc_ver())
  553. def get_pip_packages(run_lambda, patterns=None):
  554. """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
  555. if patterns is None:
  556. patterns = PIP_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS + ONEAPI_PATTERNS
  557. pip_version = "pip3" if sys.version_info.major == 3 else "pip"
  558. os.environ["PIP_DISABLE_PIP_VERSION_CHECK"] = "1"
  559. # People generally have pip as `pip` or `pip3`
  560. # But here it is invoked as `python -mpip`
  561. out = run_and_read_all(
  562. run_lambda, [sys.executable, "-mpip", "list", "--format=freeze"]
  563. )
  564. if out is None:
  565. return pip_version, out
  566. filtered_out = "\n".join(
  567. line for line in out.splitlines() if any(name in line for name in patterns)
  568. )
  569. return pip_version, filtered_out
  570. def get_cachingallocator_config() -> _Dict[str, str]:
  571. """Return the caching allocator configuration from environment variables.
  572. """
  573. # pyrefly: ignore [bad-return]
  574. return {
  575. var: os.environ.get(var)
  576. for var in (
  577. "PYTORCH_CUDA_ALLOC_CONF",
  578. "PYTORCH_HIP_ALLOC_CONF",
  579. "PYTORCH_ALLOC_CONF",
  580. )
  581. if os.environ.get(var)
  582. }
  583. def get_cuda_module_loading_config():
  584. if TORCH_AVAILABLE and torch.cuda.is_available():
  585. torch.cuda.init()
  586. config = os.environ.get("CUDA_MODULE_LOADING", "")
  587. return config
  588. else:
  589. return "N/A"
  590. def is_xnnpack_available():
  591. if TORCH_AVAILABLE:
  592. import torch.backends.xnnpack
  593. return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined]
  594. else:
  595. return "N/A"
def get_env_info():
    """
    Collects environment information to aid in debugging.

    The returned environment information contains details on torch version, is debug build
    or not, cuda compiled version, gcc version, clang version, cmake version, operating
    system, libc version, python version, python platform, CUDA availability, CUDA
    runtime version, CUDA module loading config, GPU model and configuration, Nvidia
    driver version, cuDNN version, XPU availability, pip version and versions of relevant
    pip and conda packages, HIP runtime version, MIOpen runtime version,
    Caching allocator config, XNNPACK availability and CPU information.

    Returns:
        SystemEnv (namedtuple): A tuple containing various environment details
            and system information.
    """
    # All external commands are executed through the module-level `run` helper.
    run_lambda = run

    pip_version, pip_list_output = get_pip_packages(run_lambda)

    if TORCH_AVAILABLE:
        version_str = torch.__version__
        debug_mode_str = str(torch.version.debug)
        cuda_available_str = str(torch.cuda.is_available())
        cuda_version_str = torch.version.cuda
        # NOTE(review): torch.xpu is accessed without a hasattr check here, unlike
        # get_intel_gpu_detected -- confirm every supported torch exposes it.
        xpu_available_str = str(torch.xpu.is_available())
        if torch.xpu.is_available():
            # When an XPU is available, extend the one-word answer with build,
            # driver and device details.
            xpu_available_str = (
                f"{xpu_available_str}\n"
                + f"XPU used to build PyTorch: {torch.version.xpu}\n"
                + f"Intel GPU driver version:\n{get_intel_gpu_driver_version(run_lambda)}\n"
                + f"Intel GPU models onboard:\n{get_intel_gpu_onboard(run_lambda)}\n"
                + f"Intel GPU models detected:\n{get_intel_gpu_detected(run_lambda)}"
            )
        if (
            not hasattr(torch.version, "hip") or torch.version.hip is None
        ):  # cuda version
            hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
        else:  # HIP version

            def get_version_or_na(cfg, prefix):
                # Last whitespace-separated token of the first config line
                # that contains `prefix`, or "N/A" if no line matches.
                _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
                return _lst[0] if _lst else "N/A"

            cfg = torch._C._show_config().split("\n")
            hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
            miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
            # On a HIP build the CUDA compiled version is reported as "N/A".
            cuda_version_str = "N/A"
            hip_compiled_version = torch.version.hip
    else:
        version_str = debug_mode_str = cuda_available_str = cuda_version_str = xpu_available_str = "N/A"  # type: ignore[assignment]
        hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"

    # sys.version may span several lines; flatten it to one.
    sys_version = sys.version.replace("\n", " ")

    conda_packages = get_conda_packages(run_lambda)

    return SystemEnv(
        torch_version=version_str,
        is_debug_build=debug_mode_str,
        # bit_length() of sys.maxsize plus one is reported as the runtime's bit width.
        python_version="{} ({}-bit runtime)".format(
            sys_version, sys.maxsize.bit_length() + 1
        ),
        python_platform=get_python_platform(),
        is_cuda_available=cuda_available_str,
        cuda_compiled_version=cuda_version_str,
        cuda_runtime_version=get_running_cuda_version(run_lambda),
        cuda_module_loading=get_cuda_module_loading_config(),
        nvidia_gpu_models=get_gpu_info(run_lambda),
        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
        cudnn_version=get_cudnn_version(run_lambda),
        is_xpu_available=xpu_available_str,
        hip_compiled_version=hip_compiled_version,
        hip_runtime_version=hip_runtime_version,
        miopen_runtime_version=miopen_runtime_version,
        pip_version=pip_version,
        pip_packages=pip_list_output,
        conda_packages=conda_packages,
        os=get_os(run_lambda),
        libc_version=get_libc_version(),
        gcc_version=get_gcc_version(run_lambda),
        clang_version=get_clang_version(run_lambda),
        cmake_version=get_cmake_version(run_lambda),
        caching_allocator_config=get_cachingallocator_config(),
        is_xnnpack_available=is_xnnpack_available(),
        cpu_info=get_cpu_info(run_lambda),
    )
# Report template filled in by pretty_str(); every placeholder is the name of
# a SystemEnv field.
env_info_fmt = """
PyTorch version: {torch_version}
Is debug build: {is_debug_build}
CUDA used to build PyTorch: {cuda_compiled_version}
ROCM used to build PyTorch: {hip_compiled_version}
OS: {os}
GCC version: {gcc_version}
Clang version: {clang_version}
CMake version: {cmake_version}
Libc version: {libc_version}
Python version: {python_version}
Python platform: {python_platform}
Is CUDA available: {is_cuda_available}
CUDA runtime version: {cuda_runtime_version}
CUDA_MODULE_LOADING set to: {cuda_module_loading}
GPU models and configuration: {nvidia_gpu_models}
Nvidia driver version: {nvidia_driver_version}
cuDNN version: {cudnn_version}
Is XPU available: {is_xpu_available}
HIP runtime version: {hip_runtime_version}
MIOpen runtime version: {miopen_runtime_version}
Is XNNPACK available: {is_xnnpack_available}
Caching allocator config: {caching_allocator_config}
CPU:
{cpu_info}
Versions of relevant libraries:
{pip_packages}
{conda_packages}
""".strip()
  703. def pretty_str(envinfo):
  704. def replace_nones(dct, replacement="Could not collect"):
  705. for key in dct:
  706. if dct[key] is not None:
  707. continue
  708. dct[key] = replacement
  709. return dct
  710. def replace_bools(dct, true="Yes", false="No"):
  711. for key in dct:
  712. if dct[key] is True:
  713. dct[key] = true
  714. elif dct[key] is False:
  715. dct[key] = false
  716. return dct
  717. def prepend(text, tag="[prepend]"):
  718. lines = text.split("\n")
  719. updated_lines = [tag + line for line in lines]
  720. return "\n".join(updated_lines)
  721. def replace_if_empty(text, replacement="No relevant packages"):
  722. if text is not None and len(text) == 0:
  723. return replacement
  724. return text
  725. def maybe_start_on_next_line(string):
  726. # If `string` is multiline, prepend a \n to it.
  727. if string is not None and len(string.split("\n")) > 1:
  728. return "\n{}\n".format(string)
  729. return string
  730. mutable_dict = envinfo._asdict()
  731. # If nvidia_gpu_models is multiline, start on the next line
  732. mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(
  733. envinfo.nvidia_gpu_models
  734. )
  735. # If the machine doesn't have CUDA, report some fields as 'No CUDA'
  736. dynamic_cuda_fields = [
  737. "cuda_runtime_version",
  738. "nvidia_gpu_models",
  739. "nvidia_driver_version",
  740. ]
  741. all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
  742. all_dynamic_cuda_fields_missing = all(
  743. mutable_dict[field] is None for field in dynamic_cuda_fields
  744. )
  745. if (
  746. TORCH_AVAILABLE
  747. and not torch.cuda.is_available()
  748. and all_dynamic_cuda_fields_missing
  749. ):
  750. for field in all_cuda_fields:
  751. mutable_dict[field] = "No CUDA"
  752. if envinfo.cuda_compiled_version is None:
  753. mutable_dict["cuda_compiled_version"] = "None"
  754. # Replace True with Yes, False with No
  755. mutable_dict = replace_bools(mutable_dict)
  756. # Replace all None objects with 'Could not collect'
  757. mutable_dict = replace_nones(mutable_dict)
  758. # If either of these are '', replace with 'No relevant packages'
  759. mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
  760. mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])
  761. # Tag conda and pip packages with a prefix
  762. # If they were previously None, they'll show up as ie '[conda] Could not collect'
  763. if mutable_dict["pip_packages"]:
  764. mutable_dict["pip_packages"] = prepend(
  765. mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version)
  766. )
  767. if mutable_dict["conda_packages"]:
  768. mutable_dict["conda_packages"] = prepend(
  769. mutable_dict["conda_packages"], "[conda] "
  770. )
  771. mutable_dict["cpu_info"] = envinfo.cpu_info
  772. mutable_dict["caching_allocator_config"] = envinfo.caching_allocator_config
  773. if not envinfo.caching_allocator_config:
  774. mutable_dict["caching_allocator_config"] = "N/A"
  775. return env_info_fmt.format(**mutable_dict)
  776. def get_pretty_env_info():
  777. """
  778. Returns a pretty string of environment information.
  779. This function retrieves environment information by calling the `get_env_info` function
  780. and then formats the information into a human-readable string. The retrieved environment
  781. information is listed in the document of `get_env_info`.
  782. This function is used in `python collect_env.py` that should be executed when reporting a bug.
  783. Returns:
  784. str: A pretty string of the environment information.
  785. """
  786. return pretty_str(get_env_info())
  787. def main() -> None:
  788. print("Collecting environment information...")
  789. output = get_pretty_env_info()
  790. print(output)
  791. if (
  792. TORCH_AVAILABLE
  793. and hasattr(torch, "utils")
  794. and hasattr(torch.utils, "_crash_handler")
  795. ):
  796. minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
  797. if sys.platform == "linux" and os.path.exists(minidump_dir):
  798. dumps = [
  799. os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)
  800. ]
  801. latest = max(dumps, key=os.path.getctime)
  802. ctime = os.path.getctime(latest)
  803. creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
  804. "%Y-%m-%d %H:%M:%S"
  805. )
  806. msg = (
  807. "\n*** Detected a minidump at {} created on {}, ".format(
  808. latest, creation_time
  809. )
  810. + "if this is related to your bug please include it when you file a report ***"
  811. )
  812. print(msg, file=sys.stderr)
# Script entry point: only runs when the file is executed directly, not when
# it is imported.
if __name__ == "__main__":
    main()