# npu.py -- Ascend NPU accelerator manager for Ray.
  1. import glob
  2. import logging
  3. import os
  4. from typing import List, Optional, Tuple
  5. from ray._private.accelerators.accelerator import AcceleratorManager
  6. from ray._private.ray_constants import env_bool
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name of the environment variable that holds the comma-separated NPU ids
# visible to the current process.
ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"
# Name of the opt-out environment variable: when env_bool() reports it as
# true, NPUAcceleratorManager.set_current_process_visible_accelerator_ids()
# returns without modifying ASCEND_RT_VISIBLE_DEVICES.
NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = (
    "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"
)
  12. class NPUAcceleratorManager(AcceleratorManager):
  13. """Ascend NPU accelerators."""
  14. @staticmethod
  15. def get_resource_name() -> str:
  16. return "NPU"
  17. @staticmethod
  18. def get_visible_accelerator_ids_env_var() -> str:
  19. return ASCEND_RT_VISIBLE_DEVICES_ENV_VAR
  20. @staticmethod
  21. def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
  22. ascend_visible_devices = os.environ.get(
  23. NPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
  24. )
  25. if ascend_visible_devices is None:
  26. return None
  27. if ascend_visible_devices == "":
  28. return []
  29. if ascend_visible_devices == "NoDevFiles":
  30. return []
  31. return list(ascend_visible_devices.split(","))
  32. @staticmethod
  33. def get_current_node_num_accelerators() -> int:
  34. """Attempt to detect the number of NPUs on this machine.
  35. NPU chips are represented as devices within `/dev/`, either as `/dev/davinci?`.
  36. Returns:
  37. The number of NPUs if any were detected, otherwise 0.
  38. """
  39. try:
  40. import acl
  41. device_count, ret = acl.rt.get_device_count()
  42. if ret == 0:
  43. return device_count
  44. except Exception as e:
  45. logger.debug("Could not import AscendCL: %s", e)
  46. try:
  47. npu_files = glob.glob("/dev/davinci[0-9]*")
  48. return len(npu_files)
  49. except Exception as e:
  50. logger.debug("Failed to detect number of NPUs: %s", e)
  51. return 0
  52. @staticmethod
  53. def get_current_node_accelerator_type() -> Optional[str]:
  54. """Get the type of the Ascend NPU on the current node.
  55. Returns:
  56. A string of the type, such as "Ascend910A", "Ascend910B", "Ascend310P1".
  57. """
  58. try:
  59. import acl
  60. return acl.get_soc_name()
  61. except Exception:
  62. logger.exception("Failed to detect NPU type.")
  63. return None
  64. @staticmethod
  65. def validate_resource_request_quantity(
  66. quantity: float,
  67. ) -> Tuple[bool, Optional[str]]:
  68. return (True, None)
  69. @staticmethod
  70. def set_current_process_visible_accelerator_ids(
  71. visible_npu_devices: List[str],
  72. ) -> None:
  73. if env_bool(NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, False):
  74. return
  75. os.environ[
  76. NPUAcceleratorManager.get_visible_accelerator_ids_env_var()
  77. ] = ",".join([str(i) for i in visible_npu_devices])