hpu.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. import logging
  2. import os
  3. from functools import lru_cache
  4. from importlib.util import find_spec
  5. from typing import List, Optional, Tuple
  6. from ray._private.accelerators.accelerator import AcceleratorManager
  7. from ray._private.ray_constants import env_bool
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name of the environment variable that HPUAcceleratorManager reads and writes
# to get/set the comma-separated IDs visible to the current process.
HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES"
# Name of the opt-out environment variable: when it evaluates to true,
# set_current_process_visible_accelerator_ids() returns without touching
# the variable named above.
NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"
  11. @lru_cache()
  12. def is_package_present(package_name: str) -> bool:
  13. try:
  14. return find_spec(package_name) is not None
  15. except ModuleNotFoundError:
  16. return False
# Computed once at import time: True when the `habana_frameworks` package can
# be located. HPUAcceleratorManager checks this before importing
# `habana_frameworks.torch.hpu`.
HPU_PACKAGE_AVAILABLE = is_package_present("habana_frameworks")
  18. class HPUAcceleratorManager(AcceleratorManager):
  19. """Intel Habana(HPU) accelerators."""
  20. @staticmethod
  21. def get_resource_name() -> str:
  22. return "HPU"
  23. @staticmethod
  24. def get_visible_accelerator_ids_env_var() -> str:
  25. return HABANA_VISIBLE_DEVICES_ENV_VAR
  26. @staticmethod
  27. def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
  28. hpu_visible_devices = os.environ.get(
  29. HPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
  30. )
  31. if hpu_visible_devices is None:
  32. return None
  33. if hpu_visible_devices == "":
  34. return []
  35. return list(hpu_visible_devices.split(","))
  36. @staticmethod
  37. def get_current_node_num_accelerators() -> int:
  38. """Attempt to detect the number of HPUs on this machine.
  39. Returns:
  40. The number of HPUs if any were detected, otherwise 0.
  41. """
  42. if HPU_PACKAGE_AVAILABLE:
  43. import habana_frameworks.torch.hpu as torch_hpu
  44. if torch_hpu.is_available():
  45. return torch_hpu.device_count()
  46. else:
  47. logging.info("HPU devices not available")
  48. return 0
  49. else:
  50. return 0
  51. @staticmethod
  52. def is_initialized() -> bool:
  53. """Attempt to check if HPU backend is initialized.
  54. Returns:
  55. True if backend initialized else False.
  56. """
  57. if HPU_PACKAGE_AVAILABLE:
  58. import habana_frameworks.torch.hpu as torch_hpu
  59. if torch_hpu.is_available() and torch_hpu.is_initialized():
  60. return True
  61. else:
  62. return False
  63. else:
  64. return False
  65. @staticmethod
  66. def get_current_node_accelerator_type() -> Optional[str]:
  67. """Attempt to detect the HPU family type.
  68. Returns:
  69. The device name (GAUDI, GAUDI2) if detected else None.
  70. """
  71. if HPUAcceleratorManager.is_initialized():
  72. import habana_frameworks.torch.hpu as torch_hpu
  73. return f"Intel-{torch_hpu.get_device_name()}"
  74. else:
  75. logging.info("HPU type cannot be detected")
  76. return None
  77. @staticmethod
  78. def validate_resource_request_quantity(
  79. quantity: float,
  80. ) -> Tuple[bool, Optional[str]]:
  81. if isinstance(quantity, float) and not quantity.is_integer():
  82. return (
  83. False,
  84. f"{HPUAcceleratorManager.get_resource_name()} resource quantity"
  85. " must be whole numbers. "
  86. f"The specified quantity {quantity} is invalid.",
  87. )
  88. else:
  89. return (True, None)
  90. @staticmethod
  91. def set_current_process_visible_accelerator_ids(
  92. visible_hpu_devices: List[str],
  93. ) -> None:
  94. if env_bool(NOSET_HABANA_VISIBLE_MODULES_ENV_VAR, False):
  95. return
  96. os.environ[
  97. HPUAcceleratorManager.get_visible_accelerator_ids_env_var()
  98. ] = ",".join([str(i) for i in visible_hpu_devices])