| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- import logging
- import os
- from functools import lru_cache
- from importlib.util import find_spec
- from typing import List, Optional, Tuple
- from ray._private.accelerators.accelerator import AcceleratorManager
- from ray._private.ray_constants import env_bool
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name of the environment variable that carries the comma-separated list of
# HPU ids visible to the current process. HPUAcceleratorManager reads it in
# get_current_process_visible_accelerator_ids() and writes it in
# set_current_process_visible_accelerator_ids().
# NOTE: the constant says "DEVICES" but the variable itself is
# HABANA_VISIBLE_MODULES.
HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES"

# Opt-out switch: when env_bool() reports this variable as true,
# set_current_process_visible_accelerator_ids() returns without touching
# HABANA_VISIBLE_MODULES.
NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"
@lru_cache()
def is_package_present(package_name: str) -> bool:
    """Report whether the import system can locate ``package_name``.

    The lookup goes through ``importlib.util.find_spec`` and the answer is
    memoised per package name by ``lru_cache``.

    Args:
        package_name: Absolute module or package name to look for.

    Returns:
        True if a module spec was found, False if no spec exists or the
        lookup raised ``ModuleNotFoundError``.
    """
    try:
        spec = find_spec(package_name)
    except ModuleNotFoundError:
        # For dotted names find_spec imports the parent package first; a
        # missing parent surfaces here instead of as a ``None`` result.
        return False
    return spec is not None
# Evaluated once at import time: True when the ``habana_frameworks`` package
# can be located. HPUAcceleratorManager checks this flag before importing
# ``habana_frameworks.torch.hpu`` inside its methods.
HPU_PACKAGE_AVAILABLE = is_package_present("habana_frameworks")
class HPUAcceleratorManager(AcceleratorManager):
    """Intel Habana(HPU) accelerators."""

    @staticmethod
    def get_resource_name() -> str:
        """Return the resource name used for HPU accelerators (``"HPU"``)."""
        return "HPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        """Return the name of the env var listing the visible HPU ids."""
        return HABANA_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Read the HPU ids visible to this process from the environment.

        Returns:
            None if the visibility env var is unset, an empty list if it is
            set to the empty string, otherwise the comma-separated entries
            as a list of strings.
        """
        visible = os.environ.get(
            HPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        )
        if visible is None:
            return None
        if visible == "":
            # "".split(",") would yield [""], i.e. one bogus id.
            return []
        return visible.split(",")

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of HPUs on this machine.

        Returns:
            The number of HPUs if any were detected, otherwise 0.
        """
        if not HPU_PACKAGE_AVAILABLE:
            return 0

        # Imported lazily: the package is optional and only present on
        # machines with the Habana software stack installed.
        import habana_frameworks.torch.hpu as torch_hpu

        if torch_hpu.is_available():
            return torch_hpu.device_count()

        # Use the module logger rather than ``logging.info``: the latter logs
        # on the root logger and implicitly calls ``basicConfig()`` when the
        # root has no handlers.
        logger.info("HPU devices not available")
        return 0

    @staticmethod
    def is_initialized() -> bool:
        """Attempt to check if HPU backend is initialized.

        Returns:
            True if backend initialized else False.
        """
        if not HPU_PACKAGE_AVAILABLE:
            return False

        import habana_frameworks.torch.hpu as torch_hpu

        return bool(torch_hpu.is_available() and torch_hpu.is_initialized())

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Attempt to detect the HPU family type.

        Returns:
            The device name reported by the HPU backend prefixed with
            ``"Intel-"`` if the backend is initialized, else None.
        """
        if not HPUAcceleratorManager.is_initialized():
            # Module logger instead of ``logging.info`` so the root logger
            # is never implicitly configured from library code.
            logger.info("HPU type cannot be detected")
            return None

        import habana_frameworks.torch.hpu as torch_hpu

        return f"Intel-{torch_hpu.get_device_name()}"

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Check that a requested HPU quantity is a whole number.

        Args:
            quantity: The requested amount of the HPU resource.

        Returns:
            ``(True, None)`` when the quantity is acceptable, otherwise
            ``(False, message)`` where ``message`` explains the rejection.
            Only ``float`` values with a fractional part are rejected.
        """
        if isinstance(quantity, float) and not quantity.is_integer():
            return (
                False,
                f"{HPUAcceleratorManager.get_resource_name()} resource quantity"
                " must be whole numbers. "
                f"The specified quantity {quantity} is invalid.",
            )
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_hpu_devices: List[str],
    ) -> None:
        """Export the given HPU ids through the visibility env var.

        Does nothing when the ``RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES``
        opt-out is enabled.

        Args:
            visible_hpu_devices: Ids to expose; each is converted with
                ``str`` and the results are joined with commas.
        """
        if env_bool(NOSET_HABANA_VISIBLE_MODULES_ENV_VAR, False):
            return

        os.environ[
            HPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join(map(str, visible_hpu_devices))
|