neuron.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. import json
  2. import logging
  3. import os
  4. import subprocess
  5. import sys
  6. from typing import List, Optional, Tuple
  7. from ray._private.accelerators.accelerator import AcceleratorManager
  8. from ray._private.ray_constants import env_bool
  9. logger = logging.getLogger(__name__)
  10. NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
  11. NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = (
  12. "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES"
  13. )
  14. # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
  15. # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
  16. # Subject to removal after the information is available via public API
  17. AWS_NEURON_INSTANCE_MAP = {
  18. "trn1.2xlarge": 2,
  19. "trn1.32xlarge": 32,
  20. "trn1n.32xlarge": 32,
  21. "inf2.xlarge": 2,
  22. "inf2.8xlarge": 2,
  23. "inf2.24xlarge": 12,
  24. "inf2.48xlarge": 24,
  25. }
  26. class NeuronAcceleratorManager(AcceleratorManager):
  27. """AWS Inferentia and Trainium accelerators."""
  28. @staticmethod
  29. def get_resource_name() -> str:
  30. return "neuron_cores"
  31. @staticmethod
  32. def get_visible_accelerator_ids_env_var() -> str:
  33. return NEURON_RT_VISIBLE_CORES_ENV_VAR
  34. @staticmethod
  35. def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
  36. neuron_visible_cores = os.environ.get(
  37. NeuronAcceleratorManager.get_visible_accelerator_ids_env_var(), None
  38. )
  39. if neuron_visible_cores is None:
  40. return None
  41. if neuron_visible_cores == "":
  42. return []
  43. return list(neuron_visible_cores.split(","))
  44. @staticmethod
  45. def get_current_node_num_accelerators() -> int:
  46. """
  47. Attempt to detect the number of Neuron cores on this machine.
  48. Returns:
  49. The number of Neuron cores if any were detected, otherwise 0.
  50. """
  51. nc_count: int = 0
  52. neuron_path = "/opt/aws/neuron/bin/"
  53. if sys.platform.startswith("linux") and os.path.isdir(neuron_path):
  54. result = subprocess.run(
  55. [os.path.join(neuron_path, "neuron-ls"), "--json-output"],
  56. stdout=subprocess.PIPE,
  57. stderr=subprocess.PIPE,
  58. )
  59. if result.returncode == 0 and result.stdout:
  60. neuron_devices = json.loads(result.stdout)
  61. for neuron_device in neuron_devices:
  62. nc_count += neuron_device.get("nc_count", 0)
  63. return nc_count
  64. @staticmethod
  65. def get_current_node_accelerator_type() -> Optional[str]:
  66. from ray.util.accelerators import AWS_NEURON_CORE
  67. return AWS_NEURON_CORE
  68. @staticmethod
  69. def validate_resource_request_quantity(
  70. quantity: float,
  71. ) -> Tuple[bool, Optional[str]]:
  72. if isinstance(quantity, float) and not quantity.is_integer():
  73. return (
  74. False,
  75. f"{NeuronAcceleratorManager.get_resource_name()} resource quantity"
  76. " must be whole numbers. "
  77. f"The specified quantity {quantity} is invalid.",
  78. )
  79. else:
  80. return (True, None)
  81. @staticmethod
  82. def set_current_process_visible_accelerator_ids(
  83. visible_neuron_core_ids: List[str],
  84. ) -> None:
  85. """Set the NEURON_RT_VISIBLE_CORES environment variable based on
  86. given visible_neuron_core_ids.
  87. Args:
  88. visible_neuron_core_ids (List[str]): List of int representing core IDs.
  89. """
  90. if env_bool(NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR, False):
  91. return
  92. os.environ[
  93. NeuronAcceleratorManager.get_visible_accelerator_ids_env_var()
  94. ] = ",".join([str(i) for i in visible_neuron_core_ids])
  95. @staticmethod
  96. def get_ec2_instance_num_accelerators(
  97. instance_type: str, instances: dict
  98. ) -> Optional[int]:
  99. # TODO: AWS SDK (public API) doesn't yet expose the NeuronCore
  100. # information. It will be available (work-in-progress)
  101. # as xxAcceleratorInfo in InstanceTypeInfo.
  102. # https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_InstanceTypeInfo.html
  103. # See https://github.com/ray-project/ray/issues/38473
  104. return AWS_NEURON_INSTANCE_MAP.get(instance_type.lower(), None)
  105. @staticmethod
  106. def get_ec2_instance_accelerator_type(
  107. instance_type: str, instances: dict
  108. ) -> Optional[str]:
  109. from ray.util.accelerators import AWS_NEURON_CORE
  110. return AWS_NEURON_CORE