# resource_isolation_config.py (13 KB)

import logging
from typing import Optional

import ray._common.utils
import ray._private.ray_constants as ray_constants
import ray._private.utils as utils
  6. logger = logging.getLogger(__name__)
  7. # See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights
  8. # for information about cpu weights
  9. _CGROUP_CPU_MAX_WEIGHT: int = 10000
  10. class ResourceIsolationConfig:
  11. """Configuration for enabling resource isolation by reserving memory and cpu for ray system processes through cgroupv2.
  12. Validates configuration for resource isolation by enforcing types, correct combinations of values, applying default values,
  13. and sanity checking cpu and memory reservations. Also, converts system_reserved_cpu into cpu.weights for cgroupv2.
  14. Attributes:
  15. enable_resource_isolation: True if cgroupv2 based isolation of ray
  16. system processes is enabled.
  17. cgroup_path: The path for the cgroup the raylet should use to enforce
  18. resource isolation.
  19. system_reserved_cpu: The amount of cores reserved for ray system
  20. processes. Must be >= ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES
  21. and < the total number of cores available.
  22. system_reserved_memory: The amount of memory in bytes reserved
  23. for ray system processes. Must be >= ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES
  24. and system_reserved_cpu + object_store_bytes < the total memory available.
  25. TODO(54703): Link documentation when it's available.
  26. """
  27. def __init__(
  28. self,
  29. enable_resource_isolation: bool = False,
  30. cgroup_path: Optional[str] = None,
  31. system_reserved_cpu: Optional[float] = None,
  32. system_reserved_memory: Optional[int] = None,
  33. object_store_memory: Optional[int] = None,
  34. ):
  35. """
  36. Raises:
  37. ValueError: On invalid inputs.
  38. Args:
  39. enable_resource_isolation: True if cgroupv2 based isolation of ray
  40. system processes is enabled.
  41. cgroup_path: The path for the cgroup the raylet should use to enforce
  42. resource isolation.
  43. system_reserved_cpu: The amount of cores reserved for ray system
  44. processes. Must be >= ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES
  45. and < the total number of cores available.
  46. system_reserved_memory: The amount of memory in bytes reserved
  47. for ray system processes. Must be >= ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES
  48. and system_reserved_memory + object_store_memory < the total memory available.
  49. object_store_memory: The amount of memory in bytes reserved for the object store.
  50. Must be not None when resource isolation is enabled.
  51. """
  52. self._resource_isolation_enabled = enable_resource_isolation
  53. self.cgroup_path = cgroup_path
  54. self.system_reserved_memory = system_reserved_memory
  55. self.system_pids = ""
  56. # cgroupv2 cpu.weight calculated from system_reserved_cpu assumes ray uses all available cores.
  57. self.system_reserved_cpu_weight: int = None
  58. if not enable_resource_isolation:
  59. if self.cgroup_path:
  60. raise ValueError(
  61. "cgroup_path cannot be set when resource isolation is not enabled. "
  62. "Set enable_resource_isolation to True if you're using ray.init or use the "
  63. "--enable-resource-isolation flag if you're using the ray cli."
  64. )
  65. if system_reserved_cpu:
  66. raise ValueError(
  67. "system_reserved_cpu cannot be set when resource isolation is not enabled. "
  68. "Set enable_resource_isolation to True if you're using ray.init or use the "
  69. "--enable-resource-isolation flag if you're using the ray cli."
  70. )
  71. if self.system_reserved_memory:
  72. raise ValueError(
  73. "system_reserved_memory cannot be set when resource isolation is not enabled. "
  74. "Set enable_resource_isolation to True if you're using ray.init or use the "
  75. "--enable-resource-isolation flag if you're using the ray cli."
  76. )
  77. return
  78. if object_store_memory is None:
  79. raise ValueError(
  80. "object_store_memory must be resolved before creating a ResourceIsolationConfig "
  81. "when resource isolation is enabled. This is likely a bug in Ray, "
  82. "please report it at https://github.com/ray-project/ray/issues/new/choose."
  83. )
  84. self.system_reserved_cpu_weight = self._validate_and_get_system_reserved_cpu(
  85. system_reserved_cpu
  86. )
  87. self.system_reserved_memory = self._validate_and_get_system_reserved_memory(
  88. system_reserved_memory, object_store_memory
  89. )
  90. self.cgroup_path = self._validate_and_get_cgroup_path(cgroup_path)
  91. def is_enabled(self) -> bool:
  92. return self._resource_isolation_enabled
  93. def add_system_pids(self, system_pids: str):
  94. """A comma-separated list of pids to move into the system cgroup."""
  95. self.system_pids = system_pids
  96. @staticmethod
  97. def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str:
  98. """Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not specified.
  99. Args:
  100. cgroup_path: The path for the cgroup the raylet should use to enforce
  101. resource isolation.
  102. Returns:
  103. str: The validated cgroup path.
  104. Raises:
  105. ValueError: If cgroup_path is not a string.
  106. """
  107. if not cgroup_path:
  108. cgroup_path = ray_constants.DEFAULT_CGROUP_PATH
  109. if not isinstance(cgroup_path, str):
  110. raise ValueError(
  111. f"Invalid value={cgroup_path} for cgroup_path. "
  112. "Use a string to represent the path for the cgroup that the raylet should use "
  113. "to enable resource isolation."
  114. )
  115. return cgroup_path
  116. @staticmethod
  117. def _validate_and_get_system_reserved_cpu(
  118. system_reserved_cpu: Optional[float],
  119. ) -> int:
  120. """If system_reserved_cpu is specified, validates it, otherwise returns the default value.
  121. Validation entails checking the type, ensuring that the value is in range, and converts it
  122. into cpu.weights for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights
  123. for more information.
  124. If system_reserved_cpu is not specified, returns a default value between
  125. [DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES, DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES].
  126. # TODO(54703): The errors from this method are user-facing and thus need
  127. to be linked the user-facing documentation once it's available.
  128. Args:
  129. system_reserved_cpu: The amount of cores reserved for ray system
  130. processes. Must be >= ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES
  131. and < the total number of cores available.
  132. Raises:
  133. ValueError: If system_reserved_cpu is specified, but invalid or if the system
  134. does not have enough available cpus.
  135. """
  136. available_system_cpus = utils.get_num_cpus(truncate=False)
  137. if available_system_cpus < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES:
  138. raise ValueError(
  139. f"The available number of cpu cores on this system {available_system_cpus} is less than "
  140. f"the minimum amount that is required for ray's system processes. "
  141. f"Pick a number of cpu cores greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}"
  142. )
  143. if not system_reserved_cpu:
  144. system_reserved_cpu = float(
  145. min(
  146. max(
  147. ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES,
  148. ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION
  149. * available_system_cpus,
  150. ),
  151. ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES,
  152. )
  153. )
  154. if not (
  155. isinstance(system_reserved_cpu, float)
  156. or isinstance(system_reserved_cpu, int)
  157. ):
  158. raise ValueError(
  159. f"Invalid value={system_reserved_cpu} for system_reserved_cpu. "
  160. "Use a float to represent the number of cores that need to be reserved for "
  161. "ray system processes to enable resource isolation."
  162. )
  163. system_reserved_cpu = float(system_reserved_cpu)
  164. if system_reserved_cpu < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES:
  165. raise ValueError(
  166. f"The requested system_reserved_cpu={system_reserved_cpu} is less than "
  167. f"the minimum number of cpus that can be used for resource isolation. "
  168. "Pick a number of cpu cores to reserve for ray system processes "
  169. f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}"
  170. )
  171. if system_reserved_cpu >= available_system_cpus:
  172. raise ValueError(
  173. f"The requested system_reserved_cpu={system_reserved_cpu} is greater than or equal to "
  174. f"the number of cpus available={available_system_cpus}. "
  175. "Pick a smaller number of cpu cores to reserve for ray system processes."
  176. )
  177. # Converting the number of cores the user defined into cpu.weights
  178. # This assumes that ray is allowed to use all available CPU
  179. # cores and distribute them between system, worker and
  180. # user processes
  181. return int(
  182. (system_reserved_cpu / float(available_system_cpus))
  183. * _CGROUP_CPU_MAX_WEIGHT
  184. )
  185. @staticmethod
  186. def _validate_and_get_system_reserved_memory(
  187. system_reserved_memory: Optional[int],
  188. object_store_memory: int,
  189. ) -> int:
  190. """If system_reserved_memory is not specified, returns the default value. Otherwise,
  191. checks the type, makes sure that the value is in range.
  192. Args:
  193. system_reserved_memory: The amount of memory in bytes reserved
  194. for ray system processes. Must be >= ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES
  195. and < the total memory available.
  196. object_store_memory: The amount of memory in bytes reserved for the object store.
  197. Returns:
  198. int: The validated system reserved memory in bytes.
  199. Raises:
  200. ValueError: If system_reserved_memory is specified, but invalid.
  201. """
  202. available_system_memory = ray._common.utils.get_system_memory()
  203. if (
  204. available_system_memory
  205. < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
  206. ):
  207. raise ValueError(
  208. f"The available memory on this system {available_system_memory} is less than "
  209. f"the minimum amount that is required for ray's system processes. "
  210. f"Pick a number of bytes greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}"
  211. )
  212. if not system_reserved_memory:
  213. system_reserved_memory = int(
  214. min(
  215. max(
  216. ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES,
  217. ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION
  218. * available_system_memory,
  219. ),
  220. ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES,
  221. )
  222. )
  223. if not isinstance(system_reserved_memory, int):
  224. raise ValueError(
  225. f"Invalid value {system_reserved_memory} for system_reserved_memory. "
  226. "Use an integer to represent the number bytes that need to be reserved for "
  227. "ray system processes to enable resource isolation."
  228. )
  229. if (
  230. system_reserved_memory
  231. < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
  232. ):
  233. raise ValueError(
  234. f"The requested system_reserved_memory {system_reserved_memory} is less than "
  235. f"the minimum number of bytes that can be used for resource isolation. "
  236. "Pick a number of bytes to reserve for ray system processes "
  237. f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}"
  238. )
  239. total_system_reserved_memory = system_reserved_memory + object_store_memory
  240. if total_system_reserved_memory > available_system_memory:
  241. raise ValueError(
  242. f"The total requested system_reserved_memory={total_system_reserved_memory} is greater than "
  243. f"the amount of memory available={available_system_memory}."
  244. )
  245. return total_system_reserved_memory