parameter.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. import logging
  2. import os
  3. import pathlib
  4. from typing import Dict, List, Optional
  5. import ray._private.ray_constants as ray_constants
  6. from ray._private.resource_isolation_config import ResourceIsolationConfig
  7. from ray._private.utils import get_ray_client_dependency_error
  8. logger = logging.getLogger(__name__)
  9. class RayParams:
  10. """A class used to store the parameters used by Ray.
  11. Attributes:
  12. redis_address: The address of the Redis server to connect to. If
  13. this address is not provided, then this command will start Redis, a
  14. raylet, a plasma store, a plasma manager, and some workers.
  15. It will also kill these processes when Python exits.
  16. redis_port: The port that the primary Redis shard should listen
  17. to. If None, then it will fall back to
  18. ray._private.ray_constants.DEFAULT_PORT, or a random port if the default is
  19. not available.
  20. redis_shard_ports: A list of the ports to use for the non-primary Redis
  21. shards. If None, then it will fall back to the ports right after
  22. redis_port, or random ports if those are not available.
  23. num_cpus: Number of CPUs to configure the raylet with.
  24. num_gpus: Number of GPUs to configure the raylet with.
  25. resources: A dictionary mapping the name of a resource to the quantity
  26. of that resource available.
  27. labels: The key-value labels of the node.
  28. memory: Total available memory for workers requesting memory.
  29. available_memory_bytes: The memory available for use on this node in bytes.
  30. object_store_memory: The amount of memory (in bytes) to start the
  31. object store with.
  32. object_manager_port: The port to use for the object manager.
  33. node_manager_port: The port to use for the node manager.
  34. gcs_server_port: The port to use for the GCS server.
  35. node_ip_address: The IP address of the node that we are on.
  36. min_worker_port: The lowest port number that workers will bind
  37. on. If not set or set to 0, random ports will be chosen.
  38. max_worker_port: The highest port number that workers will bind
  39. on. If set, min_worker_port must also be set.
  40. worker_port_list: An explicit list of ports to be used for
  41. workers (comma-separated). Overrides min_worker_port and
  42. max_worker_port.
  43. ray_client_server_port: The port number the ray client server
  44. will bind on. If not set, the ray client server will not
  45. be started.
  46. redirect_output: True if stdout and stderr for non-worker
  47. processes should be redirected to files and false otherwise.
  48. log_to_stderr: If set, controls whether non-worker stdout/stderr should be
  49. written to stderr (True) or redirected to log files (False). This is the
  50. preferred replacement for the deprecated `redirect_output` field.
  51. external_addresses: The address of external Redis server to
  52. connect to, in format of "ip1:port1,ip2:port2,...". If this
  53. address is provided, then ray won't start Redis instances in the
  54. head node but use external Redis server(s) instead.
  55. num_redis_shards: The number of Redis shards to start in addition to
  56. the primary Redis shard.
  57. redis_max_clients: If provided, attempt to configure Redis with this
  58. maxclients number.
  59. redis_username: Prevents external clients without the username
  60. from connecting to Redis if provided.
  61. redis_password: Prevents external clients without the password
  62. from connecting to Redis if provided.
  63. plasma_directory: A directory where the Plasma memory mapped files will
  64. be created.
  65. object_spilling_directory: The path to spill objects to. The same path will
  66. be used as the object store fallback directory as well.
  67. worker_path: The path of the source code that will be run by the
  68. worker.
  69. setup_worker_path: The path of the Python file that will set up
  70. the environment for the worker process.
  71. huge_pages: Boolean flag indicating whether to start the Object
  72. Store with hugetlbfs support. Requires plasma_directory.
  73. include_dashboard: Boolean flag indicating whether to start the web
  74. UI, which displays the status of the Ray cluster. If this value is
  75. None, then the UI will be started if the relevant dependencies are
  76. present.
  77. dashboard_host: The host to bind the web UI server to. Can either be
  78. localhost (127.0.0.1) or 0.0.0.0 (available from all interfaces).
  79. By default, this is set to localhost to prevent access from
  80. external machines.
  81. dashboard_port: The port to bind the dashboard server to.
  82. Defaults to 8265.
  83. dashboard_agent_listen_port: The port for dashboard agents to listen on
  84. for HTTP requests.
  85. Defaults to 52365.
  86. runtime_env_agent_port: The port at which the runtime env agent
  87. listens to for HTTP.
  88. Defaults to random available port.
  89. plasma_store_socket_name: If provided, it specifies the socket
  90. name used by the plasma store.
  91. raylet_socket_name: If provided, it specifies the socket path
  92. used by the raylet process.
  93. temp_dir: If provided, it will specify the root temporary
  94. directory for the Ray process. Must be an absolute path.
  95. runtime_env_dir_name: If provided, specifies the directory that
  96. will be created in the session dir to hold runtime_env files.
  97. include_log_monitor: If True, then start a log monitor to
  98. monitor the log files for all processes on this node and push their
  99. contents to Redis.
  100. autoscaling_config: path to autoscaling config file.
  101. metrics_agent_port: The port to bind metrics agent.
  102. metrics_export_port: The port at which metrics are exposed
  103. through a Prometheus endpoint.
  104. no_monitor: If True, the ray autoscaler monitor for this cluster
  105. will not be started.
  106. _system_config: Configuration for overriding RayConfig
  107. defaults. Used to set system configuration and for experimental Ray
  108. core feature flags.
  109. enable_object_reconstruction: Enable plasma reconstruction on
  110. failure.
  111. ray_debugger_external: If true, make the Ray debugger for a
  112. worker available externally to the node it is running on. This will
  113. bind on 0.0.0.0 instead of localhost.
  114. env_vars: Override environment variables for the raylet.
  115. session_name: The current Ray session name.
  116. webui: The url of the UI.
  117. cluster_id: The cluster ID in hex string.
  118. resource_isolation_config: settings for cgroupv2 based isolation of ray
  119. system processes (defaults to no isolation if config not provided)
  120. """
  def __init__(
    self,
    redis_address: Optional[str] = None,
    gcs_address: Optional[str] = None,
    num_cpus: Optional[int] = None,
    num_gpus: Optional[int] = None,
    resources: Optional[Dict[str, float]] = None,
    labels: Optional[Dict[str, str]] = None,
    memory: Optional[float] = None,
    available_memory_bytes: Optional[int] = None,
    object_store_memory: Optional[float] = None,
    redis_port: Optional[int] = None,
    redis_shard_ports: Optional[List[int]] = None,
    object_manager_port: Optional[int] = None,
    node_manager_port: int = 0,
    gcs_server_port: Optional[int] = None,
    node_ip_address: Optional[str] = None,
    node_name: Optional[str] = None,
    min_worker_port: Optional[int] = None,
    max_worker_port: Optional[int] = None,
    worker_port_list: Optional[str] = None,
    ray_client_server_port: Optional[int] = None,
    driver_mode=None,
    redirect_output: Optional[bool] = None,
    log_to_stderr: Optional[bool] = None,
    external_addresses: Optional[List[str]] = None,
    num_redis_shards: Optional[int] = None,
    redis_max_clients: Optional[int] = None,
    redis_username: Optional[str] = ray_constants.REDIS_DEFAULT_USERNAME,
    redis_password: Optional[str] = ray_constants.REDIS_DEFAULT_PASSWORD,
    plasma_directory: Optional[str] = None,
    object_spilling_directory: Optional[str] = None,
    worker_path: Optional[str] = None,
    setup_worker_path: Optional[str] = None,
    huge_pages: Optional[bool] = False,
    include_dashboard: Optional[bool] = None,
    dashboard_host: Optional[str] = ray_constants.DEFAULT_DASHBOARD_IP,
    dashboard_port: Optional[int] = ray_constants.DEFAULT_DASHBOARD_PORT,
    dashboard_agent_listen_port: Optional[
      int
    ] = ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
    runtime_env_agent_port: Optional[int] = None,
    plasma_store_socket_name: Optional[str] = None,
    raylet_socket_name: Optional[str] = None,
    temp_dir: Optional[str] = None,
    runtime_env_dir_name: Optional[str] = None,
    include_log_monitor: Optional[bool] = None,
    autoscaling_config: Optional[str] = None,
    ray_debugger_external: bool = False,
    _system_config: Optional[Dict[str, str]] = None,
    enable_object_reconstruction: Optional[bool] = False,
    metrics_agent_port: Optional[int] = None,
    metrics_export_port: Optional[int] = None,
    tracing_startup_hook=None,
    no_monitor: Optional[bool] = False,
    env_vars: Optional[Dict[str, str]] = None,
    session_name: Optional[str] = None,
    webui: Optional[str] = None,
    cluster_id: Optional[str] = None,
    node_id: Optional[str] = None,
    resource_isolation_config: Optional[ResourceIsolationConfig] = None,
  ):
    """Store the given parameters as attributes and validate them.

    Each argument is saved on the instance under the same name, with
    three exceptions: `runtime_env_dir_name` falls back to
    `ray_constants.DEFAULT_RUNTIME_ENV_DIR_NAME` when falsy,
    `_system_config` falls back to an empty dict when falsy, and
    `enable_object_reconstruction` is stored as
    `_enable_object_reconstruction`.

    Raises:
      ValueError, AssertionError, DeprecationWarning: Propagated from
        `_check_usage` when the supplied values are inconsistent.
    """
    self.redis_address = redis_address
    self.gcs_address = gcs_address
    self.num_cpus = num_cpus
    self.num_gpus = num_gpus
    self.memory = memory
    self.available_memory_bytes = available_memory_bytes
    self.object_store_memory = object_store_memory
    self.resources = resources
    self.redis_port = redis_port
    self.redis_shard_ports = redis_shard_ports
    self.object_manager_port = object_manager_port
    self.node_manager_port = node_manager_port
    self.gcs_server_port = gcs_server_port
    self.node_ip_address = node_ip_address
    self.node_name = node_name
    self.min_worker_port = min_worker_port
    self.max_worker_port = max_worker_port
    self.worker_port_list = worker_port_list
    self.ray_client_server_port = ray_client_server_port
    self.driver_mode = driver_mode
    self.redirect_output = redirect_output
    self.log_to_stderr = log_to_stderr
    self.external_addresses = external_addresses
    self.num_redis_shards = num_redis_shards
    self.redis_max_clients = redis_max_clients
    self.redis_username = redis_username
    self.redis_password = redis_password
    self.plasma_directory = plasma_directory
    self.object_spilling_directory = object_spilling_directory
    self.worker_path = worker_path
    self.setup_worker_path = setup_worker_path
    self.huge_pages = huge_pages
    self.include_dashboard = include_dashboard
    self.dashboard_host = dashboard_host
    self.dashboard_port = dashboard_port
    self.dashboard_agent_listen_port = dashboard_agent_listen_port
    self.runtime_env_agent_port = runtime_env_agent_port
    self.plasma_store_socket_name = plasma_store_socket_name
    self.raylet_socket_name = raylet_socket_name
    self.temp_dir = temp_dir
    self.runtime_env_dir_name = (
      runtime_env_dir_name or ray_constants.DEFAULT_RUNTIME_ENV_DIR_NAME
    )
    self.include_log_monitor = include_log_monitor
    self.autoscaling_config = autoscaling_config
    self.metrics_agent_port = metrics_agent_port
    self.metrics_export_port = metrics_export_port
    self.tracing_startup_hook = tracing_startup_hook
    self.no_monitor = no_monitor
    self.ray_debugger_external = ray_debugger_external
    self.env_vars = env_vars
    self.session_name = session_name
    self.webui = webui
    # NOTE: a non-empty dict passed by the caller is stored by reference,
    # so the flag written below also lands in the caller's dict.
    self._system_config = _system_config or {}
    self._enable_object_reconstruction = enable_object_reconstruction
    self.labels = labels
    # Validate once every attribute that _check_usage reads has been set.
    self._check_usage()
    self.cluster_id = cluster_id
    self.node_id = node_id
    self.resource_isolation_config = resource_isolation_config
    if not self.resource_isolation_config:
      # No config supplied: build one with resource isolation disabled.
      self.resource_isolation_config = ResourceIsolationConfig(
        object_store_memory=object_store_memory, enable_resource_isolation=False
      )

    # Set the internal config option for object reconstruction: it is
    # requested by switching on the `lineage_pinning_enabled` flag.
    if enable_object_reconstruction:
      self._system_config["lineage_pinning_enabled"] = True
  254. def update(self, **kwargs):
  255. """Update the settings according to the keyword arguments.
  256. Args:
  257. kwargs: The keyword arguments to set corresponding fields.
  258. """
  259. for arg in kwargs:
  260. if hasattr(self, arg):
  261. setattr(self, arg, kwargs[arg])
  262. else:
  263. raise ValueError(f"Invalid RayParams parameter in update: {arg}")
  264. self._check_usage()
  265. def update_if_absent(self, **kwargs):
  266. """Update the settings when the target fields are None.
  267. Args:
  268. kwargs: The keyword arguments to set corresponding fields.
  269. """
  270. for arg in kwargs:
  271. if hasattr(self, arg):
  272. if getattr(self, arg) is None:
  273. setattr(self, arg, kwargs[arg])
  274. else:
  275. raise ValueError(
  276. f"Invalid RayParams parameter in update_if_absent: {arg}"
  277. )
  278. self._check_usage()
  279. def update_pre_selected_port(self):
  280. """Update the pre-selected port information
  281. Returns:
  282. The dictionary mapping of component -> ports.
  283. """
  284. def wrap_port(port):
  285. # 0 port means select a random port for the grpc server.
  286. if port is None or port == 0:
  287. return []
  288. else:
  289. return [port]
  290. # Create a dictionary of the component -> port mapping.
  291. pre_selected_ports = {
  292. "gcs": wrap_port(self.redis_port),
  293. "object_manager": wrap_port(self.object_manager_port),
  294. "node_manager": wrap_port(self.node_manager_port),
  295. "gcs_server": wrap_port(self.gcs_server_port),
  296. "client_server": wrap_port(self.ray_client_server_port),
  297. "dashboard": wrap_port(self.dashboard_port),
  298. "dashboard_agent_grpc": wrap_port(self.metrics_agent_port),
  299. "dashboard_agent_http": wrap_port(self.dashboard_agent_listen_port),
  300. "runtime_env_agent": wrap_port(self.runtime_env_agent_port),
  301. "metrics_export": wrap_port(self.metrics_export_port),
  302. }
  303. redis_shard_ports = self.redis_shard_ports
  304. if redis_shard_ports is None:
  305. redis_shard_ports = []
  306. pre_selected_ports["redis_shards"] = redis_shard_ports
  307. if self.worker_port_list is None:
  308. if self.min_worker_port is not None and self.max_worker_port is not None:
  309. pre_selected_ports["worker_ports"] = list(
  310. range(self.min_worker_port, self.max_worker_port + 1)
  311. )
  312. else:
  313. # The dict is not updated when it requires random ports.
  314. pre_selected_ports["worker_ports"] = []
  315. else:
  316. pre_selected_ports["worker_ports"] = [
  317. int(port) for port in self.worker_port_list.split(",")
  318. ]
  319. # Update the pre selected port set.
  320. self.reserved_ports = set()
  321. for comp, port_list in pre_selected_ports.items():
  322. for port in port_list:
  323. if port in self.reserved_ports:
  324. raise ValueError(
  325. f"Ray component {comp} is trying to use "
  326. f"a port number {port} that is used by other components.\n"
  327. f"Port information: {self._format_ports(pre_selected_ports)}\n"
  328. "If you allocate ports, please make sure the same port "
  329. "is not used by multiple components."
  330. )
  331. self.reserved_ports.add(port)
  332. def _check_usage(self):
  333. if self.worker_port_list is not None:
  334. for port_str in self.worker_port_list.split(","):
  335. try:
  336. port = int(port_str)
  337. except ValueError as e:
  338. raise ValueError(
  339. "worker_port_list must be a comma-separated "
  340. f"list of integers: {e}"
  341. ) from None
  342. if port < 1024 or port > 65535:
  343. raise ValueError(
  344. "Ports in worker_port_list must be "
  345. f"between 1024 and 65535. Got: {port}"
  346. )
  347. # Used primarily for testing.
  348. if os.environ.get("RAY_USE_RANDOM_PORTS", False):
  349. if self.min_worker_port is None and self.max_worker_port is None:
  350. self.min_worker_port = 0
  351. self.max_worker_port = 0
  352. if self.min_worker_port is not None:
  353. if self.min_worker_port != 0 and (
  354. self.min_worker_port < 1024 or self.min_worker_port > 65535
  355. ):
  356. raise ValueError(
  357. "min_worker_port must be 0 or an integer between 1024 and 65535."
  358. )
  359. if self.max_worker_port is not None:
  360. if self.min_worker_port is None:
  361. raise ValueError(
  362. "If max_worker_port is set, min_worker_port must also be set."
  363. )
  364. elif self.max_worker_port != 0:
  365. if self.max_worker_port < 1024 or self.max_worker_port > 65535:
  366. raise ValueError(
  367. "max_worker_port must be 0 or an integer between "
  368. "1024 and 65535."
  369. )
  370. elif self.max_worker_port <= self.min_worker_port:
  371. raise ValueError(
  372. "max_worker_port must be higher than min_worker_port."
  373. )
  374. if self.ray_client_server_port is not None:
  375. if get_ray_client_dependency_error() is not None:
  376. raise ValueError(
  377. "Ray Client requires pip package `ray[client]`. "
  378. "If you installed the minimal Ray (e.g. `pip install ray`), "
  379. "please reinstall by executing `pip install ray[client]`."
  380. )
  381. if (
  382. self.ray_client_server_port < 1024
  383. or self.ray_client_server_port > 65535
  384. ):
  385. raise ValueError(
  386. "ray_client_server_port must be an integer "
  387. "between 1024 and 65535."
  388. )
  389. if self.runtime_env_agent_port is not None:
  390. if self.runtime_env_agent_port != 0 and (
  391. self.runtime_env_agent_port < 1024
  392. or self.runtime_env_agent_port > 65535
  393. ):
  394. raise ValueError(
  395. "runtime_env_agent_port must be 0 (auto-assign) or an integer "
  396. "between 1024 and 65535."
  397. )
  398. if self.resources is not None:
  399. def build_error(resource, alternative):
  400. return (
  401. f"{self.resources} -> `{resource}` cannot be a "
  402. "custom resource because it is one of the default resources "
  403. f"({ray_constants.DEFAULT_RESOURCES}). "
  404. f"Use `{alternative}` instead. For example, use `ray start "
  405. f"--{alternative.replace('_', '-')}=1` instead of "
  406. f"`ray start --resources={{'{resource}': 1}}`"
  407. )
  408. assert "CPU" not in self.resources, build_error("CPU", "num_cpus")
  409. assert "GPU" not in self.resources, build_error("GPU", "num_gpus")
  410. assert "memory" not in self.resources, build_error("memory", "memory")
  411. assert "object_store_memory" not in self.resources, build_error(
  412. "object_store_memory", "object_store_memory"
  413. )
  414. if self.redirect_output is not None:
  415. raise DeprecationWarning("The redirect_output argument is deprecated.")
  416. if self.temp_dir is not None and not os.path.isabs(self.temp_dir):
  417. raise ValueError("temp_dir must be absolute path or None.")
  418. if self.temp_dir is not None and os.getenv("VIRTUAL_ENV"):
  419. is_relative = True
  420. try:
  421. (
  422. pathlib.Path(self.temp_dir)
  423. .resolve()
  424. .relative_to(pathlib.Path(os.getenv("VIRTUAL_ENV")).resolve())
  425. )
  426. except ValueError:
  427. is_relative = False
  428. if is_relative:
  429. raise ValueError(
  430. "temp_dir must not be child directory of virtualenv root"
  431. )
  432. def _format_ports(self, pre_selected_ports):
  433. """Format the pre-selected ports information to be more human-readable."""
  434. ports = pre_selected_ports.copy()
  435. for comp, port_list in ports.items():
  436. if len(port_list) == 1:
  437. ports[comp] = port_list[0]
  438. elif len(port_list) == 0:
  439. # Nothing is selected, meaning it will be randomly selected.
  440. ports[comp] = "random"
  441. elif comp == "worker_ports":
  442. min_port = port_list[0]
  443. max_port = port_list[len(port_list) - 1]
  444. if len(port_list) < 50:
  445. port_range_str = str(port_list)
  446. else:
  447. port_range_str = f"from {min_port} to {max_port}"
  448. ports[comp] = f"{len(port_list)} ports {port_range_str}"
  449. return ports