constants.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. import os
  2. import sys
  3. from ray._private.ray_constants import ( # noqa F401
  4. AGENT_PROCESS_TYPE_DASHBOARD_AGENT,
  5. AGENT_PROCESS_TYPE_RUNTIME_ENV_AGENT,
  6. AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
  7. DEFAULT_OBJECT_STORE_MEMORY_PROPORTION,
  8. LABELS_ENVIRONMENT_VARIABLE,
  9. LOGGER_FORMAT,
  10. RESOURCES_ENVIRONMENT_VARIABLE,
  11. )
  12. def env_integer(key, default):
  13. if key in os.environ:
  14. val = os.environ[key]
  15. if val == "inf":
  16. return sys.maxsize
  17. else:
  18. return int(val)
  19. return default
  20. # Whether autoscaler cluster status logging is enabled. Set to 0 disable.
  21. AUTOSCALER_STATUS_LOG = env_integer("RAY_ENABLE_CLUSTER_STATUS_LOG", 1)
  22. # The name of the environment variable for plugging in a utilization scorer.
  23. AUTOSCALER_UTILIZATION_SCORER_KEY = "RAY_AUTOSCALER_UTILIZATION_SCORER"
  24. # Whether to avoid launching GPU nodes for CPU only tasks.
  25. AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1)
  26. # How long to wait for a node to start and terminate, in seconds.
  27. AUTOSCALER_NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)
  28. AUTOSCALER_NODE_TERMINATE_WAIT_S = env_integer("AUTOSCALER_NODE_TERMINATE_WAIT_S", 900)
  29. # Interval at which to check if node SSH became available.
  30. AUTOSCALER_NODE_SSH_INTERVAL_S = env_integer("AUTOSCALER_NODE_SSH_INTERVAL_S", 5)
  31. # Abort autoscaling if more than this number of errors are encountered. This
  32. # is a safety feature to prevent e.g. runaway node launches.
  33. AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
  34. # The maximum number of nodes to launch in a single request.
  35. # Multiple requests may be made for this batch size, up to
  36. # the limit of AUTOSCALER_MAX_CONCURRENT_LAUNCHES.
  37. AUTOSCALER_MAX_LAUNCH_BATCH = env_integer("AUTOSCALER_MAX_LAUNCH_BATCH", 5)
  38. # Max number of nodes to launch at a time.
  39. AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
  40. "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10
  41. )
# Default upscaling speed for the autoscaler. This specifies how many nodes
# to request at a time, where the desired number to upscale is
#   max(1, upscaling_speed * current_num_nodes)
# e.g. 1.0 means to request enough nodes to double
# the cluster size in each round of requests.
# When the upscaling speed is 0.0, the autoscaler will request 1 node.
  48. DEFAULT_UPSCALING_SPEED = 0.0
  49. # Interval at which to perform autoscaling updates.
  50. AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)
  51. # The autoscaler will attempt to restart Ray on nodes it hasn't heard from
  52. # in more than this interval.
  53. AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S", 30)
  54. # The maximum number of nodes (including failed nodes) that the autoscaler will
  55. # track for logging purposes.
  56. AUTOSCALER_MAX_NODES_TRACKED = 1500
  57. AUTOSCALER_MAX_FAILURES_DISPLAYED = 20
  58. AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S = env_integer(
  59. "AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S", 30 * 60
  60. )
  61. AUTOSCALER_REPORT_PER_NODE_STATUS = (
  62. env_integer("AUTOSCALER_REPORT_PER_NODE_STATUS", 1) == 1
  63. )
  64. # The maximum allowed resource demand vector size to guarantee the resource
  65. # demand scheduler bin packing algorithm takes a reasonable amount of time
  66. # to run.
  67. AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE = env_integer(
  68. "AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE", 1000
  69. )
  70. # Port that autoscaler prometheus metrics will be exported to
  71. AUTOSCALER_METRIC_PORT = env_integer("AUTOSCALER_METRIC_PORT", 44217)
  72. # The minimum number of nodes to launch concurrently.
  73. AUTOSCALER_UPSCALING_INITIAL_NUM_NODES = 5
  74. # Max number of retries to AWS (default is 5, time increases exponentially)
  75. BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
  76. # Max number of retries to create an EC2 node (retry different subnet)
  77. BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)
  78. # ray home path in the container image
  79. RAY_HOME = "/home/ray"
  80. # The order of this list matters! `scripts.py` kills the ray processes in order of this
  81. # list. Think twice when you add to this list.
  82. # Invariants:
  83. # RAYLET must be the first in the list.
  84. # GCS SERVER must be the last in the list.
  85. RAY_PROCESSES = [
  86. # The first element is the substring to filter.
  87. # The second element, if True, is to filter ps results by command name
  88. # (only the first 15 charactors of the executable name on Linux);
  89. # if False, is to filter ps results by command with all its arguments.
  90. # See STANDARD FORMAT SPECIFIERS section of
  91. # http://man7.org/linux/man-pages/man1/ps.1.html
  92. # about comm and args. This can help avoid killing non-ray processes.
  93. # Format:
  94. # Keyword to filter, filter by command (True)/filter by args (False)
  95. ["raylet", True],
  96. ["plasma_store", True],
  97. ["monitor.py", False],
  98. ["ray.util.client.server", False],
  99. ["default_worker.py", False], # Python worker.
  100. ["setup_worker.py", False], # Python environment setup worker.
  101. # For mac osx, setproctitle doesn't change the process name returned
  102. # by psutil but only cmdline.
  103. [
  104. "ray::",
  105. sys.platform != "darwin",
  106. ], # Python worker. TODO(mehrdadn): Fix for Windows
  107. ["io.ray.runtime.runner.worker.DefaultWorker", False], # Java worker.
  108. ["log_monitor.py", False],
  109. [AGENT_PROCESS_TYPE_DASHBOARD_AGENT, False],
  110. [os.path.join("dashboard", "dashboard.py"), False],
  111. [AGENT_PROCESS_TYPE_RUNTIME_ENV_AGENT, False],
  112. ["ray_process_reaper.py", False],
  113. ["gcs_server", True],
  114. ]
  115. # Max Concurrent SSH Calls to stop Docker
  116. MAX_PARALLEL_SHUTDOWN_WORKERS = env_integer("MAX_PARALLEL_SHUTDOWN_WORKERS", 50)
  117. DISABLE_NODE_UPDATERS_KEY = "disable_node_updaters"
  118. DISABLE_LAUNCH_CONFIG_CHECK_KEY = "disable_launch_config_check"
  119. FOREGROUND_NODE_LAUNCH_KEY = "foreground_node_launch"
  120. WORKER_LIVENESS_CHECK_KEY = "worker_liveness_check"