# defaults.yaml
  # A unique identifier for the head node and workers of this cluster.
  cluster_name: default

  # The maximum number of worker nodes to launch in addition to the head
  # node.
  max_workers: 2

  # The autoscaler will scale up the cluster faster with higher upscaling speed.
  # E.g., if the task requires adding more nodes then autoscaler will gradually
  # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
  # This number should be > 0.
  upscaling_speed: 1.0

  # This executes all commands on all nodes in the docker container,
  # and opens all the necessary ports to support the Ray cluster.
  # An empty value ({}) means disabled.
  docker: {}

  # If a node is idle for this many minutes, it will be removed.
  idle_timeout_minutes: 5
  # Cloud-provider specific configuration.
  provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes will be launched in the first listed availability zone and will
    # be tried in the subsequent availability zones if launching fails.
    availability_zone: us-west-2a,us-west-2b
    # Whether to allow node reuse. If set to false, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: true  # If not present, the default is true.
  # How Ray will authenticate with newly launched nodes.
  auth:
    ssh_user: ubuntu
    # By default Ray creates a new private keypair, but you can also use your own.
    # If you do so, make sure to also set "KeyName" in the head and worker node
    # configurations below.
    # ssh_private_key: /path/to/your/key.pem
  # Tell the autoscaler the allowed node types and the resources they provide.
  # The key is the name of the node type, which is just for debugging purposes.
  # The node config specifies the launch config and physical instance type.
  available_node_types:
    ray.head.default:
      # The node type's CPU and GPU resources are auto-detected based on AWS
      # instance type. If desired, you can override the autodetected CPU and
      # GPU resources advertised to the autoscaler.
      # You can also set custom resources.
      # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units
      # of a resource called "custom", set
      # resources: {"CPU": 1, "GPU": 1, "custom": 5}
      resources: {}
      # Provider-specific config for this node type, e.g. instance type. By default
      # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
      # For more documentation on available fields, see:
      # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
      node_config:
        InstanceType: m5.large
        # You can provision additional disk space with a conf as follows
        BlockDeviceMappings:
          - DeviceName: /dev/sda1
            Ebs:
              VolumeSize: 256
        # Additional options in the boto docs.
    ray.worker.default:
      # The minimum number of nodes of this type to launch.
      # This number should be >= 0.
      min_workers: 0
      # The node type's CPU and GPU resources are auto-detected based on AWS
      # instance type. If desired, you can override the autodetected CPU and
      # GPU resources advertised to the autoscaler.
      # You can also set custom resources.
      # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units
      # of a resource called "custom", set
      # resources: {"CPU": 1, "GPU": 1, "custom": 5}
      resources: {}
      # Provider-specific config for this node type, e.g. instance type. By default
      # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
      # For more documentation on available fields, see:
      # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
      node_config:
        InstanceType: m5.large
        # Run workers on spot by default. Comment this out to use on-demand.
        InstanceMarketOptions:
          MarketType: spot
          # Additional options can be found in the boto docs, e.g.
          #   SpotOptions:
          #     MaxPrice: MAX_HOURLY_PRICE
        # Additional options in the boto docs.
  # Specify the node type of the head node (as configured above).
  head_node_type: ray.head.default
  # Files or directories to copy to the head and worker nodes. The format is a
  # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
  #   "/path1/on/remote/machine": "/path1/on/local/machine",
  #   "/path2/on/remote/machine": "/path2/on/local/machine",
  file_mounts: {}

  # Files or directories to copy from the head node to the worker nodes. The
  # format is a list of paths. The same path on the head node will be copied to
  # the worker node. This behavior is a subset of the file_mounts behavior. In
  # the vast majority of cases you should just use file_mounts. Only use this
  # if you know what you're doing!
  cluster_synced_files: []

  # Whether changes to directories in file_mounts or cluster_synced_files in
  # the head node should sync to the worker node continuously.
  file_mounts_sync_continuously: false

  # Patterns for files to exclude when running rsync up or rsync down.
  rsync_exclude: []

  # Pattern files to use for filtering out files when running rsync up or rsync
  # down. The file is searched for in the source directory and recursively
  # through all subdirectories. For example, if .gitignore is provided as a
  # value, the behavior will match git's behavior for finding and using
  # .gitignore files.
  rsync_filter: []
  # List of commands that will be run before `setup_commands`. If docker is
  # enabled, these commands will run outside the container and before docker
  # is set up.
  initialization_commands: []

  # List of shell commands to run to set up nodes.
  setup_commands:
    # Folded scalar: the two lines below are joined with a single space into
    # one shell command.
    - >-
      (stat $HOME/anaconda3/envs/tensorflow2_p310/ &> /dev/null &&
      echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_p310/bin:$PATH"' >> ~/.bashrc) || true
    - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"

  # Custom commands that will be run on the head node after common setup.
  head_setup_commands:
    - pip install 'boto3>=1.4.8'  # 1.4.8 adds InstanceMarketOptions

  # Custom commands that will be run on worker nodes after common setup.
  worker_setup_commands: []
  # Commands to start ray on the head node. You don't need to change this.
  head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0

  # Commands to start ray on worker nodes. You don't need to change this.
  worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
  123. worker_start_ray_commands:
  124. - ray stop
  125. - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076