defaults.yaml 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. # A unique identifier for the head node and workers of this cluster.
  2. cluster_name: default
  3. # The maximum number of worker nodes to launch in addition to the head
  4. # node.
  5. max_workers: 2
  6. # The autoscaler will scale up the cluster faster with higher upscaling speed.
  7. # E.g., if the task requires adding more nodes, then the autoscaler will gradually
  8. # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
  9. # This number should be > 0.
  10. upscaling_speed: 1.0
  11. # This executes all commands on all nodes in the docker container,
  12. # and opens all the necessary ports to support the Ray cluster.
  13. # An empty mapping ({}), as below, means docker is disabled.
  14. docker: {}
  15. # If a node is idle for this many minutes, it will be removed.
  16. idle_timeout_minutes: 5
  17. # Cloud-provider specific configuration.
  18. provider:
  19. type: gcp
  20. region: us-west1
  21. availability_zone: us-west1-a
  22. project_id: null # Globally unique project id
  23. # How Ray will authenticate with newly launched nodes.
  24. auth:
  25. ssh_user: ubuntu
  26. # By default Ray creates a new private keypair, but you can also use your own.
  27. # If you do so, make sure to also set "KeyName" in the head and worker node
  28. # configurations below. This requires that you have added the key into the
  29. # project-wide metadata.
  30. # ssh_private_key: /path/to/your/key.pem
  31. # Tell the autoscaler the allowed node types and the resources they provide.
  32. # The key is the name of the node type, which is just for debugging purposes.
  33. # The node config specifies the launch config and physical instance type.
  34. available_node_types:
  35. ray_head_default:
  36. # The resources provided by this node type.
  37. resources: {"CPU": 2}
  38. # Provider-specific config for this node type, e.g. instance type. By default
  39. # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
  40. # For more documentation on available fields, see:
  41. # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
  42. node_config:
  43. machineType: n1-standard-2
  44. disks:
  45. - boot: true
  46. autoDelete: true
  47. type: PERSISTENT
  48. initializeParams:
  49. diskSizeGb: 50
  50. # See https://cloud.google.com/compute/docs/images for more images
  51. sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
  52. # Additional options can be found in the compute docs at
  53. # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
  54. # If the network interface is specified as below in both head and worker
  55. # nodes, the manual network config is used. Otherwise an existing subnet is
  56. # used. To use a shared subnet, ask the subnet owner to grant permission
  57. # for 'compute.subnetworks.use' to the ray autoscaler account...
  58. # networkInterfaces:
  59. # - kind: compute#networkInterface
  60. # subnetwork: path/to/subnet
  61. # aliasIpRanges: []
  62. ray_worker_small:
  63. # The minimum number of nodes of this type to launch.
  64. # This number should be >= 0.
  65. min_workers: 0
  66. # The resources provided by this node type.
  67. resources: {"CPU": 2}
  68. # Provider-specific config for this node type, e.g. instance type. By default
  69. # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
  70. # For more documentation on available fields, see:
  71. # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
  72. node_config:
  73. machineType: n1-standard-2
  74. disks:
  75. - boot: true
  76. autoDelete: true
  77. type: PERSISTENT
  78. initializeParams:
  79. diskSizeGb: 50
  80. # See https://cloud.google.com/compute/docs/images for more images
  81. sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu
  82. # Run workers on preemptible instances by default.
  83. # Comment this out to use on-demand.
  84. scheduling:
  85. - preemptible: true
  86. # Additional options can be found in the compute docs at
  87. # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
  88. # Specify the node type of the head node (as configured above).
  89. head_node_type: ray_head_default
  90. # Files or directories to copy to the head and worker nodes. The format is a
  91. # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
  92. file_mounts: {
  93. # "/path1/on/remote/machine": "/path1/on/local/machine",
  94. # "/path2/on/remote/machine": "/path2/on/local/machine",
  95. }
  96. # Files or directories to copy from the head node to the worker nodes. The format is a
  97. # list of paths. The same path on the head node will be copied to the worker node.
  98. # This behavior is a subset of the file_mounts behavior. In the vast majority of cases
  99. # you should just use file_mounts. Only use this if you know what you're doing!
  100. cluster_synced_files: []
  101. # Whether changes to directories in file_mounts or cluster_synced_files on the head node
  102. # should be synced to the worker nodes continuously.
  103. file_mounts_sync_continuously: false
  104. # Patterns for files to exclude when running rsync up or rsync down
  105. rsync_exclude: []
  106. # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
  107. # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
  108. # as a value, the behavior will match git's behavior for finding and using .gitignore files.
  109. rsync_filter: []
  110. # List of commands that will be run before `setup_commands`. If docker is
  111. # enabled, these commands will run outside the container and before docker
  112. # is set up.
  113. initialization_commands: []
  114. # List of shell commands to run to set up nodes.
  115. setup_commands:
  116. # Note: if you're developing Ray, you probably want to create a VM image that
  117. # has your Ray repo pre-cloned. Then, you can replace the pip installs
  118. # below with a git checkout <your_sha> (and possibly a recompile).
  119. # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
  120. # Install ray if not present
  121. - >-
  122. (stat /opt/conda/bin/ &> /dev/null &&
  123. echo 'export PATH="/opt/conda/bin:$PATH"' >> ~/.bashrc) || true
  124. - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
  125. # Custom commands that will be run on the head node after common setup.
  126. head_setup_commands:
  127. - pip install google-api-python-client==1.7.8
  128. # Custom commands that will be run on worker nodes after common setup.
  129. worker_setup_commands: []
  130. # Command to start ray on the head node. You don't need to change this.
  131. head_start_ray_commands:
  132. - ray stop
  133. - >-
  134. ulimit -n 65536;
  135. ray start
  136. --head
  137. --port=6379
  138. --object-manager-port=8076
  139. --autoscaling-config=~/ray_bootstrap_config.yaml
  140. --dashboard-host=0.0.0.0
  141. # Command to start ray on worker nodes. You don't need to change this.
  142. worker_start_ray_commands:
  143. - ray stop
  144. - >-
  145. ulimit -n 65536;
  146. ray start
  147. --address=$RAY_HEAD_IP:6379
  148. --object-manager-port=8076