# defaults.yaml
  # A unique identifier for the head node and workers of this cluster.
  cluster_name: default

  # The maximum number of worker nodes to launch in addition to the head
  # node.
  max_workers: 2

  # The autoscaler will scale up the cluster faster with higher upscaling speed.
  # E.g., if the task requires adding more nodes then the autoscaler will
  # gradually scale up the cluster in chunks of
  # upscaling_speed*currently_running_nodes.
  # This number should be > 0.
  upscaling_speed: 1.0

  # This executes all commands on all nodes in the docker container,
  # and opens all the necessary ports to support the Ray cluster.
  # Empty object means disabled.
  docker: {}

  # If a node is idle for this many minutes, it will be removed.
  idle_timeout_minutes: 5
  # Cloud-provider specific configuration.
  provider:
    type: azure
    # https://azure.microsoft.com/en-us/global-infrastructure/locations
    location: westus2
    resource_group: ray-cluster
    # Set the subscription id; otherwise the default from the az CLI is used.
    # subscription_id: 00000000-0000-0000-0000-000000000000
    # Set a unique subnet mask; otherwise a random mask is used.
    # subnet_mask: 10.0.0.0/16
    # Set a unique id for resources in this cluster. If not set, a default id
    # is generated based on the resource group and cluster name.
    # unique_id: RAY1
    # Availability zones for VM placement (comma-separated). Examples:
    # availability_zone: "1,2,3"  # Use zones 1, 2, and 3
    # availability_zone: "1"  # Use only zone 1
    # availability_zone: "none"  # Explicitly disable zones
    availability_zone: "auto"  # Let Azure automatically pick zones
  # How Ray will authenticate with newly launched nodes.
  auth:
    ssh_user: ubuntu
    # SSH keys will be auto-generated with Ray-specific names if not specified.
    # Uncomment and specify custom paths if you want to use different existing
    # keys:
    # ssh_private_key: /path/to/your/key.pem
    # ssh_public_key: /path/to/your/key.pub
  # More specific customization to node configurations can be made using the
  # ARM template azure-vm-template.json file.
  # See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
  # Changes to the local file will be used during deployment of the head node;
  # however, worker node deployment occurs on the head node, so changes to the
  # template must be included in the wheel file used in the setup_commands
  # section below.

  # Tell the autoscaler the allowed node types and the resources they provide.
  # The key is the name of the node type, which is just for debugging purposes.
  # The node config specifies the launch config and physical instance type.
  available_node_types:
    ray.head.default:
      # The resources provided by this node type.
      resources: {"CPU": 2}
      # Provider-specific config, e.g. instance type.
      node_config:
        azure_arm_parameters:
          vmSize: Standard_D2s_v3
          # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
          imagePublisher: microsoft-dsvm
          imageOffer: ubuntu-2204
          imageSku: 2204-gen2
          imageVersion: latest
        # Head node: explicitly disable availability zones.
        # NOTE(review): nesting was restored from a flattened listing; confirm
        # that availability_zone belongs directly under node_config.
        availability_zone: "none"
    ray.worker.default:
      # The minimum number of nodes of this type to launch.
      # This number should be >= 0.
      min_workers: 0
      # The resources provided by this node type.
      resources: {"CPU": 2}
      # Provider-specific config, e.g. instance type.
      node_config:
        azure_arm_parameters:
          vmSize: Standard_D2s_v3
          # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
          imagePublisher: microsoft-dsvm
          imageOffer: ubuntu-2204
          imageSku: 2204-gen2
          imageVersion: latest
          # Comment out the lines below to not use Spot instances.
          priority: Spot
          # Set a maximum price for Spot instances if desired.
          # billingProfile:
          #   maxPrice: -1
        # Workers: inherit the provider availability_zone setting.
        # Options: "1,2,3" for specific zones, "none" to disable zones,
        # or "auto" to let Azure pick zones automatically.
  # Specify the node type of the head node (as configured above).
  head_node_type: ray.head.default

  # Files or directories to copy to the head and worker nodes. The format is a
  # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
  #   "/path1/on/remote/machine": "/path1/on/local/machine"
  #   "/path2/on/remote/machine": "/path2/on/local/machine"
  file_mounts: {}

  # Files or directories to copy from the head node to the worker nodes. The
  # format is a list of paths. The same path on the head node will be copied to
  # the worker node. This behavior is a subset of the file_mounts behavior. In
  # the vast majority of cases you should just use file_mounts. Only use this
  # if you know what you're doing!
  cluster_synced_files: []

  # Whether changes to directories in file_mounts or cluster_synced_files in
  # the head node should sync to the worker node continuously.
  file_mounts_sync_continuously: false

  # Patterns for files to exclude when running rsync up or rsync down.
  rsync_exclude: []

  # Pattern files to use for filtering out files when running rsync up or rsync
  # down. The file is searched for in the source directory and recursively
  # through all subdirectories. For example, if .gitignore is provided as a
  # value, the behavior will match git's behavior for finding and using
  # .gitignore files.
  rsync_filter: []
  # List of commands that will be run before `setup_commands`. If docker is
  # enabled, these commands will run outside the container and before docker
  # is set up.
  initialization_commands:
    # Get rid of annoying Ubuntu message.
    - touch ~/.sudo_as_admin_successful
  # List of shell commands to run to set up nodes.
  setup_commands:
    # Note: if you're developing Ray, you probably want to create a VM image
    # that has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # Note: The Ubuntu 22.04 dsvm image has a few venvs already configured but
    # they all contain python modules that are not compatible with Ray at the
    # moment.
    - (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
    - conda tos accept
    - conda create -n ray-env python=3.10 -y
    - conda activate ray-env && echo 'conda activate ray-env' >> ~/.bashrc
    - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"
    # Consider uncommenting these if you also want to run apt-get commands
    # during setup.
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
    # - sudo dpkg --configure -a
  # Custom commands that will be run on the head node after common setup.
  # Installs pinned versions of the Azure SDK packages on the head node.
  head_setup_commands:
    - pip install -U azure-core==1.35.0 azure-identity==1.23.1 azure-mgmt-compute==35.0.0 azure-mgmt-network==29.0.0 azure-mgmt-resource==24.0.0 azure-common==1.1.28 msrest==0.7.1 msrestazure==0.6.4.post1
  # Custom commands that will be run on worker nodes after common setup.
  # Empty list: no worker-specific setup commands are configured.
  worker_setup_commands: []
  # Commands to start Ray on the head node. You don't need to change this.
  # Any running Ray instance is stopped first, then the head is started with
  # a raised open-file limit.
  head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
  # Commands to start Ray on worker nodes. You don't need to change this.
  # Any running Ray instance is stopped first, then the worker connects to the
  # head node address given by $RAY_HEAD_IP on port 6379.
  worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076