prom_metrics.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. from typing import Optional
  2. class NullMetric:
  3. """Mock metric class to be used in case of prometheus_client import error."""
  4. def set(self, *args, **kwargs):
  5. pass
  6. def observe(self, *args, **kwargs):
  7. pass
  8. def inc(self, *args, **kwargs):
  9. pass
  10. def labels(self, *args, **kwargs):
  11. return self
  12. def clear(self):
  13. pass
  14. try:
  15. from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
  16. # The metrics in this class should be kept in sync with
  17. # python/ray/tests/test_metrics_agent.py
  18. class AutoscalerPrometheusMetrics:
  19. def __init__(
  20. self, session_name: str = None, registry: Optional[CollectorRegistry] = None
  21. ):
  22. self.registry: CollectorRegistry = registry or CollectorRegistry(
  23. auto_describe=True
  24. )
  25. self._session_name = session_name
  26. # Buckets: 5 seconds, 10 seconds, 20 seconds, 30 seconds,
  27. # 45 seconds, 1 minute, 1.5 minutes, 2 minutes,
  28. # 3 minutes, 4 minutes, 5 minutes, 6 minutes,
  29. # 8 minutes, 10 minutes, 12 minutes, 15 minutes
  30. # 20 minutes, 25 minutes, 30 minutes
  31. # used for both worker launch time and worker update time
  32. histogram_buckets = [
  33. 5,
  34. 10,
  35. 20,
  36. 30,
  37. 45,
  38. 60,
  39. 90,
  40. 120,
  41. 180,
  42. 240,
  43. 300,
  44. 360,
  45. 480,
  46. 600,
  47. 720,
  48. 900,
  49. 1200,
  50. 1500,
  51. 1800,
  52. ]
  53. # Buckets: .01 seconds to 1000 seconds.
  54. # Used for autoscaler update time.
  55. update_time_buckets = [0.01, 0.1, 1, 10, 100, 1000]
  56. self.worker_create_node_time: Histogram = Histogram(
  57. "worker_create_node_time_seconds",
  58. "Worker launch time. This is the time it takes for a call to "
  59. "a node provider's create_node method to return. Note that "
  60. "when nodes are launched in batches, the launch time for that "
  61. "batch will be observed once for *each* node in that batch. "
  62. "For example, if 8 nodes are launched in 3 minutes, a launch "
  63. "time of 3 minutes will be observed 8 times.",
  64. labelnames=("SessionName",),
  65. unit="seconds",
  66. namespace="autoscaler",
  67. registry=self.registry,
  68. buckets=histogram_buckets,
  69. ).labels(SessionName=session_name)
  70. self.worker_update_time: Histogram = Histogram(
  71. "worker_update_time_seconds",
  72. "Worker update time. This is the time between when an updater "
  73. "thread begins executing and when it exits successfully. This "
  74. "metric only observes times for successful updates.",
  75. labelnames=("SessionName",),
  76. unit="seconds",
  77. namespace="autoscaler",
  78. registry=self.registry,
  79. buckets=histogram_buckets,
  80. ).labels(SessionName=session_name)
  81. self.update_time: Histogram = Histogram(
  82. "update_time",
  83. "Autoscaler update time. This is the time for an autoscaler "
  84. "update iteration to complete.",
  85. labelnames=("SessionName",),
  86. unit="seconds",
  87. namespace="autoscaler",
  88. registry=self.registry,
  89. buckets=update_time_buckets,
  90. ).labels(SessionName=session_name)
  91. self.pending_nodes: Gauge = Gauge(
  92. "pending_nodes",
  93. "Number of nodes pending to be started.",
  94. labelnames=(
  95. "NodeType",
  96. "SessionName",
  97. ),
  98. unit="nodes",
  99. namespace="autoscaler",
  100. registry=self.registry,
  101. )
  102. self.active_nodes: Gauge = Gauge(
  103. "active_nodes",
  104. "Number of nodes in the cluster.",
  105. labelnames=(
  106. "NodeType",
  107. "SessionName",
  108. ),
  109. unit="nodes",
  110. namespace="autoscaler",
  111. registry=self.registry,
  112. )
  113. self.recently_failed_nodes = Gauge(
  114. "recently_failed_nodes",
  115. "The number of recently failed nodes. This count could reset "
  116. "at undefined times.",
  117. labelnames=(
  118. "NodeType",
  119. "SessionName",
  120. ),
  121. unit="nodes",
  122. namespace="autoscaler",
  123. registry=self.registry,
  124. )
  125. self.started_nodes: Counter = Counter(
  126. "started_nodes",
  127. "Number of nodes started.",
  128. labelnames=("SessionName",),
  129. unit="nodes",
  130. namespace="autoscaler",
  131. registry=self.registry,
  132. ).labels(SessionName=session_name)
  133. self.stopped_nodes: Counter = Counter(
  134. "stopped_nodes",
  135. "Number of nodes stopped.",
  136. labelnames=("SessionName",),
  137. unit="nodes",
  138. namespace="autoscaler",
  139. registry=self.registry,
  140. ).labels(SessionName=session_name)
  141. self.updating_nodes: Gauge = Gauge(
  142. "updating_nodes",
  143. "Number of nodes in the process of updating.",
  144. labelnames=("SessionName",),
  145. unit="nodes",
  146. namespace="autoscaler",
  147. registry=self.registry,
  148. ).labels(SessionName=session_name)
  149. self.recovering_nodes: Gauge = Gauge(
  150. "recovering_nodes",
  151. "Number of nodes in the process of recovering.",
  152. labelnames=("SessionName",),
  153. unit="nodes",
  154. namespace="autoscaler",
  155. registry=self.registry,
  156. ).labels(SessionName=session_name)
  157. self.running_workers: Gauge = Gauge(
  158. "running_workers",
  159. "Number of worker nodes running.",
  160. labelnames=("SessionName",),
  161. unit="nodes",
  162. namespace="autoscaler",
  163. registry=self.registry,
  164. ).labels(SessionName=session_name)
  165. self.failed_create_nodes: Counter = Counter(
  166. "failed_create_nodes",
  167. "Number of nodes that failed to be created due to an "
  168. "exception in the node provider's create_node method.",
  169. labelnames=("SessionName",),
  170. unit="nodes",
  171. namespace="autoscaler",
  172. registry=self.registry,
  173. ).labels(SessionName=session_name)
  174. self.failed_updates: Counter = Counter(
  175. "failed_updates",
  176. "Number of failed worker node updates.",
  177. labelnames=("SessionName",),
  178. unit="updates",
  179. namespace="autoscaler",
  180. registry=self.registry,
  181. ).labels(SessionName=session_name)
  182. self.successful_updates: Counter = Counter(
  183. "successful_updates",
  184. "Number of succesfful worker node updates.",
  185. labelnames=("SessionName",),
  186. unit="updates",
  187. namespace="autoscaler",
  188. registry=self.registry,
  189. ).labels(SessionName=session_name)
  190. self.failed_recoveries: Counter = Counter(
  191. "failed_recoveries",
  192. "Number of failed node recoveries.",
  193. labelnames=("SessionName",),
  194. unit="recoveries",
  195. namespace="autoscaler",
  196. registry=self.registry,
  197. ).labels(SessionName=session_name)
  198. self.successful_recoveries: Counter = Counter(
  199. "successful_recoveries",
  200. "Number of successful node recoveries.",
  201. labelnames=("SessionName",),
  202. unit="recoveries",
  203. namespace="autoscaler",
  204. registry=self.registry,
  205. ).labels(SessionName=session_name)
  206. self.update_loop_exceptions: Counter = Counter(
  207. "update_loop_exceptions",
  208. "Number of exceptions raised in the update loop of the autoscaler.",
  209. labelnames=("SessionName",),
  210. unit="exceptions",
  211. namespace="autoscaler",
  212. registry=self.registry,
  213. ).labels(SessionName=session_name)
  214. self.node_launch_exceptions: Counter = Counter(
  215. "node_launch_exceptions",
  216. "Number of exceptions raised while launching nodes.",
  217. labelnames=("SessionName",),
  218. unit="exceptions",
  219. namespace="autoscaler",
  220. registry=self.registry,
  221. ).labels(SessionName=session_name)
  222. self.reset_exceptions: Counter = Counter(
  223. "reset_exceptions",
  224. "Number of exceptions raised while resetting the autoscaler.",
  225. labelnames=("SessionName",),
  226. unit="exceptions",
  227. namespace="autoscaler",
  228. registry=self.registry,
  229. ).labels(SessionName=session_name)
  230. self.config_validation_exceptions: Counter = Counter(
  231. "config_validation_exceptions",
  232. "Number of exceptions raised while validating the config "
  233. "during a reset.",
  234. labelnames=("SessionName",),
  235. unit="exceptions",
  236. namespace="autoscaler",
  237. registry=self.registry,
  238. ).labels(SessionName=session_name)
  239. self.drain_node_exceptions: Counter = Counter(
  240. "drain_node_exceptions",
  241. "Number of exceptions raised when making a DrainNode rpc"
  242. "prior to node termination.",
  243. labelnames=("SessionName",),
  244. unit="exceptions",
  245. namespace="autoscaler",
  246. registry=self.registry,
  247. ).labels(SessionName=session_name)
  248. # This represents the autoscaler's view of essentially
  249. # `ray.cluster_resources()`, it may be slightly different from the
  250. # core metric from an eventual consistency perspective.
  251. self.cluster_resources: Gauge = Gauge(
  252. "cluster_resources",
  253. "Total logical resources in the cluster.",
  254. labelnames=("resource", "SessionName"),
  255. unit="resources",
  256. namespace="autoscaler",
  257. registry=self.registry,
  258. )
  259. # This represents the pending launches + nodes being set up for the
  260. # autoscaler.
  261. self.pending_resources: Gauge = Gauge(
  262. "pending_resources",
  263. "Pending logical resources in the cluster.",
  264. labelnames=("resource", "SessionName"),
  265. unit="resources",
  266. namespace="autoscaler",
  267. registry=self.registry,
  268. )
  269. @property
  270. def session_name(self):
  271. return self._session_name
  272. except ImportError:
  273. class AutoscalerPrometheusMetrics(object):
  274. def __init__(self, session_name: str = None):
  275. pass
  276. def __getattr__(self, attr):
  277. return NullMetric()