dashboard_metrics.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. from typing import Optional
  2. from ray.dashboard.consts import COMPONENT_METRICS_TAG_KEYS
  3. class NullMetric:
  4. """Mock metric class to be used in case of prometheus_client import error."""
  5. def set(self, *args, **kwargs):
  6. pass
  7. def observe(self, *args, **kwargs):
  8. pass
  9. def inc(self, *args, **kwargs):
  10. pass
  11. try:
  12. from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
  13. # The metrics in this class should be kept in sync with
  14. # python/ray/tests/test_metrics_agent.py
  15. class DashboardPrometheusMetrics:
  16. def __init__(self, registry: Optional[CollectorRegistry] = None):
  17. self.registry: CollectorRegistry = registry or CollectorRegistry(
  18. auto_describe=True
  19. )
  20. # Buckets: 5ms, 10ms, 25ms, 50ms, 75ms
  21. # 100ms, 250ms, 500ms, 750ms
  22. # 1s, 2.5s, 5s, 7.5s, 10s
  23. # 20s, 40s, 60s
  24. # used for API duration
  25. histogram_buckets_s = [
  26. 0.005,
  27. 0.01,
  28. 0.025,
  29. 0.05,
  30. 0.075,
  31. 0.1,
  32. 0.25,
  33. 0.5,
  34. 0.75,
  35. 1,
  36. 2.5,
  37. 5,
  38. 7.5,
  39. 10,
  40. 20,
  41. 40,
  42. 60,
  43. ]
  44. self.metrics_request_duration = Histogram(
  45. "dashboard_api_requests_duration_seconds",
  46. "Total duration in seconds per endpoint",
  47. ("endpoint", "http_status", "Version", "SessionName", "Component"),
  48. unit="seconds",
  49. namespace="ray",
  50. registry=self.registry,
  51. buckets=histogram_buckets_s,
  52. )
  53. self.metrics_request_count = Counter(
  54. "dashboard_api_requests_count",
  55. "Total requests count per endpoint",
  56. (
  57. "method",
  58. "endpoint",
  59. "http_status",
  60. "Version",
  61. "SessionName",
  62. "Component",
  63. ),
  64. unit="requests",
  65. namespace="ray",
  66. registry=self.registry,
  67. )
  68. self.metrics_event_loop_tasks = Gauge(
  69. "dashboard_event_loop_tasks",
  70. "Number of tasks currently pending in the event loop's queue.",
  71. tuple(COMPONENT_METRICS_TAG_KEYS),
  72. unit="tasks",
  73. namespace="ray",
  74. registry=self.registry,
  75. )
  76. self.metrics_event_loop_lag = Gauge(
  77. "dashboard_event_loop_lag",
  78. "Event loop lag in seconds.",
  79. tuple(COMPONENT_METRICS_TAG_KEYS),
  80. unit="seconds",
  81. namespace="ray",
  82. registry=self.registry,
  83. )
  84. self.metrics_dashboard_cpu = Gauge(
  85. "component_cpu",
  86. "Dashboard CPU percentage usage.",
  87. tuple(COMPONENT_METRICS_TAG_KEYS),
  88. unit="percentage",
  89. namespace="ray",
  90. registry=self.registry,
  91. )
  92. self.metrics_dashboard_mem_uss = Gauge(
  93. "component_uss",
  94. "USS usage of all components on the node.",
  95. tuple(COMPONENT_METRICS_TAG_KEYS),
  96. unit="mb",
  97. namespace="ray",
  98. registry=self.registry,
  99. )
  100. self.metrics_dashboard_mem_rss = Gauge(
  101. "component_rss",
  102. "RSS usage of all components on the node.",
  103. tuple(COMPONENT_METRICS_TAG_KEYS),
  104. unit="mb",
  105. namespace="ray",
  106. registry=self.registry,
  107. )
  108. except ImportError:
  109. class DashboardPrometheusMetrics(object):
  110. def __getattr__(self, attr):
  111. return NullMetric()