metrics.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. from typing import Dict, List, Optional, Tuple, Union
  2. import ray
  3. from ray.serve import context
  4. from ray.util import metrics
  5. from ray.util.annotations import PublicAPI
  6. DEPLOYMENT_TAG = "deployment"
  7. REPLICA_TAG = "replica"
  8. APPLICATION_TAG = "application"
  9. ROUTE_TAG = "route"
  10. def _add_serve_metric_tags(tag_keys: Optional[Tuple[str]] = None) -> Tuple[str]:
  11. """Add serve context tags to the tag_keys"""
  12. if tag_keys is None:
  13. tag_keys = tuple()
  14. # If the context doesn't exist, no serve tag is added.
  15. if context._get_internal_replica_context() is None:
  16. return tag_keys
  17. # Check no collision with customer tag
  18. if DEPLOYMENT_TAG in tag_keys:
  19. raise ValueError(f"'{DEPLOYMENT_TAG}' tag is reserved for Ray Serve metrics")
  20. if REPLICA_TAG in tag_keys:
  21. raise ValueError(f"'{REPLICA_TAG}' tag is reserved for Ray Serve metrics")
  22. if APPLICATION_TAG in tag_keys:
  23. raise ValueError(f"'{APPLICATION_TAG}' tag is reserved for Ray Serve metrics")
  24. # Get serve tag inserted:
  25. ray_serve_tags = (DEPLOYMENT_TAG, REPLICA_TAG)
  26. if context._get_internal_replica_context().app_name:
  27. ray_serve_tags += (APPLICATION_TAG,)
  28. if tag_keys:
  29. tag_keys = ray_serve_tags + tag_keys
  30. else:
  31. tag_keys = ray_serve_tags
  32. return tag_keys
  33. def _add_serve_metric_default_tags(default_tags: Dict[str, str]):
  34. """Add serve context tags and values to the default_tags"""
  35. if context._get_internal_replica_context() is None:
  36. return default_tags
  37. if DEPLOYMENT_TAG in default_tags:
  38. raise ValueError(f"'{DEPLOYMENT_TAG}' tag is reserved for Ray Serve metrics")
  39. if REPLICA_TAG in default_tags:
  40. raise ValueError(f"'{REPLICA_TAG}' tag is reserved for Ray Serve metrics")
  41. if APPLICATION_TAG in default_tags:
  42. raise ValueError(f"'{APPLICATION_TAG}' tag is reserved for Ray Serve metrics")
  43. replica_context = context._get_internal_replica_context()
  44. # TODO(zcin): use replica_context.deployment for deployment tag
  45. default_tags[DEPLOYMENT_TAG] = replica_context.deployment
  46. default_tags[REPLICA_TAG] = replica_context.replica_tag
  47. if replica_context.app_name:
  48. default_tags[APPLICATION_TAG] = replica_context.app_name
  49. return default_tags
  50. def _add_serve_context_tag_values(tag_keys: Tuple, tags: Dict[str, str]):
  51. """Add serve context tag values to the metric tags"""
  52. _request_context = ray.serve.context._get_serve_request_context()
  53. if ROUTE_TAG in tag_keys and ROUTE_TAG not in tags:
  54. tags[ROUTE_TAG] = _request_context.route
  55. @PublicAPI(stability="beta")
  56. class Counter(metrics.Counter):
  57. """A serve cumulative metric that is monotonically increasing.
  58. This corresponds to Prometheus' counter metric:
  59. https://prometheus.io/docs/concepts/metric_types/#counter
  60. Serve-related tags ("deployment", "replica", "application", "route")
  61. are added automatically if not provided.
  62. .. code-block:: python
  63. @serve.deployment
  64. class MyDeployment:
  65. def __init__(self):
  66. self.num_requests = 0
  67. self.my_counter = metrics.Counter(
  68. "my_counter",
  69. description=("The number of odd-numbered requests "
  70. "to this deployment."),
  71. tag_keys=("model",),
  72. )
  73. self.my_counter.set_default_tags({"model": "123"})
  74. def __call__(self):
  75. self.num_requests += 1
  76. if self.num_requests % 2 == 1:
  77. self.my_counter.inc()
  78. .. note::
  79. Before Ray 2.10, this exports a Prometheus gauge metric instead of
  80. a counter metric.
  81. Starting in Ray 2.10, this exports both the proper counter metric
  82. (with a suffix "_total") and gauge metric (for compatibility).
  83. The gauge metric will be removed in a future Ray release and you can set
  84. `RAY_EXPORT_COUNTER_AS_GAUGE=0` to disable exporting it in the meantime.
  85. Args:
  86. name: Name of the metric.
  87. description: Description of the metric.
  88. tag_keys: Tag keys of the metric.
  89. """
  90. def __init__(
  91. self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
  92. ):
  93. if tag_keys and not isinstance(tag_keys, tuple):
  94. raise TypeError(
  95. "tag_keys should be a tuple type, got: " f"{type(tag_keys)}"
  96. )
  97. tag_keys = _add_serve_metric_tags(tag_keys)
  98. super().__init__(name, description, tag_keys)
  99. self.set_default_tags({})
  100. def set_default_tags(self, default_tags: Dict[str, str]):
  101. super().set_default_tags(_add_serve_metric_default_tags(default_tags))
  102. def inc(self, value: Union[int, float] = 1.0, tags: Dict[str, str] = None):
  103. """Increment the counter by the given value, add serve context
  104. tag values to the tags
  105. """
  106. _add_serve_context_tag_values(self._tag_keys, tags)
  107. super().inc(value, tags)
  108. @PublicAPI(stability="beta")
  109. class Gauge(metrics.Gauge):
  110. """Gauges keep the last recorded value and drop everything before.
  111. This corresponds to Prometheus' gauge metric:
  112. https://prometheus.io/docs/concepts/metric_types/#gauge
  113. Serve-related tags ("deployment", "replica", "application", "route")
  114. are added automatically if not provided.
  115. .. code-block:: python
  116. @serve.deployment
  117. class MyDeployment:
  118. def __init__(self):
  119. self.num_requests = 0
  120. self.my_gauge = metrics.Gauge(
  121. "my_gauge",
  122. description=("The current memory usage."),
  123. tag_keys=("model",),
  124. )
  125. self.my_counter.set_default_tags({"model": "123"})
  126. def __call__(self):
  127. process = psutil.Process()
  128. self.gauge.set(process.memory_info().rss)
  129. Args:
  130. name: Name of the metric.
  131. description: Description of the metric.
  132. tag_keys: Tag keys of the metric.
  133. """
  134. def __init__(
  135. self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
  136. ):
  137. if tag_keys and not isinstance(tag_keys, tuple):
  138. raise TypeError(
  139. "tag_keys should be a tuple type, got: " f"{type(tag_keys)}"
  140. )
  141. tag_keys = _add_serve_metric_tags(tag_keys)
  142. super().__init__(name, description, tag_keys)
  143. self.set_default_tags({})
  144. def set_default_tags(self, default_tags: Dict[str, str]):
  145. super().set_default_tags(_add_serve_metric_default_tags(default_tags))
  146. def set(self, value: Union[int, float], tags: Dict[str, str] = None):
  147. """Set the gauge to the given value, add serve context
  148. tag values to the tags
  149. """
  150. _add_serve_context_tag_values(self._tag_keys, tags)
  151. super().set(value, tags)
  152. @PublicAPI(stability="beta")
  153. class Histogram(metrics.Histogram):
  154. """Tracks the size and number of events in buckets.
  155. Histograms allow you to calculate aggregate quantiles
  156. such as 25, 50, 95, 99 percentile latency for an RPC.
  157. This corresponds to Prometheus' histogram metric:
  158. https://prometheus.io/docs/concepts/metric_types/#histogram
  159. Serve-related tags ("deployment", "replica", "application", "route")
  160. are added automatically if not provided.
  161. .. code-block:: python
  162. @serve.deployment
  163. class MyDeployment:
  164. def __init__(self):
  165. self.my_histogram = Histogram(
  166. "my_histogram",
  167. description=("Histogram of the __call__ method running time."),
  168. boundaries=[1,2,4,8,16,32,64],
  169. tag_keys=("model",),
  170. )
  171. self.my_histogram.set_default_tags({"model": "123"})
  172. def __call__(self):
  173. start = time.time()
  174. self.my_histogram.observe(time.time() - start)
  175. Args:
  176. name: Name of the metric.
  177. description: Description of the metric.
  178. boundaries: Boundaries of histogram buckets.
  179. tag_keys: Tag keys of the metric.
  180. """
  181. def __init__(
  182. self,
  183. name: str,
  184. description: str = "",
  185. boundaries: List[float] = None,
  186. tag_keys: Optional[Tuple[str]] = None,
  187. ):
  188. if tag_keys and not isinstance(tag_keys, tuple):
  189. raise TypeError(
  190. "tag_keys should be a tuple type, got: " f"{type(tag_keys)}"
  191. )
  192. tag_keys = _add_serve_metric_tags(tag_keys)
  193. super().__init__(name, description, boundaries, tag_keys)
  194. self.set_default_tags({})
  195. def set_default_tags(self, default_tags: Dict[str, str]):
  196. super().set_default_tags(_add_serve_metric_default_tags(default_tags))
  197. def observe(self, value: Union[int, float], tags: Dict[str, str] = None):
  198. """Observe the given value, add serve context
  199. tag values to the tags
  200. """
  201. _add_serve_context_tag_values(self._tag_keys, tags)
  202. super().observe(value, tags)