parser.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. import io as StringIO
  2. import re
  3. import string
  4. from typing import Dict, Iterable, List, Match, Optional, TextIO, Tuple
  5. from .metrics_core import Metric
  6. from .samples import Sample
  7. from .validation import (
  8. _is_valid_legacy_metric_name, _validate_labelname, _validate_metric_name,
  9. )
  10. def text_string_to_metric_families(text: str) -> Iterable[Metric]:
  11. """Parse Prometheus text format from a unicode string.
  12. See text_fd_to_metric_families.
  13. """
  14. yield from text_fd_to_metric_families(StringIO.StringIO(text))
  15. ESCAPE_SEQUENCES = {
  16. '\\\\': '\\',
  17. '\\n': '\n',
  18. '\\"': '"',
  19. }
  20. def replace_escape_sequence(match: Match[str]) -> str:
  21. return ESCAPE_SEQUENCES[match.group(0)]
  22. HELP_ESCAPING_RE = re.compile(r'\\[\\n]')
  23. ESCAPING_RE = re.compile(r'\\[\\n"]')
  24. def _replace_help_escaping(s: str) -> str:
  25. return HELP_ESCAPING_RE.sub(replace_escape_sequence, s)
  26. def _replace_escaping(s: str) -> str:
  27. return ESCAPING_RE.sub(replace_escape_sequence, s)
  28. def _is_character_escaped(s: str, charpos: int) -> bool:
  29. num_bslashes = 0
  30. while (charpos > num_bslashes
  31. and s[charpos - 1 - num_bslashes] == '\\'):
  32. num_bslashes += 1
  33. return num_bslashes % 2 == 1
  34. def parse_labels(labels_string: str, openmetrics: bool = False) -> Dict[str, str]:
  35. labels: Dict[str, str] = {}
  36. # Copy original labels
  37. sub_labels = labels_string.strip()
  38. if openmetrics and sub_labels and sub_labels[0] == ',':
  39. raise ValueError("leading comma: " + labels_string)
  40. try:
  41. # Process one label at a time
  42. while sub_labels:
  43. # The label name is before the equal, or if there's no equal, that's the
  44. # metric name.
  45. name_term, value_term, sub_labels = _next_term(sub_labels, openmetrics)
  46. if not value_term:
  47. if openmetrics:
  48. raise ValueError("empty term in line: " + labels_string)
  49. continue
  50. label_name, quoted_name = _unquote_unescape(name_term)
  51. if not quoted_name and not _is_valid_legacy_metric_name(label_name):
  52. raise ValueError("unquoted UTF-8 metric name")
  53. # Check for missing quotes
  54. if not value_term or value_term[0] != '"':
  55. raise ValueError
  56. # The first quote is guaranteed to be after the equal.
  57. # Make sure that the next unescaped quote is the last character.
  58. i = 1
  59. while i < len(value_term):
  60. i = value_term.index('"', i)
  61. if not _is_character_escaped(value_term[:i], i):
  62. break
  63. i += 1
  64. # The label value is between the first and last quote
  65. quote_end = i + 1
  66. if quote_end != len(value_term):
  67. raise ValueError("unexpected text after quote: " + labels_string)
  68. label_value, _ = _unquote_unescape(value_term)
  69. if label_name == '__name__':
  70. _validate_metric_name(label_name)
  71. else:
  72. _validate_labelname(label_name)
  73. if label_name in labels:
  74. raise ValueError("invalid line, duplicate label name: " + labels_string)
  75. labels[label_name] = label_value
  76. return labels
  77. except ValueError:
  78. raise ValueError("Invalid labels: " + labels_string)
  79. def _next_term(text: str, openmetrics: bool) -> Tuple[str, str, str]:
  80. """Extract the next comma-separated label term from the text. The results
  81. are stripped terms for the label name, label value, and then the remainder
  82. of the string including the final , or }.
  83. Raises ValueError if the term is empty and we're in openmetrics mode.
  84. """
  85. # There may be a leading comma, which is fine here.
  86. if text[0] == ',':
  87. text = text[1:]
  88. if not text:
  89. return "", "", ""
  90. if text[0] == ',':
  91. raise ValueError("multiple commas")
  92. splitpos = _next_unquoted_char(text, '=,}')
  93. if splitpos >= 0 and text[splitpos] == "=":
  94. labelname = text[:splitpos]
  95. text = text[splitpos + 1:]
  96. splitpos = _next_unquoted_char(text, ',}')
  97. else:
  98. labelname = "__name__"
  99. if splitpos == -1:
  100. splitpos = len(text)
  101. term = text[:splitpos]
  102. if not term and openmetrics:
  103. raise ValueError("empty term:", term)
  104. rest = text[splitpos:]
  105. return labelname, term.strip(), rest.strip()
  106. def _next_unquoted_char(text: str, chs: Optional[str], startidx: int = 0) -> int:
  107. """Return position of next unquoted character in tuple, or -1 if not found.
  108. It is always assumed that the first character being checked is not already
  109. inside quotes.
  110. """
  111. in_quotes = False
  112. if chs is None:
  113. chs = string.whitespace
  114. for i, c in enumerate(text[startidx:]):
  115. if c == '"' and not _is_character_escaped(text, startidx + i):
  116. in_quotes = not in_quotes
  117. if not in_quotes:
  118. if c in chs:
  119. return startidx + i
  120. return -1
  121. def _last_unquoted_char(text: str, chs: Optional[str]) -> int:
  122. """Return position of last unquoted character in list, or -1 if not found."""
  123. i = len(text) - 1
  124. in_quotes = False
  125. if chs is None:
  126. chs = string.whitespace
  127. while i > 0:
  128. if text[i] == '"' and not _is_character_escaped(text, i):
  129. in_quotes = not in_quotes
  130. if not in_quotes:
  131. if text[i] in chs:
  132. return i
  133. i -= 1
  134. return -1
  135. def _split_quoted(text, separator, maxsplit=0):
  136. """Splits on split_ch similarly to strings.split, skipping separators if
  137. they are inside quotes.
  138. """
  139. tokens = ['']
  140. x = 0
  141. while x < len(text):
  142. split_pos = _next_unquoted_char(text, separator, x)
  143. if split_pos == -1:
  144. tokens[-1] = text[x:]
  145. x = len(text)
  146. continue
  147. # If the first character is the separator keep going. This happens when
  148. # there are double whitespace characters separating symbols.
  149. if split_pos == x:
  150. x += 1
  151. continue
  152. if maxsplit > 0 and len(tokens) > maxsplit:
  153. tokens[-1] = text[x:]
  154. break
  155. tokens[-1] = text[x:split_pos]
  156. x = split_pos + 1
  157. tokens.append('')
  158. return tokens
  159. def _unquote_unescape(text):
  160. """Returns the string, and true if it was quoted."""
  161. if not text:
  162. return text, False
  163. quoted = False
  164. text = text.strip()
  165. if text[0] == '"':
  166. if len(text) == 1 or text[-1] != '"':
  167. raise ValueError("missing close quote")
  168. text = text[1:-1]
  169. quoted = True
  170. if "\\" in text:
  171. text = _replace_escaping(text)
  172. return text, quoted
  173. # If we have multiple values only consider the first
  174. def _parse_value_and_timestamp(s: str) -> Tuple[float, Optional[float]]:
  175. s = s.lstrip()
  176. separator = " "
  177. if separator not in s:
  178. separator = "\t"
  179. values = [value.strip() for value in s.split(separator) if value.strip()]
  180. if not values:
  181. return float(s), None
  182. value = _parse_value(values[0])
  183. timestamp = (_parse_value(values[-1]) / 1000) if len(values) > 1 else None
  184. return value, timestamp
  185. def _parse_value(value):
  186. value = ''.join(value)
  187. if value != value.strip() or '_' in value:
  188. raise ValueError(f"Invalid value: {value!r}")
  189. try:
  190. return int(value)
  191. except ValueError:
  192. return float(value)
  193. def _parse_sample(text):
  194. separator = " # "
  195. # Detect the labels in the text
  196. label_start = _next_unquoted_char(text, '{')
  197. if label_start == -1 or separator in text[:label_start]:
  198. # We don't have labels, but there could be an exemplar.
  199. name_end = _next_unquoted_char(text, ' \t')
  200. name = text[:name_end].strip()
  201. if not _is_valid_legacy_metric_name(name):
  202. raise ValueError("invalid metric name:" + text)
  203. # Parse the remaining text after the name
  204. remaining_text = text[name_end + 1:]
  205. value, timestamp = _parse_value_and_timestamp(remaining_text)
  206. return Sample(name, {}, value, timestamp)
  207. name = text[:label_start].strip()
  208. label_end = _next_unquoted_char(text[label_start:], '}') + label_start
  209. labels = parse_labels(text[label_start + 1:label_end], False)
  210. if not name:
  211. # Name might be in the labels
  212. if '__name__' not in labels:
  213. raise ValueError
  214. name = labels['__name__']
  215. del labels['__name__']
  216. elif '__name__' in labels:
  217. raise ValueError("metric name specified more than once")
  218. # Parsing labels succeeded, continue parsing the remaining text
  219. remaining_text = text[label_end + 1:]
  220. value, timestamp = _parse_value_and_timestamp(remaining_text)
  221. return Sample(name, labels, value, timestamp)
  222. def text_fd_to_metric_families(fd: TextIO) -> Iterable[Metric]:
  223. """Parse Prometheus text format from a file descriptor.
  224. This is a laxer parser than the main Go parser,
  225. so successful parsing does not imply that the parsed
  226. text meets the specification.
  227. Yields Metric's.
  228. """
  229. name = ''
  230. documentation = ''
  231. typ = 'untyped'
  232. samples: List[Sample] = []
  233. allowed_names = []
  234. def build_metric(name: str, documentation: str, typ: str, samples: List[Sample]) -> Metric:
  235. # Munge counters into OpenMetrics representation
  236. # used internally.
  237. if typ == 'counter':
  238. if name.endswith('_total'):
  239. name = name[:-6]
  240. else:
  241. new_samples = []
  242. for s in samples:
  243. new_samples.append(Sample(s[0] + '_total', *s[1:]))
  244. samples = new_samples
  245. metric = Metric(name, documentation, typ)
  246. metric.samples = samples
  247. return metric
  248. for line in fd:
  249. line = line.strip()
  250. if line.startswith('#'):
  251. parts = _split_quoted(line, None, 3)
  252. if len(parts) < 2:
  253. continue
  254. candidate_name, quoted = '', False
  255. if len(parts) > 2:
  256. # Ignore comment tokens
  257. if parts[1] != 'TYPE' and parts[1] != 'HELP':
  258. continue
  259. candidate_name, quoted = _unquote_unescape(parts[2])
  260. if not quoted and not _is_valid_legacy_metric_name(candidate_name):
  261. raise ValueError
  262. if parts[1] == 'HELP':
  263. if candidate_name != name:
  264. if name != '':
  265. yield build_metric(name, documentation, typ, samples)
  266. # New metric
  267. name = candidate_name
  268. typ = 'untyped'
  269. samples = []
  270. allowed_names = [candidate_name]
  271. if len(parts) == 4:
  272. documentation = _replace_help_escaping(parts[3])
  273. else:
  274. documentation = ''
  275. elif parts[1] == 'TYPE':
  276. if len(parts) < 4:
  277. raise ValueError
  278. if candidate_name != name:
  279. if name != '':
  280. yield build_metric(name, documentation, typ, samples)
  281. # New metric
  282. name = candidate_name
  283. documentation = ''
  284. samples = []
  285. typ = parts[3]
  286. allowed_names = {
  287. 'counter': [''],
  288. 'gauge': [''],
  289. 'summary': ['_count', '_sum', ''],
  290. 'histogram': ['_count', '_sum', '_bucket'],
  291. }.get(typ, [''])
  292. allowed_names = [name + n for n in allowed_names]
  293. elif line == '':
  294. # Ignore blank lines
  295. pass
  296. else:
  297. sample = _parse_sample(line)
  298. if sample.name not in allowed_names:
  299. if name != '':
  300. yield build_metric(name, documentation, typ, samples)
  301. # New metric, yield immediately as untyped singleton
  302. name = ''
  303. documentation = ''
  304. typ = 'untyped'
  305. samples = []
  306. allowed_names = []
  307. yield build_metric(sample[0], documentation, typ, [sample])
  308. else:
  309. samples.append(sample)
  310. if name != '':
  311. yield build_metric(name, documentation, typ, samples)