| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653 |
- #!/usr/bin/env python
- import io as StringIO
- import math
- import re
- from ..metrics_core import Metric
- from ..parser import (
- _last_unquoted_char, _next_unquoted_char, _parse_value, _split_quoted,
- _unquote_unescape, parse_labels,
- )
- from ..samples import BucketSpan, Exemplar, NativeHistogram, Sample, Timestamp
- from ..utils import floatToGoString
- from ..validation import _is_valid_legacy_metric_name, _validate_metric_name
def text_string_to_metric_families(text):
    """Parse OpenMetrics text exposition held in a unicode string.

    The string is wrapped in an in-memory text stream and delegated to
    text_fd_to_metric_families; see that function for the details.
    """
    stream = StringIO.StringIO(text)
    yield from text_fd_to_metric_families(stream)
- _CANONICAL_NUMBERS = {float("inf")}
- def _isUncanonicalNumber(s):
- f = float(s)
- if f not in _CANONICAL_NUMBERS:
- return False # Only the canonical numbers are required to be canonical.
- return s != floatToGoString(f)
- ESCAPE_SEQUENCES = {
- '\\\\': '\\',
- '\\n': '\n',
- '\\"': '"',
- }
- def _replace_escape_sequence(match):
- return ESCAPE_SEQUENCES[match.group(0)]
- ESCAPING_RE = re.compile(r'\\[\\n"]')
- def _replace_escaping(s):
- return ESCAPING_RE.sub(_replace_escape_sequence, s)
- def _unescape_help(text):
- result = []
- slash = False
- for char in text:
- if slash:
- if char == '\\':
- result.append('\\')
- elif char == '"':
- result.append('"')
- elif char == 'n':
- result.append('\n')
- else:
- result.append('\\' + char)
- slash = False
- else:
- if char == '\\':
- slash = True
- else:
- result.append(char)
- if slash:
- result.append('\\')
- return ''.join(result)
- def _parse_timestamp(timestamp):
- timestamp = ''.join(timestamp)
- if not timestamp:
- return None
- if timestamp != timestamp.strip() or '_' in timestamp:
- raise ValueError(f"Invalid timestamp: {timestamp!r}")
- try:
- # Simple int.
- return Timestamp(int(timestamp), 0)
- except ValueError:
- try:
- # aaaa.bbbb. Nanosecond resolution supported.
- parts = timestamp.split('.', 1)
- return Timestamp(int(parts[0]), int(parts[1][:9].ljust(9, "0")))
- except ValueError:
- # Float.
- ts = float(timestamp)
- if math.isnan(ts) or math.isinf(ts):
- raise ValueError(f"Invalid timestamp: {timestamp!r}")
- return ts
- def _is_character_escaped(s, charpos):
- num_bslashes = 0
- while (charpos > num_bslashes
- and s[charpos - 1 - num_bslashes] == '\\'):
- num_bslashes += 1
- return num_bslashes % 2 == 1
def _parse_sample(text):
    """Parse one sample line into a Sample.

    Handles both ``name value ...`` and ``name{labels} value ...`` forms; in
    the labelled form the metric name may instead be given through a
    ``__name__`` label. Everything after the name/labels is handed to
    _parse_remaining_text, which yields the value, timestamp and exemplar.

    Raises:
        ValueError: if the line has no value separator, the metric name is
            invalid or missing, the label set is not closed, the name is
            given both before the braces and as ``__name__``, or the rest of
            the line cannot be parsed.
    """
    separator = " # "
    # Detect the labels in the text
    label_start = _next_unquoted_char(text, '{')
    if label_start == -1 or separator in text[:label_start]:
        # We don't have labels, but there could be an exemplar (whose own
        # label set is the '{' found after the separator).
        name_end = _next_unquoted_char(text, ' ')
        if name_end == -1:
            # Without this guard the -1 would be used as a slice bound: the
            # name would lose its last character and the whole line would be
            # parsed as the value, so a bare "nan" or "inf" line would be
            # accepted as a sample.
            raise ValueError("Invalid line: " + text)
        name = text[:name_end]
        if not _is_valid_legacy_metric_name(name):
            raise ValueError("invalid metric name:" + text)
        # Parse the remaining text after the name
        remaining_text = text[name_end + 1:]
        value, timestamp, exemplar = _parse_remaining_text(remaining_text)
        return Sample(name, {}, value, timestamp, exemplar)

    name = text[:label_start]
    label_end = _next_unquoted_char(text, '}')
    if label_end == -1:
        # An unterminated label set must not be sliced with -1.
        raise ValueError("Invalid line, unterminated labels: " + text)
    labels = parse_labels(text[label_start + 1:label_end], True)
    if not name:
        # Name might be in the labels
        if '__name__' not in labels:
            raise ValueError("Invalid line, missing metric name: " + text)
        name = labels['__name__']
        del labels['__name__']
    elif '__name__' in labels:
        raise ValueError("metric name specified more than once")
    # Parsing labels succeeded, continue parsing the remaining text.
    # Skip the closing brace and the single character that follows it.
    remaining_text = text[label_end + 2:]
    value, timestamp, exemplar = _parse_remaining_text(remaining_text)
    return Sample(name, labels, value, timestamp, exemplar)
def _parse_remaining_text(text):
    """Parse the part of a sample line that follows the name and labels.

    *text* starts with the sample value, optionally followed by a timestamp
    and/or an exemplar of the form ``# {labels} value [timestamp]``.

    Returns a ``(value, timestamp, exemplar)`` tuple in which timestamp and
    exemplar are None when absent. Raises ValueError for malformed input or
    when the exemplar label names and values together exceed 128 characters.
    """
    value_text, space, rest = text.partition(" ")
    val = _parse_value(value_text)
    if not space:
        # We don't have timestamp or exemplar
        return val, None, None

    ts_chars = []
    ex_value_chars = []
    ex_ts_chars = []
    ex_labels = None

    state = 'timestamp'
    quoted = False
    for ch in rest:
        # Characters inside double quotes never drive the state machine; the
        # closing quote itself is processed like any other character.
        if ch == '"':
            quoted = not quoted
        if quoted:
            continue

        if state == 'timestamp':
            if ch == '#' and not ts_chars:
                state = 'exemplarspace'
            elif ch == ' ':
                state = 'exemplarhash'
            else:
                ts_chars.append(ch)
        elif state == 'exemplarhash':
            if ch != '#':
                raise ValueError("Invalid line: " + rest)
            state = 'exemplarspace'
        elif state == 'exemplarspace':
            if ch != ' ':
                raise ValueError("Invalid line: " + rest)
            state = 'exemplarstartoflabels'
        elif state == 'exemplarstartoflabels':
            if ch != '{':
                raise ValueError("Invalid line: " + rest)
            # Parse the whole exemplar label set in one go; the loop then
            # just skips ahead to the closing brace.
            label_start = _next_unquoted_char(rest, '{')
            label_end = _last_unquoted_char(rest, '}')
            ex_labels = parse_labels(rest[label_start + 1:label_end], True)
            state = 'exemplarparsedlabels'
        elif state == 'exemplarparsedlabels':
            if ch == '}':
                state = 'exemplarvaluespace'
        elif state == 'exemplarvaluespace':
            if ch != ' ':
                raise ValueError("Invalid line: " + rest)
            state = 'exemplarvalue'
        elif state == 'exemplarvalue':
            if ch == ' ':
                if not ex_value_chars:
                    raise ValueError("Invalid line: " + rest)
                state = 'exemplartimestamp'
            else:
                ex_value_chars.append(ch)
        elif state == 'exemplartimestamp':
            ex_ts_chars.append(ch)

    # Trailing space after the value or after the exemplar value.
    if ((state == 'timestamp' and not ts_chars)
            or (state == 'exemplartimestamp' and not ex_ts_chars)):
        raise ValueError("Invalid line: " + rest)

    # Incomplete exemplar.
    if state in ('exemplarhash', 'exemplarspace', 'exemplarstartoflabels', 'exemplarparsedlabels'):
        raise ValueError("Invalid line: " + rest)

    ts = _parse_timestamp(ts_chars)
    exemplar = None
    if ex_labels is not None:
        labels_size = sum(len(k) + len(v) for k, v in ex_labels.items())
        if labels_size > 128:
            raise ValueError("Exemplar labels are too long: " + rest)
        exemplar = Exemplar(
            ex_labels,
            _parse_value(ex_value_chars),
            _parse_timestamp(ex_ts_chars),
        )
    return val, ts, exemplar
def _parse_nh_sample(text, suffixes):
    """Parse *text* as a native histogram sample, if it is one.

    Returns None when the line is not a native histogram sample, so the
    caller can fall back to regular sample parsing; otherwise returns a
    Sample that carries only a name, optional labels and the native
    histogram value. *suffixes* is a tuple of name suffixes that a native
    histogram sample name must not end with.

    Raises ValueError for an unterminated label set or value struct, a
    suffixed sample name, or a label set lacking the metric name when no
    name precedes it.
    """
    labels_start = _next_unquoted_char(text, '{')

    # Quoted text can appear in metric names, label names and values, so
    # every search below only considers unquoted characters.
    #
    # The first unquoted space or '{' ends the bare metric name.
    pos = _next_unquoted_char(text, ' {')
    if pos == -1:
        return None

    # An opening brace there is the metric label set (which could also hold
    # a UTF-8 metric name); jump ahead to its closing brace.
    has_metric_labels = text[pos] == '{'
    labels_end = -1
    if has_metric_labels:
        labels_end = pos = _next_unquoted_char(text, '}', pos)
        if labels_end == -1:
            raise ValueError

    # No further unquoted '{' means there is no native histogram value.
    nh_value_start = _next_unquoted_char(text, '{', pos + 1)
    if nh_value_start == -1:
        return None

    # An unquoted '#' before that brace means the brace opens an exemplar.
    hash_pos = _next_unquoted_char(text, '#', pos + 1)
    if hash_pos != -1 and hash_pos < nh_value_start:
        return None

    if _next_unquoted_char(text, '}', nh_value_start) == -1:
        raise ValueError

    if has_metric_labels:
        labels = parse_labels(text[labels_start + 1:labels_end], True)
        name = text[:labels_start]
        if name.endswith(suffixes):
            raise ValueError("the sample name of a native histogram with labels should have no suffixes", name)
        if not name:
            # Name might be in the labels
            if '__name__' not in labels:
                raise ValueError
            name = labels.pop('__name__')
            # Edge case: the only "label" is the name definition.
            if not labels:
                labels = None
    else:
        # A UTF-8 name is not possible here; it would have been detected as
        # a label set above.
        labels = None
        name = text[:nh_value_start - 1]
        if name.endswith(suffixes):
            raise ValueError("the sample name of a native histogram should have no suffixes", name)

    nat_hist_value = _parse_nh_struct(text[nh_value_start:])
    return Sample(name, labels, None, None, None, nat_hist_value)
def _parse_nh_struct(text):
    """Build a NativeHistogram from its ``{key:value,...}`` text form.

    Scalar fields are read from ``key:value`` pairs, and the span and delta
    lists from their bracketed forms. A missing scalar field raises
    KeyError; a non-numeric one raises ValueError.
    """
    scalar_pattern = r'(\w+):\s*([^,}]+)'
    spans_re = re.compile(r'(positive_spans|negative_spans):\[(\d+:\d+(,\d+:\d+)*)\]')
    deltas_re = re.compile(r'(positive_deltas|negative_deltas):\[(-?\d+(?:,-?\d+)*)\]')

    fields = dict(re.findall(scalar_pattern, text))
    span_matches = spans_re.findall(text)
    delta_lists = dict(deltas_re.findall(text))

    return NativeHistogram(
        count_value=int(fields['count']),
        sum_value=int(fields['sum']),
        schema=int(fields['schema']),
        zero_threshold=float(fields['zero_threshold']),
        zero_count=int(fields['zero_count']),
        pos_spans=_compose_spans(span_matches, 'positive_spans'),
        neg_spans=_compose_spans(span_matches, 'negative_spans'),
        pos_deltas=_compose_deltas(delta_lists, 'positive_deltas'),
        neg_deltas=_compose_deltas(delta_lists, 'negative_deltas'),
    )
-
- def _compose_spans(span_matches, spans_name):
- """Takes a list of span matches (expected to be a list of tuples) and a string
- (the expected span list name) and processes the list so that the values extracted
- from the span matches can be used to compose a tuple of BucketSpan objects"""
- spans = {}
- for match in span_matches:
- # Extract the key from the match (first element of the tuple).
- key = match[0]
- # Extract the value from the match (second element of the tuple).
- # Split the value string by commas to get individual pairs,
- # split each pair by ':' to get start and end, and convert them to integers.
- value = [tuple(map(int, pair.split(':'))) for pair in match[1].split(',')]
- # Store the processed value in the spans dictionary with the key.
- spans[key] = value
- if spans_name not in spans:
- return None
- out_spans = []
- # Iterate over each start and end tuple in the list of tuples for the specified spans_name.
- for start, end in spans[spans_name]:
- # Compose a BucketSpan object with the start and end values
- # and append it to the out_spans list.
- out_spans.append(BucketSpan(start, end))
- # Convert to tuple
- out_spans_tuple = tuple(out_spans)
- return out_spans_tuple
- def _compose_deltas(deltas, deltas_name):
- """Takes a list of deltas matches (a dictionary) and a string (the expected delta list name),
- and processes its elements to compose a tuple of integers representing the deltas"""
- if deltas_name not in deltas:
- return None
- out_deltas = deltas.get(deltas_name)
- if out_deltas is not None and out_deltas.strip():
- elems = out_deltas.split(',')
- # Convert each element in the list elems to an integer
- # after stripping whitespace and create a tuple from these integers.
- out_deltas_tuple = tuple(int(x.strip()) for x in elems)
- return out_deltas_tuple
-
- def _group_for_sample(sample, name, typ):
- if typ == 'info':
- # We can't distinguish between groups for info metrics.
- return {}
- if typ == 'summary' and sample.name == name:
- d = sample.labels.copy()
- del d['quantile']
- return d
- if typ == 'stateset':
- d = sample.labels.copy()
- del d[name]
- return d
- if typ in ['histogram', 'gaugehistogram'] and sample.name == name + '_bucket':
- d = sample.labels.copy()
- del d['le']
- return d
- return sample.labels
- def _check_histogram(samples, name):
- group = None
- timestamp = None
- def do_checks():
- if bucket != float('+Inf'):
- raise ValueError("+Inf bucket missing: " + name)
- if count is not None and value != count:
- raise ValueError("Count does not match +Inf value: " + name)
- if has_sum and count is None:
- raise ValueError("_count must be present if _sum is present: " + name)
- if has_gsum and count is None:
- raise ValueError("_gcount must be present if _gsum is present: " + name)
- if not (has_sum or has_gsum) and count is not None:
- raise ValueError("_sum/_gsum must be present if _count is present: " + name)
- if has_negative_buckets and has_sum:
- raise ValueError("Cannot have _sum with negative buckets: " + name)
- if not has_negative_buckets and has_negative_gsum:
- raise ValueError("Cannot have negative _gsum with non-negative buckets: " + name)
- for s in samples:
- suffix = s.name[len(name):]
- g = _group_for_sample(s, name, 'histogram')
- if len(suffix) == 0:
- continue
- if g != group or s.timestamp != timestamp:
- if group is not None:
- do_checks()
- count = None
- bucket = None
- has_negative_buckets = False
- has_sum = False
- has_gsum = False
- has_negative_gsum = False
- value = 0
- group = g
- timestamp = s.timestamp
- if suffix == '_bucket':
- b = float(s.labels['le'])
- if b < 0:
- has_negative_buckets = True
- if bucket is not None and b <= bucket:
- raise ValueError("Buckets out of order: " + name)
- if s.value < value:
- raise ValueError("Bucket values out of order: " + name)
- bucket = b
- value = s.value
- elif suffix in ['_count', '_gcount']:
- count = s.value
- elif suffix in ['_sum']:
- has_sum = True
- elif suffix in ['_gsum']:
- has_gsum = True
- if s.value < 0:
- has_negative_gsum = True
- if group is not None:
- do_checks()
def text_fd_to_metric_families(fd):
    """Parse OpenMetrics text format from a file-like object.

    This is a laxer parser than the main Go parser,
    so successful parsing does not imply that the parsed
    text meets the specification.

    Args:
        fd: an iterable yielding the lines of the exposition as strings,
            such as an open text file or io.StringIO. A single trailing
            newline on each line is stripped.

    Yields Metric's, one per metric family, each emitted once the next
    family starts or the input ends.

    Raises:
        ValueError: on a blank line, content after ``# EOF``, a missing
            ``# EOF``, malformed or duplicated metadata, clashing metric
            names, or samples that fail the per-type checks below.
    """
    name = None
    allowed_names = []
    eof = False

    seen_names = set()
    type_suffixes = {
        'counter': ['_total', '_created'],
        'summary': ['', '_count', '_sum', '_created'],
        'histogram': ['_count', '_sum', '_bucket', '_created'],
        'gaugehistogram': ['_gcount', '_gsum', '_bucket'],
        'info': ['_info'],
    }

    def build_metric(name, documentation, typ, unit, samples):
        # Finalise one metric family: reserve its sample names, validate the
        # unit and histogram consistency, then build the Metric.
        if typ is None:
            typ = 'unknown'
        for suffix in set(type_suffixes.get(typ, []) + [""]):
            if name + suffix in seen_names:
                raise ValueError("Clashing name: " + name + suffix)
            seen_names.add(name + suffix)
        if documentation is None:
            documentation = ''
        if unit is None:
            unit = ''
        if unit and not name.endswith("_" + unit):
            raise ValueError("Unit does not match metric name: " + name)
        if unit and typ in ['info', 'stateset']:
            raise ValueError("Units not allowed for this metric type: " + name)
        if typ in ['histogram', 'gaugehistogram']:
            _check_histogram(samples, name)
        _validate_metric_name(name)
        metric = Metric(name, documentation, typ, unit)
        # TODO: check labelvalues are valid utf8
        metric.samples = samples
        return metric

    is_nh = False
    typ = None
    for line in fd:
        # Slice rather than index so that an empty string (possible when fd
        # is a plain iterable rather than a file) reaches the blank-line
        # check below instead of raising IndexError.
        if line[-1:] == '\n':
            line = line[:-1]

        if eof:
            raise ValueError("Received line after # EOF: " + line)

        if not line:
            raise ValueError("Received blank line")

        if line == '# EOF':
            eof = True
        elif line.startswith('#'):
            # Metadata line: "# HELP|TYPE|UNIT <name> <text>".
            parts = _split_quoted(line, ' ', 3)
            if len(parts) < 4:
                raise ValueError("Invalid line: " + line)
            candidate_name, quoted = _unquote_unescape(parts[2])
            if not quoted and not _is_valid_legacy_metric_name(candidate_name):
                raise ValueError("Invalid metric name: " + line)
            if candidate_name == name and samples:
                raise ValueError("Received metadata after samples: " + line)
            if candidate_name != name:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # New metric
                name = candidate_name
                unit = None
                typ = None
                documentation = None
                group = None
                seen_groups = set()
                group_timestamp = None
                group_timestamp_samples = set()
                samples = []
                allowed_names = [candidate_name]

            if parts[1] == 'HELP':
                if documentation is not None:
                    raise ValueError("More than one HELP for metric: " + line)
                documentation = _unescape_help(parts[3])
            elif parts[1] == 'TYPE':
                if typ is not None:
                    raise ValueError("More than one TYPE for metric: " + line)
                typ = parts[3]
                if typ == 'untyped':
                    raise ValueError("Invalid TYPE for metric: " + line)
                allowed_names = [name + n for n in type_suffixes.get(typ, [''])]
            elif parts[1] == 'UNIT':
                if unit is not None:
                    raise ValueError("More than one UNIT for metric: " + line)
                unit = parts[3]
            else:
                raise ValueError("Invalid line: " + line)
        else:
            if typ == 'histogram':
                # set to true to account for native histograms naming exceptions/sanitizing differences
                is_nh = True
                sample = _parse_nh_sample(line, tuple(type_suffixes['histogram']))
                # It's not a native histogram
                if sample is None:
                    is_nh = False
                    sample = _parse_sample(line)
            else:
                is_nh = False
                sample = _parse_sample(line)
            if sample.name not in allowed_names and not is_nh:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # Start an unknown metric.
                candidate_name, quoted = _unquote_unescape(sample.name)
                if not quoted and not _is_valid_legacy_metric_name(candidate_name):
                    raise ValueError("Invalid metric name: " + line)
                name = candidate_name
                documentation = None
                unit = None
                typ = 'unknown'
                samples = []
                group = None
                group_timestamp = None
                group_timestamp_samples = set()
                seen_groups = set()
                allowed_names = [sample.name]

            if typ == 'stateset' and name not in sample.labels:
                raise ValueError("Stateset missing label: " + line)
            if (name + '_bucket' == sample.name
                    and (sample.labels.get('le', "NaN") == "NaN"
                         or _isUncanonicalNumber(sample.labels['le']))):
                raise ValueError("Invalid le label: " + line)
            if (name + '_bucket' == sample.name
                    and (not isinstance(sample.value, int) and not sample.value.is_integer())):
                raise ValueError("Bucket value must be an integer: " + line)
            if ((name + '_count' == sample.name or name + '_gcount' == sample.name)
                    and (not isinstance(sample.value, int) and not sample.value.is_integer())):
                raise ValueError("Count value must be an integer: " + line)
            if (typ == 'summary' and name == sample.name
                    and (not (0 <= float(sample.labels.get('quantile', -1)) <= 1)
                         or _isUncanonicalNumber(sample.labels['quantile']))):
                raise ValueError("Invalid quantile label: " + line)

            if not is_nh:
                # Samples of one group must be contiguous, and timestamps
                # within a group must be consistently present and must not
                # go backwards.
                g = tuple(sorted(_group_for_sample(sample, name, typ).items()))
                if group is not None and g != group and g in seen_groups:
                    raise ValueError("Invalid metric grouping: " + line)
                if group is not None and g == group:
                    if (sample.timestamp is None) != (group_timestamp is None):
                        raise ValueError("Mix of timestamp presence within a group: " + line)
                    if group_timestamp is not None and group_timestamp > sample.timestamp and typ != 'info':
                        raise ValueError("Timestamps went backwards within a group: " + line)
                else:
                    group_timestamp_samples = set()

                series_id = (sample.name, tuple(sorted(sample.labels.items())))
                if sample.timestamp != group_timestamp or series_id not in group_timestamp_samples:
                    # Not a duplicate due to timestamp truncation.
                    samples.append(sample)
                group_timestamp_samples.add(series_id)

                group = g
                group_timestamp = sample.timestamp
                seen_groups.add(g)
            else:
                samples.append(sample)

            if typ == 'stateset' and sample.value not in [0, 1]:
                raise ValueError("Stateset samples can only have values zero and one: " + line)
            if typ == 'info' and sample.value != 1:
                raise ValueError("Info samples can only have value one: " + line)
            if typ == 'summary' and name == sample.name and sample.value < 0:
                raise ValueError("Quantile values cannot be negative: " + line)
            if sample.name[len(name):] in ['_total', '_sum', '_count', '_bucket', '_gcount', '_gsum'] and math.isnan(
                    sample.value):
                raise ValueError("Counter-like samples cannot be NaN: " + line)
            if sample.name[len(name):] in ['_total', '_sum', '_count', '_bucket', '_gcount'] and sample.value < 0:
                raise ValueError("Counter-like samples cannot be negative: " + line)
            if sample.exemplar and not (
                    (typ in ['histogram', 'gaugehistogram'] and sample.name.endswith('_bucket'))
                    or (typ in ['counter'] and sample.name.endswith('_total'))):
                raise ValueError("Invalid line only histogram/gaugehistogram buckets and counters can have exemplars: " + line)

    if name is not None:
        yield build_metric(name, documentation, typ, unit, samples)

    if not eof:
        raise ValueError("Missing # EOF at end")
|