optimize openmetrics text parsing (~4x perf) (#402) · rayandas/client_python@6740213 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6740213

Browse files
ahmed-mez authored and brian-brazil committed
optimize openmetrics text parsing (~4x perf) (prometheus#402)
Signed-off-by: Ahmed Mezghani <ahmed.mezghani@outlook.com>
1 parent 31f5557 commit 6740213

File tree

2 files changed

+247
-31
lines changed

2 files changed

+247
-31
lines changed

prometheus_client/openmetrics/parser.py

Lines changed: 151 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from __future__ import unicode_literals
44

55
import math
6+
import re
67

78
from ..metrics_core import Metric, METRIC_LABEL_NAME_RE
89
from ..samples import Exemplar, Sample, Timestamp
@@ -24,6 +25,24 @@ def text_string_to_metric_families(text):
2425
yield metric_family
2526

2627

28+
ESCAPE_SEQUENCES = {
29+
'\\\\': '\\',
30+
'\\n': '\n',
31+
'\\"': '"',
32+
}
33+
34+
35+
def _replace_escape_sequence(match):
36+
return ESCAPE_SEQUENCES[match.group(0)]
37+
38+
39+
ESCAPING_RE = re.compile(r'\\[\\n"]')
40+
41+
42+
def _replace_escaping(s):
43+
return ESCAPING_RE.sub(_replace_escape_sequence, s)
44+
45+
2746
def _unescape_help(text):
2847
result = []
2948
slash = False
@@ -83,14 +102,23 @@ def _parse_timestamp(timestamp):
83102
return ts
84103

85104

86-
def _parse_labels(it, text):
105+
def _is_character_escaped(s, charpos):
106+
num_bslashes = 0
107+
while (charpos > num_bslashes and
108+
s[charpos - 1 - num_bslashes] == '\\'):
109+
num_bslashes += 1
110+
return num_bslashes % 2 == 1
111+
112+
113+
def _parse_labels_with_state_machine(text):
87114
# The { has already been parsed.
88115
state = 'startoflabelname'
89116
labelname = []
90117
labelvalue = []
91118
labels = {}
119+
labels_len = 0
92120

93-
for char in it:
121+
for char in text:
94122
if state == 'startoflabelname':
95123
if char == '}':
96124
state = 'endoflabels'
@@ -141,37 +169,123 @@ def _parse_labels(it, text):
141169
break
142170
else:
143171
raise ValueError("Invalid line: " + text)
144-
return labels
172+
labels_len += 1
173+
return labels, labels_len
174+
175+
176+
def _parse_labels(text):
177+
labels = {}
178+
179+
# Raise error if we don't have valid labels
180+
if text and "=" not in text:
181+
raise ValueError
182+
183+
# Copy original labels
184+
sub_labels = text
185+
try:
186+
# Process one label at a time
187+
while sub_labels:
188+
# The label name is before the equal
189+
value_start = sub_labels.index("=")
190+
label_name = sub_labels[:value_start]
191+
sub_labels = sub_labels[value_start + 1:]
192+
193+
# Check for missing quotes
194+
if not sub_labels or sub_labels[0] != '"':
195+
raise ValueError
196+
197+
# The first quote is guaranteed to be after the equal
198+
value_substr = sub_labels[1:]
199+
200+
# Check for extra commas
201+
if not label_name or label_name[0] == ',':
202+
raise ValueError
203+
if not value_substr or value_substr[-1] == ',':
204+
raise ValueError
205+
206+
# Find the last unescaped quote
207+
i = 0
208+
while i < len(value_substr):
209+
i = value_substr.index('"', i)
210+
if not _is_character_escaped(value_substr[:i], i):
211+
break
212+
i += 1
213+
214+
# The label value is inbetween the first and last quote
215+
quote_end = i + 1
216+
label_value = sub_labels[1:quote_end]
217+
# Replace escaping if needed
218+
if "\\" in label_value:
219+
label_value = _replace_escaping(label_value)
220+
labels[label_name] = label_value
221+
222+
# Remove the processed label from the sub-slice for next iteration
223+
sub_labels = sub_labels[quote_end + 1:]
224+
if sub_labels.startswith(","):
225+
next_comma = 1
226+
else:
227+
next_comma = 0
228+
sub_labels = sub_labels[next_comma:]
229+
230+
# Check for missing commas
231+
if sub_labels and next_comma == 0:
232+
raise ValueError
233+
234+
return labels
235+
236+
except ValueError:
237+
raise ValueError("Invalid labels: " + text)
145238

146239

147240
def _parse_sample(text):
148-
name = []
149-
value = []
241+
# Detect the labels in the text
242+
label_start = text.find("{")
243+
if label_start == -1:
244+
# We don't have labels
245+
name_end = text.index(" ")
246+
name = text[:name_end]
247+
# Parse the remaining text after the name
248+
remaining_text = text[name_end + 1:]
249+
value, timestamp, exemplar = _parse_remaining_text(remaining_text)
250+
return Sample(name, {}, value, timestamp, exemplar)
251+
# The name is before the labels
252+
name = text[:label_start]
253+
seperator = " # "
254+
if text.count(seperator) == 0:
255+
# Line doesn't contain an exemplar
256+
# We can use `rindex` to find `label_end`
257+
label_end = text.rindex("}")
258+
label = text[label_start + 1:label_end]
259+
labels = _parse_labels(label)
260+
else:
261+
# Line potentially contains an exemplar
262+
# Fallback to parsing labels with a state machine
263+
labels, labels_len = _parse_labels_with_state_machine(text[label_start + 1:])
264+
label_end = labels_len + len(name)
265+
# Parsing labels succeeded, continue parsing the remaining text
266+
remaining_text = text[label_end + 2:]
267+
value, timestamp, exemplar = _parse_remaining_text(remaining_text)
268+
return Sample(name, labels, value, timestamp, exemplar)
269+
270+
271+
def _parse_remaining_text(text):
272+
split_text = text.split(" ", 1)
273+
val = _parse_value(split_text[0])
274+
if len(split_text) == 1:
275+
# We don't have timestamp or exemplar
276+
return val, None, None
277+
150278
timestamp = []
151-
labels = {}
152279
exemplar_value = []
153280
exemplar_timestamp = []
154281
exemplar_labels = None
155282

156-
state = 'name'
283+
state = 'timestamp'
284+
text = split_text[1]
157285

158286
it = iter(text)
159287
for char in it:
160-
if state == 'name':
161-
if char == '{':
162-
labels = _parse_labels(it, text)
163-
# Space has already been parsed.
164-
state = 'value'
165-
elif char == ' ':
166-
state = 'value'
167-
else:
168-
name.append(char)
169-
elif state == 'value':
170-
if char == ' ':
171-
state = 'timestamp'
172-
else:
173-
value.append(char)
174-
elif state == 'timestamp':
288+
if state == 'timestamp':
175289
if char == '#' and not timestamp:
176290
state = 'exemplarspace'
177291
elif char == ' ':
@@ -190,13 +304,23 @@ def _parse_sample(text):
190304
raise ValueError("Invalid line: " + text)
191305
elif state == 'exemplarstartoflabels':
192306
if char == '{':
193-
exemplar_labels = _parse_labels(it, text)
194-
# Space has already been parsed.
307+
label_start, label_end = text.index("{"), text.rindex("}")
308+
exemplar_labels = _parse_labels(text[label_start + 1:label_end])
309+
state = 'exemplarparsedlabels'
310+
else:
311+
raise ValueError("Invalid line: " + text)
312+
elif state == 'exemplarparsedlabels':
313+
if char == '}':
314+
state = 'exemplarvaluespace'
315+
elif state == 'exemplarvaluespace':
316+
if char == ' ':
195317
state = 'exemplarvalue'
196318
else:
197319
raise ValueError("Invalid line: " + text)
198320
elif state == 'exemplarvalue':
199-
if char == ' ':
321+
if char == ' ' and not exemplar_value:
322+
raise ValueError("Invalid line: " + text)
323+
elif char == ' ':
200324
state = 'exemplartimestamp'
201325
else:
202326
exemplar_value.append(char)
@@ -212,13 +336,9 @@ def _parse_sample(text):
212336
raise ValueError("Invalid line: " + text)
213337

214338
# Incomplete exemplar.
215-
if state in ['exemplarhash', 'exemplarspace', 'exemplarstartoflabels']:
339+
if state in ['exemplarhash', 'exemplarspace', 'exemplarstartoflabels', 'exemplarparsedlabels']:
216340
raise ValueError("Invalid line: " + text)
217341

218-
if not value:
219-
raise ValueError("Invalid line: " + text)
220-
value = ''.join(value)
221-
val = _parse_value(value)
222342
ts = _parse_timestamp(timestamp)
223343
exemplar = None
224344
if exemplar_labels is not None:
@@ -231,7 +351,7 @@ def _parse_sample(text):
231351
_parse_timestamp(exemplar_timestamp),
232352
)
233353

234-
return Sample(''.join(name), labels, val, ts, exemplar)
354+
return val, ts, exemplar
235355

236356

237357
def _group_for_sample(sample, name, typ):

tests/openmetrics/test_parser.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,93 @@ def test_timestamps(self):
374374
b.add_metric([], 2, timestamp=Timestamp(1234567890, 0))
375375
self.assertEqual([a, b], list(families))
376376

377+
def test_hash_in_label_value(self):
378+
families = text_string_to_metric_families("""# TYPE a counter
379+
# HELP a help
380+
a_total{foo="foo # bar"} 1
381+
a_total{foo="} foo # bar # "} 1
382+
# EOF
383+
""")
384+
a = CounterMetricFamily("a", "help", labels=["foo"])
385+
a.add_metric(["foo # bar"], 1)
386+
a.add_metric(["} foo # bar # "], 1)
387+
self.assertEqual([a], list(families))
388+
389+
def test_exemplars_with_hash_in_label_values(self):
390+
families = text_string_to_metric_families("""# TYPE a histogram
391+
# HELP a help
392+
a_bucket{le="1.0",foo="bar # "} 0 # {a="b",foo="bar # bar"} 0.5
393+
a_bucket{le="2.0",foo="bar # "} 2 # {a="c",foo="bar # bar"} 0.5
394+
a_bucket{le="+Inf",foo="bar # "} 3 # {a="d",foo="bar # bar"} 4
395+
# EOF
396+
""")
397+
hfm = HistogramMetricFamily("a", "help")
398+
hfm.add_sample("a_bucket", {"le": "1.0", "foo": "bar # "}, 0.0, None, Exemplar({"a": "b", "foo": "bar # bar"}, 0.5))
399+
hfm.add_sample("a_bucket", {"le": "2.0", "foo": "bar # "}, 2.0, None, Exemplar({"a": "c", "foo": "bar # bar"}, 0.5))
400+
hfm.add_sample("a_bucket", {"le": "+Inf", "foo": "bar # "}, 3.0, None, Exemplar({"a": "d", "foo": "bar # bar"}, 4))
401+
self.assertEqual([hfm], list(families))
402+
403+
@unittest.skipIf(sys.version_info < (3, 3), "Test requires Python 3.3+.")
404+
def test_fallback_to_state_machine_label_parsing(self):
405+
from unittest.mock import patch
406+
from prometheus_client.openmetrics.parser import _parse_sample
407+
408+
parse_sample_function = "prometheus_client.openmetrics.parser._parse_sample"
409+
parse_labels_function = "prometheus_client.openmetrics.parser._parse_labels"
410+
parse_remaining_function = "prometheus_client.openmetrics.parser._parse_remaining_text"
411+
state_machine_function = "prometheus_client.openmetrics.parser._parse_labels_with_state_machine"
412+
413+
parse_sample_return_value = Sample("a_total", {"foo": "foo # bar"}, 1)
414+
with patch(parse_sample_function, return_value=parse_sample_return_value) as mock:
415+
families = text_string_to_metric_families("""# TYPE a counter
416+
# HELP a help
417+
a_total{foo="foo # bar"} 1
418+
# EOF
419+
""")
420+
a = CounterMetricFamily("a", "help", labels=["foo"])
421+
a.add_metric(["foo # bar"], 1)
422+
self.assertEqual([a], list(families))
423+
mock.assert_called_once_with('a_total{foo="foo # bar"} 1')
424+
425+
# First fallback case
426+
state_machine_return_values = [{"foo": "foo # bar"}, len('foo="foo # bar"}')]
427+
parse_remaining_values = [1, None, None]
428+
with patch(parse_labels_function) as mock1:
429+
with patch(state_machine_function, return_value=state_machine_return_values) as mock2:
430+
with patch(parse_remaining_function, return_value=parse_remaining_values) as mock3:
431+
sample = _parse_sample('a_total{foo="foo # bar"} 1')
432+
s = Sample("a_total", {"foo": "foo # bar"}, 1)
433+
self.assertEqual(s, sample)
434+
mock1.assert_not_called()
435+
mock2.assert_called_once_with('foo="foo # bar"} 1')
436+
mock3.assert_called_once_with('1')
437+
438+
# Second fallback case
439+
state_machine_return_values = [{"le": "1.0"}, len('le="1.0"}')]
440+
parse_remaining_values = [0.0, Timestamp(123, 0), Exemplar({"a": "b"}, 0.5)]
441+
with patch(parse_labels_function) as mock1:
442+
with patch(state_machine_function, return_value=state_machine_return_values) as mock2:
443+
with patch(parse_remaining_function, return_value=parse_remaining_values) as mock3:
444+
sample = _parse_sample('a_bucket{le="1.0"} 0 123 # {a="b"} 0.5')
445+
s = Sample("a_bucket", {"le": "1.0"}, 0.0, Timestamp(123, 0), Exemplar({"a": "b"}, 0.5))
446+
self.assertEqual(s, sample)
447+
mock1.assert_not_called()
448+
mock2.assert_called_once_with('le="1.0"} 0 123 # {a="b"} 0.5')
449+
mock3.assert_called_once_with('0 123 # {a="b"} 0.5')
450+
451+
# No need to fallback case
452+
parse_labels_return_values = {"foo": "foo#bar"}
453+
parse_remaining_values = [1, None, None]
454+
with patch(parse_labels_function, return_value=parse_labels_return_values) as mock1:
455+
with patch(state_machine_function) as mock2:
456+
with patch(parse_remaining_function, return_value=parse_remaining_values) as mock3:
457+
sample = _parse_sample('a_total{foo="foo#bar"} 1')
458+
s = Sample("a_total", {"foo": "foo#bar"}, 1)
459+
self.assertEqual(s, sample)
460+
mock1.assert_called_once_with('foo="foo#bar"')
461+
mock2.assert_not_called()
462+
mock3.assert_called_once_with('1')
463+
377464
@unittest.skipIf(sys.version_info < (2, 7), "Test requires Python 2.7+.")
378465
def test_roundtrip(self):
379466
text = """# HELP go_gc_duration_seconds A summary of the GC invocation durations.
@@ -453,6 +540,12 @@ def test_invalid_input(self):
453540
('a{a=1} 1\n# EOF\n'),
454541
('a{a="1} 1\n# EOF\n'),
455542
('a{a=\'1\'} 1\n# EOF\n'),
543+
# Missing equal or label value.
544+
('a{a} 1\n# EOF\n'),
545+
('a{a"value"} 1\n# EOF\n'),
546+
('a{a""} 1\n# EOF\n'),
547+
('a{a=} 1\n# EOF\n'),
548+
('a{a="} 1\n# EOF\n'),
456549
# Missing or extra commas.
457550
('a{a="1"b="2"} 1\n# EOF\n'),
458551
('a{a="1",,b="2"} 1\n# EOF\n'),
@@ -523,6 +616,9 @@ def test_invalid_input(self):
523616
('# TYPE a histogram\na_sum 1 # {a="b"} 0.5\n# EOF\n'),
524617
('# TYPE a gaugehistogram\na_sum 1 # {a="b"} 0.5\n# EOF\n'),
525618
('# TYPE a_bucket gauge\na_bucket 1 # {a="b"} 0.5\n# EOF\n'),
619+
# Exemplars on unallowed metric types.
620+
('# TYPE a counter\na_total 1 # {a="b"} 1\n# EOF\n'),
621+
('# TYPE a gauge\na 1 # {a="b"} 1\n# EOF\n'),
526622
# Bad stateset/info values.
527623
('# TYPE a stateset\na 2\n# EOF\n'),
528624
('# TYPE a info\na 2\n# EOF\n'),

0 commit comments

Comments
 (0)
0