8000 Add deadlock detection · suligap/client_python@91d9893 · GitHub
[go: up one dir, main page]

Skip to content

Commit 91d9893

Browse files
committed
Add deadlock detection
Detect deadlocks during the library misuse, eg. by injecting code into the critical sections that itself might want to obtain the relevant lock. A follow up to promethe 10000 us#1076. Signed-off-by: Przemysław Suliga <mail@suligap.net>
1 parent ef95c4b commit 91d9893

File tree

6 files changed

+81
-13
lines changed

6 files changed

+81
-13
lines changed

prometheus_client/errors.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
class PrometheusClientRuntimeError(RuntimeError):
3+
pass

prometheus_client/metrics.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import os
2-
from threading import Lock
32
import time
43
import types
54
from typing import (
@@ -13,7 +12,7 @@
1312
from .metrics_core import Metric
1413
from .registry import Collector, CollectorRegistry, REGISTRY
1514
from .samples import Exemplar, Sample
16-
from .utils import floatToGoString, INF 10000
15+
from .utils import floatToGoString, INF, WarnLock
1716
from .validation import (
1817
_validate_exemplar, _validate_labelnames, _validate_metric_name,
1918
)
@@ -120,7 +119,7 @@ def __init__(self: T,
120119

121120
if self._is_parent():
122121
# Prepare the fields needed for child metrics.
123-
self._lock = Lock()
122+
self._lock = WarnLock()
124123
self._metrics: Dict[Sequence[str], T] = {}
125124

126125
if self._is_observable():
@@ -673,7 +672,7 @@ class Info(MetricWrapperBase):
673672

674673
def _metric_init(self):
675674
self._labelname_set = set(self._labelnames)
676-
self._lock = Lock()
675+
self._lock = WarnLock()
677676
self._value = {}
678677

679678
def info(self, val: Dict[str, str]) -> None:
@@ -735,7 +734,7 @@ def __init__(self,
735734

736735
def _metric_init(self) -> None:
737736
self._value = 0
738-
self._lock = Lock()
737+
self._lock = WarnLock()
739738

740739
def state(self, state: str) -> None:
741740
"""Set enum metric state."""

prometheus_client/registry.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from abc import ABC, abstractmethod
22
import copy
3-
from threading import Lock
43
from typing import Dict, Iterable, List, Optional
54

65
from .metrics_core import Metric
6+
from .utils import WarnLock
77

88

99
# Ideally this would be a Protocol, but Protocols are only available in Python >= 3.8.
@@ -30,7 +30,7 @@ def __init__(self, auto_describe: bool = False, target_info: Optional[Dict[str,
3030
self._collector_to_names: Dict[Collector, List[str]] = {}
3131
self._names_to_collectors: Dict[str, Collector] = {}
3232
self._auto_describe = auto_describe
33-
self._lock = Lock()
33+
self._lock = WarnLock()
3434
self._target_info: Optional[Dict[str, str]] = {}
3535
self.set_target_info(target_info)
3636

prometheus_client/utils.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
import math
2+
from threading import Lock, RLock
3+
4+
from .errors import PrometheusClientRuntimeError
25

36
INF = float("inf")
47
MINUS_INF = float("-inf")
@@ -22,3 +25,40 @@ def floatToGoString(d):
2225
mantissa = f'{s[0]}.{s[1:dot]}{s[dot + 1:]}'.rstrip('0.')
2326
return f'{mantissa}e+0{dot - 1}'
2427
return s
28+
29+
30+
class WarnLock:
31+
"""A wrapper around RLock and Lock that prevents deadlocks.
32+
33+
Raises a RuntimeError when it detects attempts to re-enter the critical
34+
section from a single thread. Intended to be used as a context manager.
35+
"""
36+
error_msg = (
37+
'Attempt to enter a non reentrant context from a single thread.'
38+
' It is possible that the client code is trying to register or update'
39+
' metrics from within metric registration code or from a signal handler'
40+
' while metrics are being registered or updated.'
41+
' This is unsafe and cannot be allowed. It would result in a deadlock'
42+
' if this exception was not raised.'
43+
)
44+
45+
def __init__(self):
46+
self._rlock = RLock()
47+
self._lock = Lock()
48+
49+
def __enter__(self):
50+
self._rlock.acquire()
51+
if not self._lock.acquire(blocking=False):
52+
self._rlock.release()
53+
raise PrometheusClientRuntimeError(self.error_msg)
54+
55+
def __exit__(self, exc_type, exc_value, traceback):
56+
self._lock.release()
57+
self._rlock.release()
58+
59+
def _locked(self):
60+
# For use in tests.
61+
if self._rlock.acquire(blocking=False):
62+
self._rlock.release()
63+
return False
64+
return True

prometheus_client/values.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import os
2-
from threading import Lock
32
import warnings
43

54
from .mmap_dict import mmap_key, MmapedDict
5+
from .utils import WarnLock
66

77

88
class MutexValue:
@@ -13,7 +13,7 @@ class MutexValue:
1313
def __init__(self, typ, metric_name, name, labelnames, labelvalues, help_text, **kwargs):
1414
self._value = 0.0
1515
self._exemplar = None
16-
self._lock = Lock()
16+
self._lock = WarnLock()
1717

1818
def inc(self, amount):
1919
with self._lock:
@@ -47,10 +47,9 @@ def MultiProcessValue(process_identifier=os.getpid):
4747
files = {}
4848
values = []
4949
pid = {'value': process_identifier()}
50-
# Use a single global lock when in multi-processing mode
51-
# as we presume this means there is no threading going on.
50+
# Use a single global lock when in multi-processing mode.
5251
# This avoids the need to also have mutexes in __MmapDict.
53-
lock = Lock()
52+
lock = WarnLock()
5453

5554
class MmapedValue:
5655
"""A float protected by a mutex backed by a per-process mmaped file."""

tests/test_core.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
StateSetMetricFamily, Summary, SummaryMetricFamily, UntypedMetricFamily,
1414
)
1515
from prometheus_client.decorator import getargspec
16+
from prometheus_client.errors import PrometheusClientRuntimeError
1617
from prometheus_client.metrics import _get_use_created
1718
from prometheus_client.validation import (
1819
disable_legacy_validation, enable_legacy_validation,
@@ -134,6 +135,19 @@ def test_exemplar_too_long(self):
134135
'y123456': '7+15 characters',
135136
})
136137

138+
def test_single_thread_deadlock_detection(self):
139+
counter = self.counter
140+
141+
class Tracked(float):
142+
def __radd__(self, other):
143+
counter.inc(10)
144+
return self + other
145+
146+
expected_msg = 'Attempt to enter a non reentrant context from a single thread.'
147+
self.assertRaisesRegex(
148+
PrometheusClientRuntimeError, expected_msg, counter.inc, Tracked(100)
149+
)
150+
137151

138152
class TestDisableCreated(unittest.TestCase):
139153
def setUp(self):
@@ -1004,7 +1018,20 @@ def test_restricted_registry_does_not_yield_while_locked(self):
10041018
m = Metric('target', 'Target metadata', 'info')
10051019
m.samples = [Sample('target_info', {'foo': 'bar'}, 1)]
10061020
for _ in registry.restricted_registry(['target_info', 's_sum']).collect():
1007-
self.assertFalse(registry._lock.locked())
1021+
self.assertFalse(registry._lock._locked())
1022+
1023+
def test_registry_deadlock_detection(self):
1024+
registry = CollectorRegistry(auto_describe=True)
1025+
1026+
class RecursiveCollector:
1027+
def collect(self):
1028+
Counter('x', 'help', registry=registry)
1029+
return [CounterMetricFamily('c_total', 'help', value=1)]
1030+
1031+
expected_msg = 'Attempt to enter a non reentrant context from a single thread.'
1032+
self.assertRaisesRegex(
1033+
PrometheusClientRuntimeError, expected_msg, registry.register, RecursiveCollector()
1034+
)
10081035

10091036

10101037
if __name__ == '__main__':

0 commit comments

Comments
 (0)
0