Skip to content

Commit 18ca85e

Browse files
fix(tracer): use RLOCK in span aggregator (#6325)
This fix resolves an issue where sqlalchemy rollbacks could intermittently cause deadlocks due SpanAggregator's `on_span_start` using a Lock. This fix was put behind a flag `DD_TRACE_SPAN_AGGREGATOR_RLOCK` which defaults to false due to possible performance impacts. While our benchmarks don't show anything major, and we suspect this change will not impact performance much at all, we want to be cautious and test on internal services before making it true by default. We plan on gathering feedback and testing on internal services, as long as we don't see a negative impact, we plan on making this the new default. Should fix: #4978 ## Checklist - [x] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`. - [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). - [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment. - [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Co-authored-by: Brett Langdon <[email protected]>
1 parent 680915b commit 18ca85e

File tree

5 files changed

+29
-2
lines changed

5 files changed

+29
-2
lines changed

ddtrace/internal/processor/trace.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import abc
22
from collections import defaultdict
3-
import threading
3+
from threading import Lock
4+
from threading import RLock
45
from typing import Dict
56
from typing import Iterable
67
from typing import List
78
from typing import Optional
9+
from typing import Union
810

911
import attr
1012
import six
@@ -180,7 +182,10 @@ class _Trace(object):
180182
type=DefaultDict[int, "_Trace"],
181183
repr=False,
182184
)
183-
_lock = attr.ib(init=False, factory=threading.Lock, repr=False)
185+
if config._span_aggregator_rlock:
186+
_lock = attr.ib(init=False, factory=RLock, repr=False, type=Union[RLock, Lock])
187+
else:
188+
_lock = attr.ib(init=False, factory=Lock, repr=False, type=Union[RLock, Lock])
184189
# Tracks the number of spans created and tags each count with the api that was used
185190
# ex: otel api, opentracing api, datadog api
186191
_span_metrics = attr.ib(

ddtrace/settings/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ def __init__(self):
320320
self._propagation_style_inject.append(_PROPAGATION_STYLE_W3C_TRACECONTEXT)
321321

322322
self._ddtrace_bootstrapped = False
323+
self._span_aggregator_rlock = asbool(os.getenv("DD_TRACE_SPAN_AGGREGATOR_RLOCK", False))
323324

324325
def __getattr__(self, name):
325326
if name not in self._config:

docs/configuration.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,11 @@ The following environment variables for the tracer are supported:
413413
type: Boolean
414414
default: True
415415
description: Send query strings in http.url tag in http server integrations.
416+
417+
DD_TRACE_SPAN_AGGREGATOR_RLOCK:
418+
type: Boolean
419+
default: False
420+
description: Whether the ``SpanAggregator`` should use an RLock or a Lock.
416421

417422
DD_IAST_ENABLED:
418423
type: Boolean
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
issues:
3+
- |
4+
sqlalchemy: sqlalchemy rollbacks can intermittently cause deadlocks in some cases.
5+
If experiencing this issue, set ``DD_TRACE_SPAN_AGGREGATOR_RLOCK=True``.
6+
After testing and feedback we intend to make True the default value.

tests/contrib/gunicorn/test_gunicorn.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def _gunicorn_settings_factory(
5757
debug_mode=False, # type: bool
5858
dd_service=None, # type: Optional[str]
5959
schema_version=None, # type: Optional[str]
60+
rlock=False, # type: bool
6061
):
6162
# type: (...) -> GunicornServerSettings
6263
"""Factory for creating gunicorn settings with simple defaults if settings are not defined."""
@@ -73,6 +74,8 @@ def _gunicorn_settings_factory(
7374
env["DD_SERVICE"] = dd_service
7475
if schema_version is not None:
7576
env["DD_TRACE_SPAN_ATTRIBUTE_SCHEMA"] = schema_version
77+
if rlock is not None:
78+
env["DD_TRACE_SPAN_AGGREGATOR_RLOCK"] = "true"
7679
return GunicornServerSettings(
7780
env=env,
7881
directory=directory,
@@ -164,6 +167,12 @@ def gunicorn_server(gunicorn_server_settings, tmp_path):
164167
debug_mode=True,
165168
enable_module_cloning=True,
166169
)
170+
SETTINGS_GEVENT_SPANAGGREGATOR_RLOCK = _gunicorn_settings_factory(
171+
worker_class="gevent",
172+
use_ddtracerun=False,
173+
import_auto_in_app=True,
174+
rlock=True,
175+
)
167176

168177

169178
@pytest.mark.skipif(sys.version_info >= (3, 11), reason="Gunicorn is only supported up to 3.10")
@@ -175,6 +184,7 @@ def gunicorn_server(gunicorn_server_settings, tmp_path):
175184
SETTINGS_GEVENT_DDTRACERUN,
176185
SETTINGS_GEVENT_DDTRACERUN_MODULE_CLONE,
177186
SETTINGS_GEVENT_DDTRACERUN_DEBUGMODE_MODULE_CLONE,
187+
SETTINGS_GEVENT_SPANAGGREGATOR_RLOCK,
178188
],
179189
)
180190
def test_no_known_errors_occur(gunicorn_server_settings, tmp_path):

0 commit comments

Comments
 (0)