From 657b85fe23b88a4af627ede5a53443401721d0f7 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 20 Jan 2025 15:13:27 +0000 Subject: [PATCH] [NO-TICKET] Fix breaking applications due to concurrency issue in tracing **What does this PR do?** This PR fixes #2851 . As explained in the original bug report, there's a concurrency issue that can be triggered by the environment logger running concurrently with activating an instrumentation. **Motivation:** Fix a long-standing concurrency issue. **Additional Notes:** To fix this issue, I've made two changes: 1. Introduced a mutex to protect the `@instrumented_integrations`, thus making sure no two threads can be touching it at the same time 2. Took advantage of the fact that `#instrumented_integrations` was marked as private, and only being used by the environment logger and telemetry (e.g. read-only usage) to return a copy of the data. This way, we can safely iterate on the data while reconfiguration is happening concurrently. I believe the overhead of this lock is negligible, since we don't need to read this information very often. **How to test the change?** I wrote a reproducer to be able to see this issue easily. The reproducer is in two parts -- `concurrent_bug_repo.rb`: ```ruby require 'datadog' Datadog.configure do |c| c.tracing.instrument(:http, split_by_domain: true) end $sync_with_logger = Queue.new Thread.new do Datadog::Tracing::Diagnostics::EnvironmentLogger.collect_and_log! puts "Background thread finished!" end $sync_with_logger.pop Datadog.configure do |c| c.tracing.instrument(:rake, split_by_domain: true) end ``` and the following change to the environment logger: ```diff diff --git a/lib/datadog/tracing/diagnostics/environment_logger.rb b/lib/datadog/tracing/diagnostics/environment_logger.rb index d96cfa14c6..427861bff7 100644 --- a/lib/datadog/tracing/diagnostics/environment_logger.rb +++ b/lib/datadog/tracing/diagnostics/environment_logger.rb @@ -128,7 +128,15 @@ module Datadog end def collect_integrations_settings! + sync_once = $sync_with_logger + instrumented_integrations.each_with_object({}) do |(name, integration), result| + if sync_once + sync_once << true + sync_once = nil + sleep 5 + end + integration.configuration.to_h.each do |setting, value| next if setting == :tracer # Skip internal objects ``` That is, I specifically injected a `sleep` in the environment logger, and made it run on a background thread, triggering the exact same issue that had been plaguing us for years. Running the above change without my PR will trigger the issue, and with my PR it no longer does -- since the environment logger now gets its own copy of the instrumented integrations. --- lib/datadog/tracing/contrib/extensions.rb | 18 +++++++++++++++--- .../datadog/tracing/contrib/extensions_spec.rb | 10 ++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/lib/datadog/tracing/contrib/extensions.rb b/lib/datadog/tracing/contrib/extensions.rb index fde0a34c0f0..c706eb5e66f 100644 --- a/lib/datadog/tracing/contrib/extensions.rb +++ b/lib/datadog/tracing/contrib/extensions.rb @@ -110,6 +110,13 @@ def configure(&block) module Settings InvalidIntegrationError = Class.new(StandardError) + # Used to avoid concurrency issues between registering integrations (e.g. mutation) and reporting the + # current integrations for logging/debugging/telemetry purposes (e.g. iteration) in the + # `@instrumented_integrations` hash. + # + # See https://github.com/DataDog/dd-trace-rb/issues/2851 for details on the original issue. + INSTRUMENTED_INTEGRATIONS_LOCK = Mutex.new + def self.included(base) base.class_eval do settings :contrib do @@ -161,7 +168,10 @@ def instrument(integration_name, options = {}, &block) configuration_name = options[:describes] || :default filtered_options = options.reject { |k, _v| k == :describes } integration.configure(configuration_name, filtered_options, &block) - instrumented_integrations[integration_name] = integration + INSTRUMENTED_INTEGRATIONS_LOCK.synchronize do + @instrumented_integrations ||= {} + @instrumented_integrations[integration_name] = integration + end # Add to activation list integrations_pending_activation << integration @@ -192,14 +202,16 @@ def integrations_pending_activation @integrations_pending_activation ||= Set.new end + # This method is only for logging/debugging/telemetry purposes (e.g. iteration) in the + # `@instrumented_integrations` hash. # @!visibility private def instrumented_integrations - @instrumented_integrations ||= {} + INSTRUMENTED_INTEGRATIONS_LOCK.synchronize { (@instrumented_integrations&.dup || {}).freeze } end # @!visibility private def reset! - instrumented_integrations.clear + INSTRUMENTED_INTEGRATIONS_LOCK.synchronize { @instrumented_integrations&.clear } super end diff --git a/spec/datadog/tracing/contrib/extensions_spec.rb b/spec/datadog/tracing/contrib/extensions_spec.rb index 461fd09852f..ee969b45ef5 100644 --- a/spec/datadog/tracing/contrib/extensions_spec.rb +++ b/spec/datadog/tracing/contrib/extensions_spec.rb @@ -26,6 +26,10 @@ before { registry.add(integration_name, integration) } end + before do + allow(Datadog.logger).to receive(:warn) + end + context 'for' do describe Datadog do describe '#configure' do @@ -244,6 +248,12 @@ def self.patch end end end + + describe '#instrumented_integrations' do + subject(:instrumented_integrations) { settings.instrumented_integrations } + + it { is_expected.to be_frozen } + end end end end