From 4f63a3431104efd677bfd841a2b5d7a8c1a26da5 Mon Sep 17 00:00:00 2001
From: Gustavo Lopes
Date: Mon, 1 Jul 2024 14:46:07 +0100
Subject: [PATCH] telemetry worker: flush data after stops

Telemetry workers are functionally dead after a Stop lifecycle action,
provided there is no intervening Start. While AddPoint actions are still
processed, their data is never flushed, because the Stop action handler
unschedules the FlushMetricAggr and FlushData actions.

PHP sends a Stop action at the end of every request via
ddog_sidecar_telemetry_end(), but a Start action is generated only right
after a telemetry worker is spawned. It is not clear to me whether the
intention is to have a Start/Stop pair on every PHP request (with Stop
flushing the metrics), or to have such a pair only in the first request,
with the Stop action generated by ddog_sidecar_telemetry_end() being
effectively a no-op.

Judging by [this comment](https://github.com/DataDog/libdatadog/pull/391):

> Also allow the telemetry worker to have a mode where it's continuing
> execution after a start-stop cycle, otherwise it won't send any more
> metrics afterwards.

it would appear that the intention is to keep sending metrics after a
Start/Stop pair. In that case:

* the Stop action handler should not unschedule the FlushData and
  FlushMetricAggr events, and
* FlushData, if called outside a Start/Stop pair, should not be a no-op.

Finally, swap the order in which FlushData and FlushMetricAggr are
scheduled, so that FlushMetricAggr runs first and the data it generates
can be sent by the next FlushData.
---
 ddtelemetry/src/worker/mod.rs | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/ddtelemetry/src/worker/mod.rs b/ddtelemetry/src/worker/mod.rs
index 77a9cb0247..2051f94ff0 100644
--- a/ddtelemetry/src/worker/mod.rs
+++ b/ddtelemetry/src/worker/mod.rs
@@ -241,10 +241,10 @@ impl TelemetryWorker {
             Lifecycle(Start) => {
                 if !self.data.started {
                     self.deadlines
-                        .schedule_event(LifecycleAction::FlushData)
+                        .schedule_event(LifecycleAction::FlushMetricAggr)
                         .unwrap();
                     self.deadlines
-                        .schedule_event(LifecycleAction::FlushMetricAggr)
+                        .schedule_event(LifecycleAction::FlushData)
                         .unwrap();
                     self.data.started = true;
                 }
@@ -265,9 +265,6 @@
                     .unwrap();
             }
             Lifecycle(FlushData) => {
-                if !self.data.started {
-                    return CONTINUE;
-                }
                 let batch = self.build_observability_batch();
                 if !batch.is_empty() {
                     let payload = data::Payload::MessageBatch(batch);
@@ -296,7 +293,9 @@
                     self.log_err(&e);
                 }
                 self.data.started = false;
-                self.deadlines.clear_pending();
+                if !self.config.restartable {
+                    self.deadlines.clear_pending();
+                }
                 return BREAK;
             }
             CollectStats(stats_sender) => {
@@ -341,10 +340,10 @@
                     Err(err) => self.log_err(&err),
                 }
                 self.deadlines
-                    .schedule_event(LifecycleAction::FlushData)
+                    .schedule_event(LifecycleAction::FlushMetricAggr)
                     .unwrap();
                 self.deadlines
-                    .schedule_event(LifecycleAction::FlushMetricAggr)
+                    .schedule_event(LifecycleAction::FlushData)
                     .unwrap();
                 self.data.started = true;
             }
@@ -368,9 +367,6 @@
                     .unwrap();
             }
             Lifecycle(FlushData) => {
-                if !self.data.started {
-                    return CONTINUE;
-                }
                 let mut batch = self.build_app_events_batch();
                 let payload = if batch.is_empty() {
                     data::Payload::AppHeartbeat(())
@@ -458,7 +454,9 @@
                     .await;

                 self.data.started = false;
-                self.deadlines.clear_pending();
+                if !self.config.restartable {
+                    self.deadlines.clear_pending();
+                }
                 return BREAK;
             }
             CollectStats(stats_sender) => {
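
For illustration only, the following is a minimal, self-contained Rust sketch
of the behaviour this patch aims for: FlushMetricAggr scheduled before
FlushData, and pending flushes surviving a Stop when the worker is
restartable. The names here (ToyWorker, pending, start, stop) are hypothetical
stand-ins, not the actual ddtelemetry types or API.

```rust
// Toy model of the intended post-patch behaviour; not libdatadog code.
#[derive(Debug, Clone, Copy, PartialEq)]
enum LifecycleAction {
    FlushMetricAggr,
    FlushData,
}

struct ToyWorker {
    started: bool,
    // Assumed flag mirroring `config.restartable` from the diff.
    restartable: bool,
    // Stands in for the deadlines scheduler's pending events.
    pending: Vec<LifecycleAction>,
}

impl ToyWorker {
    fn start(&mut self) {
        if !self.started {
            // Schedule the metric aggregation flush first, so the data flush
            // that follows it can pick up the freshly aggregated points.
            self.pending.push(LifecycleAction::FlushMetricAggr);
            self.pending.push(LifecycleAction::FlushData);
            self.started = true;
        }
    }

    fn stop(&mut self) {
        self.started = false;
        // Only a non-restartable worker drops its periodic flushes; a
        // restartable one keeps flushing metrics after the Start/Stop pair.
        if !self.restartable {
            self.pending.clear();
        }
    }
}

fn main() {
    let mut w = ToyWorker {
        started: false,
        restartable: true,
        pending: Vec::new(),
    };
    w.start();
    w.stop();
    // With `restartable = true`, the flush events survive the Stop.
    assert_eq!(
        w.pending,
        vec![LifecycleAction::FlushMetricAggr, LifecycleAction::FlushData]
    );
    println!("pending after stop: {:?}", w.pending);
}
```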