From f6fb3a936e3059eb1309356b39977dbbbdeb1f72 Mon Sep 17 00:00:00 2001 From: Conrad Hoffmann Date: Wed, 6 Nov 2024 12:33:07 +0100 Subject: [PATCH] Export last replay age in replication collector The exported replication lag does not handle all failure modes, and can report 0 for replicas that are out of sync and incapable of recovery. A proper replacement for that metric would require a different approach (see e.g. #1007), but for a lot of folks, simply exporting the age of the last replay can provide a pretty strong signal for something being amiss. I think this solution might be preferable to #977, though the lag metric needs to be fixed or abandoned eventually. Signed-off-by: Conrad Hoffmann --- collector/pg_replication.go | 19 +++++++++++++++++-- collector/pg_replication_test.go | 4 ++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/collector/pg_replication.go b/collector/pg_replication.go index 6067cc9b1..7f8b2fbd7 100644 --- a/collector/pg_replication.go +++ b/collector/pg_replication.go @@ -51,6 +51,15 @@ var ( "Indicates if the server is a replica", []string{}, nil, ) + pgReplicationLastReplay = prometheus.NewDesc( + prometheus.BuildFQName( + namespace, + replicationSubsystem, + "last_replay_seconds", + ), + "Age of last replay in seconds", + []string{}, nil, + ) pgReplicationQuery = `SELECT CASE @@ -61,7 +70,8 @@ var ( CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 - END as is_replica` + END as is_replica, + GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay` ) func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error { @@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, var lag float64 var isReplica int64 - err := row.Scan(&lag, &isReplica) + var replayAge float64 + err := row.Scan(&lag, &isReplica, &replayAge) if err != nil { return err } @@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, pgReplicationIsReplica, prometheus.GaugeValue, float64(isReplica), ) + ch <- prometheus.MustNewConstMetric( + pgReplicationLastReplay, + prometheus.GaugeValue, replayAge, + ) return nil } diff --git a/collector/pg_replication_test.go b/collector/pg_replication_test.go index b6df698e3..6fb1c8ef1 100644 --- a/collector/pg_replication_test.go +++ b/collector/pg_replication_test.go @@ -31,9 +31,9 @@ func TestPgReplicationCollector(t *testing.T) { inst := &instance{db: db} - columns := []string{"lag", "is_replica"} + columns := []string{"lag", "is_replica", "last_replay"} rows := sqlmock.NewRows(columns). - AddRow(1000, 1) + AddRow(1000, 1, 3) mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnRows(rows) ch := make(chan prometheus.Metric)