Skip to content

Commit

Permalink
Export last replay age in replication collector
Browse files Browse the repository at this point in the history
The exported replication lag does not handle all failure modes, and can
report 0 for replicas that are out of sync and incapable of recovery.

A proper replacement for that metric would require a different approach
(see e.g. prometheus-community#1007), but for a lot of folks, simply exporting the age of
the last replay can provide a pretty strong signal for something being
amiss.

I think this solution might be preferable to prometheus-community#977, though the lag
metric needs to be fixed or abandoned eventually.

Signed-off-by: Conrad Hoffmann <[email protected]>
  • Loading branch information
bitfehler committed Nov 6, 2024
1 parent f9c7457 commit f6fb3a9
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 4 deletions.
19 changes: 17 additions & 2 deletions collector/pg_replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@ var (
"Indicates if the server is a replica",
[]string{}, nil,
)
pgReplicationLastReplay = prometheus.NewDesc(
prometheus.BuildFQName(
namespace,
replicationSubsystem,
"last_replay_seconds",
),
"Age of last replay in seconds",
[]string{}, nil,
)

pgReplicationQuery = `SELECT
CASE
Expand All @@ -61,7 +70,8 @@ var (
CASE
WHEN pg_is_in_recovery() THEN 1
ELSE 0
END as is_replica`
END as is_replica,
GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay`
)

func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
Expand All @@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,

var lag float64
var isReplica int64
err := row.Scan(&lag, &isReplica)
var replayAge float64
err := row.Scan(&lag, &isReplica, &replayAge)
if err != nil {
return err
}
Expand All @@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
pgReplicationIsReplica,
prometheus.GaugeValue, float64(isReplica),
)
ch <- prometheus.MustNewConstMetric(
pgReplicationLastReplay,
prometheus.GaugeValue, replayAge,
)
return nil
}
4 changes: 2 additions & 2 deletions collector/pg_replication_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ func TestPgReplicationCollector(t *testing.T) {

inst := &instance{db: db}

columns := []string{"lag", "is_replica"}
columns := []string{"lag", "is_replica", "last_replay"}
rows := sqlmock.NewRows(columns).
AddRow(1000, 1)
AddRow(1000, 1, 3)
mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnRows(rows)

ch := make(chan prometheus.Metric)
Expand Down

0 comments on commit f6fb3a9

Please sign in to comment.