From f371186f20e1c341eb1befdc3062cdafd66625cf Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 28 Jan 2025 20:01:10 +0100 Subject: [PATCH] fix(pageserver): gc-compaction race with read Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 8 ++++++++ pageserver/src/tenant/timeline/compaction.rs | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 076220df5151..ff0ac244fa40 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -264,6 +264,8 @@ pub struct Timeline { /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. pub(crate) layers: tokio::sync::RwLock, + pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, + last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. last_freeze_ts: RwLock, @@ -2423,6 +2425,7 @@ impl Timeline { shard_identity, pg_version, layers: Default::default(), + gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()), walredo_mgr, walreceiver: Mutex::new(None), @@ -3459,6 +3462,11 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + // The read layer guard optimization is not correct when the compaction algorithm + // merges the history of a key. To optimize for common cases while avoid correctness + // issues, we need to hold the `gc_compaction_layer_update_lock` on the read path. + let _guard = timeline.gc_compaction_layer_update_lock.read().await; + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 28c33813185d..6c9418602ebc 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2915,6 +2915,19 @@ impl Timeline { // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only // operate on L1 layers. { + // The read guard is used on the read path in `get_vectored_reconstruct_data_timeline`. + // Gc-compaction will consolidate some deltas into a single image. Therefore, in the + // following case, it's incorrect to read without this guard: + // 0. Gc-compaction running in the background. + // 1. A key is in 2 delta layers. The read path processes the first layer and drops the + // layers guard. Both of the layers are below the gc-horizon. + // 2. Gc-compaction finishes consolidating these two layers into a single image and updates + // the layer map. The image is at the gc-horizon. + // 3. The read path continues to read from the end LSN of the second layer, but now it cannot + // find any existing layer. + // By holding the update guard and avoid the read path to process a single read before/after + // gc-compaction, we can avoid this race condition. + let _update_guard = self.gc_compaction_layer_update_lock.write().await; let mut guard = self.layers.write().await; guard .open_mut()?